In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import statsmodels.api as sm

In [2]:
# Load the pickled DataFrame stored in the local file "GeneratedData".
# NOTE(review): unpickling can execute arbitrary code, so only load a file
# that was produced by a trusted source.
allStudents = pd.read_pickle("GeneratedData")

In [3]:
def categoricalVariableSplitter(df, colname):
    """Replace a categorical column with one indicator column per category.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``colname``. It is not modified; a new frame is
        returned.
    colname : str
        Name of the categorical column to expand.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``colname``, followed by float indicator columns named
        ``<colname>_0`` ... ``<colname>_{k-1}``. The numeric suffix is the
        position of the category in the encoder's output, not the category
        value itself.
    """
    onehotencoder = OneHotEncoder()
    X = onehotencoder.fit_transform(df[colname].values.reshape(-1, 1)).toarray()
    # Size the labels from the encoded matrix itself so the number of names
    # always matches the number of indicator columns the encoder produced.
    indicator_names = [colname + "_" + str(i) for i in range(X.shape[1])]
    # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex would misalign rows (and introduce NaNs) whenever df has been
    # filtered, sampled or otherwise carries a non-default index.
    dfOneHot = pd.DataFrame(X, columns=indicator_names, index=df.index)
    return pd.concat([df.drop(columns=[colname]), dfOneHot], axis=1)

# One-hot encodes the categorical variables in the dataset: each one is replaced by a separate indicator column per category.

In [4]:
# Expand each categorical column in turn; every call returns a new frame, so
# the frame loaded into allStudents is left as it was.
columns = ["Gender", "Race"]
data = allStudents
for col in columns:
    data = categoricalVariableSplitter(data, col)

In [5]:
# Append a column of ones named 'Const'; it is listed first among the
# independent variables further down.
# NOTE(review): presumably the explicit intercept for the statsmodels Logit
# fit that is commented out near the end of the notebook - confirm.
data = data.assign(Const=1)

In [6]:
# Display the encoded frame to check the new indicator columns and 'Const'.
data

Unnamed: 0,SES,GPA,SAT,Random Acceptance,Partial Logistic Acceptance,Complete Logistic Acceptance,Gender_0,Gender_1,Race_0,Race_1,Race_2,Race_3,Race_4,Const
0,497796,3.54,1884,0,1,0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1
1,623523,3.11,1231,0,0,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1
2,590253,3.3,772,1,0,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1
3,640081,2.93,1190,0,0,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1
4,769626,4.66,1651,0,0,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,634137,3.59,1319,1,0,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1
49996,658583,3.07,773,0,0,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1
49997,391683,4.16,1260,0,1,0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1
49998,722668,3.01,2113,0,0,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1


In [7]:
# Predictor columns fed to the models. Gender_0 and Race_0 are not listed, so
# those categories act as the reference level for their indicator groups.
independent_variables = [
    'Const', 'GPA', 'SES', 'SAT',
    'Gender_1',
    'Race_1', 'Race_2', 'Race_3', 'Race_4',
]
X = data[independent_variables]

# One integer label vector per acceptance column.
y1 = data["Random Acceptance"].to_numpy().astype('int')
y2 = data["Partial Logistic Acceptance"].to_numpy().astype('int')
y3 = data["Complete Logistic Acceptance"].to_numpy().astype('int')

In [8]:
# Split into 80% train / 20% test. X and the three label vectors go through a
# single call so that every output shares the same row partition.
(X_train, X_test,
 y_train1, y_test1,
 y_train2, y_test2,
 y_train3, y_test3) = train_test_split(X, y1, y2, y3, test_size=0.2, random_state=3)

Random Acceptance

In [9]:
# Fit on the "Random Acceptance" training labels, then predict the test rows.
# fit() returns the estimator itself, so log_reg is the fitted model.
log_reg = LogisticRegression(solver='lbfgs', max_iter=10000).fit(X_train, y_train1)
y_pred = log_reg.predict(X_test)

In [10]:
# Confusion matrix for the current y_pred against the "Random Acceptance"
# test labels, followed by a 2x2 legend describing each cell.
cnf_matrix = metrics.confusion_matrix(y_test1, y_pred)
print(cnf_matrix)
confusion_key = np.array([
    ["Actual: 0 Predicted: 0", "Actual: 0, Predicted: 1"],
    ["Actual: 1 Predicted: 0", "Actual: 1, Predicted: 1"],
])
print(confusion_key)

[[5965    0]
 [4035    0]]
[['Actual: 0 Predicted: 0' 'Actual: 0, Predicted: 1']
 ['Actual: 1 Predicted: 0' 'Actual: 1, Predicted: 1']]


In [11]:
# Test-set scores for the current y_pred against the "Random Acceptance" labels.
# The confusion matrix above shows no positive predictions, which makes
# precision ill-defined (0/0). zero_division=0 reports it as 0.0 explicitly
# instead of emitting an UndefinedMetricWarning.
print("Accuracy:", metrics.accuracy_score(y_test1, y_pred))
print("Precision:", metrics.precision_score(y_test1, y_pred, zero_division=0))
print("Recall:", metrics.recall_score(y_test1, y_pred, zero_division=0))

Accuracy: 0.5965
Precision: 0.0
Recall: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


Partial Logistic Acceptance

In [12]:
# Fit on the "Partial Logistic Acceptance" training labels, then predict the
# test rows. This rebinds log_reg and y_pred from the previous section.
log_reg = LogisticRegression(solver='lbfgs', max_iter=10000).fit(X_train, y_train2)
y_pred = log_reg.predict(X_test)

In [13]:
# Confusion matrix for the current y_pred against the "Partial Logistic
# Acceptance" test labels, followed by a 2x2 legend describing each cell.
cnf_matrix = metrics.confusion_matrix(y_test2, y_pred)
print(cnf_matrix)
confusion_key = np.array([
    ["Actual: 0 Predicted: 0", "Actual: 0, Predicted: 1"],
    ["Actual: 1 Predicted: 0", "Actual: 1, Predicted: 1"],
])
print(confusion_key)

[[7325  223]
 [1597  855]]
[['Actual: 0 Predicted: 0' 'Actual: 0, Predicted: 1']
 ['Actual: 1 Predicted: 0' 'Actual: 1, Predicted: 1']]


In [14]:
# Test-set scores for the current y_pred against the "Partial Logistic
# Acceptance" labels; y_pred must come from the model cell of this section.
for metric_label, metric_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(metric_label, metric_fn(y_test2, y_pred))

Accuracy: 0.818
Precision: 0.7931354359925789
Recall: 0.348694942903752


Complete Logistic Acceptance

In [15]:
# Fit on the "Complete Logistic Acceptance" training labels, then predict the
# test rows. This rebinds log_reg and y_pred from the previous section.
log_reg = LogisticRegression(solver='lbfgs', max_iter=10000).fit(X_train, y_train3)
y_pred = log_reg.predict(X_test)

In [16]:
# Confusion matrix for the current y_pred against the "Complete Logistic
# Acceptance" test labels, followed by a 2x2 legend describing each cell.
cnf_matrix = metrics.confusion_matrix(y_test3, y_pred)
print(cnf_matrix)
confusion_key = np.array([
    ["Actual: 0 Predicted: 0", "Actual: 0, Predicted: 1"],
    ["Actual: 1 Predicted: 0", "Actual: 1, Predicted: 1"],
])
print(confusion_key)

[[ 577 1735]
 [ 157 7531]]
[['Actual: 0 Predicted: 0' 'Actual: 0, Predicted: 1']
 ['Actual: 1 Predicted: 0' 'Actual: 1, Predicted: 1']]


In [17]:
# Test-set scores for the current y_pred against the "Complete Logistic
# Acceptance" labels; y_pred must come from the model cell of this section.
for metric_label, metric_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(metric_label, metric_fn(y_test3, y_pred))

Accuracy: 0.8108
Precision: 0.812756313403842
Recall: 0.9795785639958376


In [24]:
# logit_model = sm.Logit(np.asarray(y_train3),X_train.astype(float))
# result  = logit_model.fit()
# print(result.summary())

In [19]:
# Write the training feature frame to a pickle file named "Student Training Data".
X_train.to_pickle("Student Training Data")

In [20]:
# Write the test feature frame to a pickle file named "Student Testing Data".
X_test.to_pickle("Student Testing Data")

In [21]:
# Save every train/test label vector with np.save, one file per array
# (np.save appends ".npy" to a file name that lacks it).
label_arrays = {
    'Random Acceptance Student Training Label': y_train1,
    'Random Acceptance Student Test Label': y_test1,
    'Partial Logistic Acceptance Student Training Label': y_train2,
    'Partial Logistic Acceptance Student Test Label': y_test2,
    'Complete Logistic Acceptance Student Training Label': y_train3,
    'Complete Logistic Acceptance Student Test Label': y_test3,
}
for label_file, label_values in label_arrays.items():
    np.save(label_file, label_values)