In [9]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Load the admissions data and rename the target column "admit" to
# "actual_label" (copy the column, then drop the original).
admissions = pd.read_csv("admissions.csv")
admissions["actual_label"] = admissions["admit"]
admissions = admissions.drop("admit", axis=1)

# Shuffle the rows so that fold membership is random. No seed is set, so the
# fold assignment (and every number derived from it) changes on each run.
shuffled_index = np.random.permutation(admissions.index)
shuffled_admissions = admissions.loc[shuffled_index]
# reset_index() renumbers rows 0..N-1 and keeps the pre-shuffle position in
# an "index" column.
admissions = shuffled_admissions.reset_index()

# Partition the shuffled rows into five folds by row label.
# `.ix` was removed in pandas 1.0; `.loc` performs the same label-based,
# end-inclusive slicing on the freshly reset integer index.
admissions.loc[0:128, "fold"] = 1
admissions.loc[129:257, "fold"] = 2
admissions.loc[258:386, "fold"] = 3
admissions.loc[387:514, "fold"] = 4
admissions.loc[515:644, "fold"] = 5
# Ensure the column is set to integer type.
admissions["fold"] = admissions["fold"].astype('int')

print(admissions.head())
print(admissions.tail())

   index       gpa         gre  actual_label  fold
0    117  3.219669  483.761856             0     1
1    369  3.292225  587.697669             0     1
2    403  2.902209  664.938813             1     1
3    323  3.234734  586.992382             0     1
4    225  2.844566  703.043462             0     1
     index       gpa         gre  actual_label  fold
639    515  3.605576  570.617519             1     5
640    174  3.120778  593.747164             0     5
641    178  3.073865  483.030549             0     5
642     73  2.755856  693.451548             0     5
643    604  3.032122  764.790105             1     5


In [2]:
from sklearn.linear_model import LogisticRegression
# Training: fit on every fold except fold 1, using "gpa" as the only feature.
model = LogisticRegression()
train_iteration_one = admissions[admissions["fold"] != 1]
# Take an explicit copy of the held-out rows: a column is added to this frame
# below, and assigning into a boolean-mask slice of `admissions` triggers
# pandas' SettingWithCopy warning (silenced globally in this notebook).
test_iteration_one = admissions[admissions["fold"] == 1].copy()
model.fit(train_iteration_one[["gpa"]], train_iteration_one["actual_label"])

# Predicting: label the held-out fold and store the predictions next to the
# ground truth.
labels = model.predict(test_iteration_one[["gpa"]])
test_iteration_one["predicted_label"] = labels

# Accuracy = fraction of held-out rows whose prediction equals the label.
matches = test_iteration_one["predicted_label"] == test_iteration_one["actual_label"]
correct_predictions = test_iteration_one[matches]
iteration_one_accuracy = len(correct_predictions) / float(len(test_iteration_one))
print(iteration_one_accuracy)

0.5968992248062015


In [3]:
import numpy as np
# Values of the "fold" column to hold out in turn; passed to train_and_test.
fold_ids = [1,2,3,4,5]
def train_and_test(df, folds, model_factory=None):
    """Compute hold-out accuracy for each fold of a pre-partitioned frame.

    For every value in ``folds`` the rows whose "fold" column equals that
    value are held out, a fresh model is fitted on the remaining rows using
    "gpa" as the only feature and "actual_label" as the target, and the
    fraction of held-out rows predicted correctly is recorded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with "gpa", "actual_label" and "fold" columns. It is not
        modified.
    folds : iterable
        Fold identifiers to hold out, one evaluation per identifier.
    model_factory : callable, optional
        Zero-argument callable returning an object with ``fit(X, y)`` and
        ``predict(X)``. Defaults to ``LogisticRegression``.

    Returns
    -------
    list of float
        One accuracy per entry of ``folds``, in the same order.
    """
    if model_factory is None:
        model_factory = LogisticRegression

    fold_accuracies = []
    for fold in folds:
        # Split the frame that was passed in (previously the global
        # `admissions` was read here and `df` was ignored).
        train = df[df["fold"] != fold]
        test = df[df["fold"] == fold]

        model = model_factory()
        model.fit(train[["gpa"]], train["actual_label"])
        labels = model.predict(test[["gpa"]])

        # Compare predictions with the ground truth directly instead of
        # writing a new column into a slice of `df`.
        matches = labels == test["actual_label"].values
        fold_accuracies.append(float(np.mean(matches)))
    return fold_accuracies

# Evaluate every fold, then report the per-fold accuracies followed by
# their mean.
accuracies = train_and_test(admissions, fold_ids)
average_accuracy = np.mean(accuracies)
print(accuracies)
print(average_accuracy)

[0.5968992248062015, 0.6201550387596899, 0.6356589147286822, 0.6015625, 0.7286821705426356]
0.6365915697674418


In [7]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Reload the raw data so this cell does not depend on the shuffled/fold-tagged
# frame built earlier, and rename "admit" to "actual_label".
admissions = pd.read_csv("admissions.csv")
admissions["actual_label"] = admissions["admit"]
admissions = admissions.drop("admit", axis=1)

# Five shuffled folds with a fixed seed so the split is reproducible.
kf = KFold(5, shuffle=True, random_state=8)
lr = LogisticRegression()
# NOTE: scoring is "roc_auc", so despite their names `accuracies` and
# `average_accuracy` hold ROC AUC values, not classification accuracy.
# Pass the splitter itself as `cv` rather than a one-shot generator from
# kf.split(...); cross_val_score calls split() on it internally.
accuracies = cross_val_score(lr, admissions[["gpa"]], admissions["actual_label"], scoring="roc_auc", cv=kf)
average_accuracy = np.mean(accuracies)

print(accuracies)
print(average_accuracy)

[0.70790123 0.69550265 0.65987934 0.73363017 0.57864583]
0.6751118445238359


In [21]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Start again from the raw file; expose the target as "actual_label" and
# remove the original "admit" column.
admissions = pd.read_csv("admissions.csv")
admissions = admissions.assign(actual_label=admissions["admit"]).drop("admit", axis=1)

# Score a logistic regression on "gpa" alone with cv=5, using ROC AUC as the
# metric (the variable names say "accuracy" but the values are AUC scores).
lr = LogisticRegression()
features = admissions[["gpa"]]
target = admissions["actual_label"]
accuracies = cross_val_score(lr, features, target, scoring="roc_auc", cv=5)
average_accuracy = np.mean(accuracies)

print(accuracies)
print(average_accuracy)

[0.6505102  0.68852041 0.6372449  0.7127551  0.703125  ]
0.6784311224489796
