In [1]:
import pandas as pd
import numpy as np
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

In [46]:
# Load the registration export and give the two long, namespaced columns
# short names that the rest of the notebook refers to.
data = pd.read_csv("../data/UsdanRegistration2016-2018.csv", encoding="utf-8")
data = data.rename(columns={
    'reg_STDT::student_serialnumber_t': 'student_id',
    'reg_stdt_PRNT::add_bill_county_t': 'student_county',
})

# Cell output: row labels of every registration whose year is not 2018.
data.query("year!=2018").index.values

array([1682, 1683, 1684, ..., 4594, 4595, 4596])

In [3]:
# Cast the descriptive columns to pandas categoricals (in place on `data`).
for column in ("Major", "Minor1", "Minor2", "dept_t", "student_county"):
    data[column] = data[column].astype('category')
# program_type is a categorical copy of the raw program column.
data["program_type"] = data["reg_PROGRAMS::program_t"].astype('category')

# LargeAid is the tuition-aid total, falling back to the scholarship total
# where tuition aid is missing; it stays NaN when both are missing.
data["LargeAid"] = data["_TuitionAidTotal"].fillna(data["_ScholarshipTotal"])

In [4]:
# For every categorical column, first store a readable "<column>: <value>"
# label in a sibling "<column>_str" column, then replace the column itself
# with its integer category codes.
cat_columns = data.select_dtypes(['category']).columns
for cat in cat_columns:
    label_prefix = cat + ": "
    data[cat + "_str"] = label_prefix + data[cat].astype('unicode')
# Codes are written only after all labels exist, so the labels are always
# built from the original category values.
for cat in cat_columns:
    data[cat] = data[cat].cat.codes

In [5]:
# Separate scholarship and full-paying students.
# A row counts as full-paying when LargeAid is missing, and as a
# scholarship/aid row when LargeAid has a value.
has_aid = data["LargeAid"].notnull()
non_schol = data[~has_aid]
schol = data[has_aid]
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(non_schol.shape)
print(schol.shape)

(3417, 33)
(1180, 33)


In [6]:
# Flag whether each full-paying 2017 student came back the following year.
# Rows from 2016 are excluded, so "returned" is True when a student_id shows
# up in more than one of the remaining years.
# This can be improved to give more data (2016 -> 2017 is not used yet).
recent = non_schol[non_schol["year"] != 2016]
years_attended = recent.groupby("student_id")["year"].nunique()
returned_flag = (years_attended > 1).rename("returned")
recurrence = recent.join(returned_flag, on="student_id")
# Keep only the 2017 rows: those are the registrations being labelled.
recurrence = recurrence[recurrence["year"] == 2017]

In [17]:
# One-hot encode the categorical labels: boolean indicator columns make more
# sense as model inputs than arbitrary integer category codes.
return_inter = recurrence[["Major_str", "Minor1_str", "dept_t_str", "Minor2_str", "program_type_str", "student_county_str", "returned"]]
major = pd.get_dummies(return_inter.Major_str)
minor1 = pd.get_dummies(return_inter.Minor1_str)
minor2 = pd.get_dummies(return_inter.Minor2_str)
dept = pd.get_dummies(return_inter.dept_t_str)
program_type = pd.get_dummies(return_inter.program_type_str)
returned = return_inter["returned"].to_frame()
# Only department and program type are joined in as predictors; the
# major/minor dummies are built above but not added to the model table.
return_preds = returned.join([dept, program_type])

In [31]:
# Repeated random hold-out validation (not true k-fold): draw N_SPLITS
# independent ~80/20 train/test splits, fit every method on each split and
# collect per-split scores so they can be averaged afterwards.
N_SPLITS = 10
TRAIN_FRACTION = 0.8
SPLIT_SEED = 0  # fixes the sequence of splits so a re-run draws the same ones
# Positions in `methods` whose fitted coefficients are recorded.
COEF_METHOD_INDEXES = (0, 1)
method_factories = [LogisticRegression, LinearSVC, SVC, Perceptron]

# One result list per method, created once up front.
# NOTE: despite its name, `improvement` stores the raw test accuracy (in %)
# of each method, not a difference against the naive baseline.
improvement = [[] for factory in method_factories]
overfit = [[] for factory in method_factories]
coefficients = [[] for factory in method_factories]
# Accuracy (in %) of always predicting "returned" on each test split, kept so
# the per-method accuracies can be compared against this baseline.
naive_accuracies = []

# Local generator: seeding here does not touch numpy's global random state.
split_rng = np.random.RandomState(SPLIT_SEED)
feature_names = return_preds.columns.drop("returned").rename("Feature")

for split in range(N_SPLITS):
    msk = split_rng.rand(len(return_preds)) < TRAIN_FRACTION
    train = return_preds[msk]
    test = return_preds[~msk]
    y_train = train['returned'].values
    y_test = test['returned'].values
    X_train = train.drop(['returned'], axis=1).values
    X_test = test.drop(['returned'], axis=1).values

    # Naive baseline: predict that every student returns.
    naive_pred = [True] * len(y_test)
    naive_accuracy = round(metrics.accuracy_score(y_test, naive_pred) * 100, 2)
    naive_accuracies.append(naive_accuracy)

    # Fresh, default-configured estimators for every split.
    methods = [factory() for factory in method_factories]
    for method_index, method in enumerate(methods):
        method.fit(X_train, y_train)
        if method_index in COEF_METHOD_INDEXES:
            # One row per feature, holding the fitted coefficient of this split.
            coeff_df = pd.DataFrame({"Correlation": method.coef_[0]},
                                    index=feature_names)
            coefficients[method_index].append(coeff_df)
        Y_pred = method.predict(X_test)
        method_accuracy = round(metrics.accuracy_score(y_test, Y_pred) * 100, 2)
        acc_method = round(method.score(X_train, y_train) * 100, 2)
        improvement[method_index].append(method_accuracy)
        # Train accuracy minus test accuracy: positive values mean overfitting.
        overfit[method_index].append(acc_method - method_accuracy)

In [32]:
# Summarise every method across all splits: mean and standard deviation of the
# recorded scores, plus the per-feature mean coefficient for the first two
# methods (the only ones whose coefficients were collected).
# NOTE: the values labelled "Improvement" are the raw test accuracies stored
# in `improvement`, not a difference against the naive baseline.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
for i, method in enumerate(methods):
    np_improvement = np.array(improvement[i])
    np_overfit = np.array(overfit[i])
    print("\n" + type(method).__name__)
    print("\tAverage Improvement:\t" + str(np.mean(np_improvement, axis=0)))
    print("\tAverage Overfit:\t" + str(np.mean(np_overfit, axis=0)))
    print("\tSTD Improvement:\t" + str(np.std(np_improvement, axis=0)))
    print("\tSTD Overfit:\t\t" + str(np.std(np_overfit, axis=0)))
    if i in [0, 1]:
        # Stack the per-split coefficient frames once, then average per feature.
        all_coefficients = pd.concat(coefficients[i])
        mean_coefficients = all_coefficients.groupby(all_coefficients.index).mean()
        print(mean_coefficients.sort_values(by="Correlation", ascending=False))


LogisticRegression
	Average Improvement:	60.509
	Average Overfit:	2.645
	STD Improvement:	2.22888963388
	STD Overfit:		3.05255712477
                                      Correlation
Feature                                          
program_type: 7 Week                     0.744326
dept_t: Nature                           0.710706
program_type: 7 Week - Premium           0.514585
dept_t: Theater                          0.416393
dept_t: Guitar                           0.365998
program_type: 7 Week - Split             0.244115
dept_t: Discovery                        0.217057
dept_t: Chess                            0.202080
dept_t: Dance                            0.145772
dept_t: Art                              0.106888
program_type: 4 Week                    -0.097271
program_type: 7 Week - Premium Split    -0.134965
dept_t: Piano                           -0.153090
program_type: 2 Week A                  -0.200389
program_type: 2 Week B                  -0.231070
dept_t: Chorus  

In [28]:
# Draw a single random split of the predictor table: each row lands in
# `train` with probability 0.8, otherwise in `test`.
n_rows = len(return_preds)
msk = np.random.rand(n_rows) < 0.8
train, test = return_preds[msk], return_preds[~msk]

(227, 23)

In [29]:
# Preview the Minor1 indicator columns; show only the first rows rather than
# dumping the whole wide frame into the notebook output.
minor1.head(10)

Unnamed: 0,Minor1: Actor’s Craft,Minor1: Archery,Minor1: Audition Boot Camp,Minor1: Chamber Music,Minor1: Chamber Music (clarinet),Minor1: Chamber Music (Bass),Minor1: Chamber Music (Cello),Minor1: Chamber Music (Clarinet),Minor1: Chamber Music (French Horn),Minor1: Chamber Music (Trombone),...,Minor1: Painting & Drawing,Minor1: Photography,Minor1: Pilobolus: Moving & Making,Minor1: Quidditch,Minor1: Senior Chorus,Minor1: Sustainable Art: Little Changes with Little Sun,Minor1: Tap Dance,Minor1: Tennis,Minor1: Yoga,Minor1: nan
1682,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1683,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1686,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1688,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1689,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1691,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1692,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1693,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1694,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1695,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
