In [1]:
#General imports
import pandas as pd
import numpy as np
from collections import Counter
from math import radians, cos, sin, asin, sqrt
from datetime import date,datetime

#Imports for models 
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.svm import LinearSVC, SVC
from sklearn import svm, tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import graphviz

#Imports for scoring
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
import collections

#Imports for encoding the data

from sklearn.preprocessing import OneHotEncoder

In [2]:
# Globals.
# jjj
stages = ['Inquiry', 'App_Started', 'App_Submitted', 'App_Complete', 'Admit', 'Deposit', 'Pre-Enrolled', 'Paid', 'Enrolled']

In [3]:
# This dataset includes terms: Spring 2014,Fall 2014,Spring 2015,Fall 2015,Spring 2016,Fall 2016,Spring 2017,Fall 2017,Spring 2018, Fall 2018
def Load_data(Raw_data):
    df = pd.read_csv(Raw_data, encoding = "ISO-8859-1", low_memory = False)
    df.rename(columns = {'Applicant: SIS ID': 'SIS_ID',
                         'Application: ID': 'Application_ID',
                         'Application: Created Date': 'Application_Created_Date',
                         'Applicant: Contact ID': 'Contact_ID',
                         'Applicant: First Name': 'First_Name',
                         'Applicant: Last Name': 'Last_Name',
                         'Latitude (MapAnything)': 'Latitude',
                         'Longitude (MapAnything)': 'Longitude',
                         'Applicant: Gender': 'Gender',
                         'Applicant: # of non-inquiry Undergrad Apps': '#_of_non-inquiry_Undergrad_Apps',
                         'Term (HEDA)': 'Term',
                         'Applicant: Country of Citizenship': 'Country_of_Citizenship',
                         'Applicant: Canada Status': 'Canada_Status',
                         'Applicant: Birthdate': 'Birthdate',
                         'Applicant: Aboriginal Student': 'Aboriginal_Student',
                         'Applying for Financial Aid': 'Applying_for_Financial_Aid',
                         'Application: Application ID': 'Application_Number',
                         'AQ Candidacy ID': 'AQ_Candidacy_ID',
                         'Program Of Interest (HEDA)': 'Program_Of_Interest',
                         'Stream (Account LU)': 'Stream',
                         'National Student Status': 'National_Student_Status',
                         'Highest Stage Reached': 'Highest_Stage_Reached',
                         'Date Inquiry': 'Date_Inquiry',
                         'Date App Started': 'Date_App_Started',
                         'Date App Submitted': 'Date_App_Submitted',
                         'Date App Complete': 'Date_App_Complete',
                         'Date Admit': 'Date_Admit',
                         'Date Deposit': 'Date_Deposit',
                         'Date Pre-Enrolled': 'Date_Pre-Enrolled',
                         'Date Paid': 'Date_Paid',
                         'Date Enrolled': 'Date_Enrolled',
                         'Date File Closed': 'Date_File_Closed',
                         'Applicant: Source Code': 'Source_Code',
                         'Source Code Category': 'Source_Code_Category',
                         'Import Date': 'Import_Date',
                         'Admit GPA': 'Admit_GPA',
                         'Entrance Type': 'Entrance_Type',
                         'Admit Street Line 1': 'Admit_Street',
                         'Admit City': 'Admit_City',
                         'Admit State/Province': 'Admit_State/Province',
                         'Admit Country': 'Admit_Country',
                         'Current Region': 'Current_Region',
                         'File Forwarded Deferred Application': 'File_Forwarded_Deferred_Application',
                         'Closed File Disposition': 'Closed_File_Disposition',
                         'Candidate Decision': 'Candidate_Decision',
                         'School Decision': 'School_Decision'
                        }, inplace = True)
    return df

In [4]:
# Remove duplicate SIS IDs keeping the most recent application 
def Drop_dupes(df): 
    df.sort_values(by = ['Application_Created_Date'], ascending = False) # jjj
    df = df.drop_duplicates(subset = ['Contact_ID'], keep = 'first')
    return df

In [5]:
def Find_current_term(df):
    Year = df['Term'].str.split(' ')

In [6]:
# Create distance from campus field
def Haversine(row):
    # convert decimal degrees to radians
    lon1 = row['Longitude']
    lat1 = row['Latitude']
    lon2 = -122.6006468
    lat2 = 49.1409649    
    if pd.notnull(lon1) and pd.notnull(lat1):  
        lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
        # haversine formula 
        dlon = lon2 - lon1 
        dlat = lat2 - lat1 
        a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
        c = 2 * asin(sqrt(a)) 
        r = 6371 # Radius of earth in kilometers. Use 3956 for miles
        return c * r
    else:
        return -1

In [7]:
# Convert dates to datetime type
# jjj
def Dates_to_datetime(df):
    for date_field in (['Date_' + x for x in stages] + ['Birthdate', 'Application_Created_Date']):
        df[date_field] = pd.to_datetime(df[date_field], dayfirst = True, format = '%d/%m/%Y')
    return df

In [8]:
# Create fields for Days at "stage"
def Get_relevant_stages(stage, stages):
    first = stages.index(stage)
    if first == len(stages) - 1:
        return [None,None]
    else:
        return ['Date_' + stages[first], 'Date_' + stages[first + 1]]

def Calculate_days_at_stage(row, stage, stages, algo1):
    first, second = algo1(stage, stages)
    if pd.notnull(first) and pd.notnull(row[first]) and pd.notnull(row[second]):
        return (row[second] - row[first]).days
    else:
        return -1

In [9]:
# Convert stage dates to month day
# jjj
def StageDates_month_day(df):
    for s in stages:
        stage_date_field = 'Date_' + s
        df[stage_date_field] = df[stage_date_field].dt.strftime('%B %d')
    return df

In [10]:
# Take the two GPA fields: Admit GPA and GPA and create one field
def Merge_GPA(row):
    fGPA = pd.isnull(row['GPA']) == True or row['GPA'] == 0
    fAGPA = pd.isnull(row['Admit_GPA']) == True or row['Admit_GPA'] == 0
    if fGPA == True and fAGPA == False:
        return row['Admit_GPA']
    elif fGPA == False:
        return row['GPA']
    else:
        return -1

In [11]:
# Create a field that only contains the term and not the year. This seperates fall from spring students
def Term_season(row):
    fall = 'Fall'
    spring = 'Spring'
    if fall in row['Term']:
        return fall 
    elif spring in row['Term']:
        return spring
    else: 
        return 'error'

In [12]:
# Create a field that calculates the age of the student when their application was created 
def Calculate_age(row):
    if pd.isnull(row['Birthdate']):
        return -1
    else:
        appCreatedDate = row['Application_Created_Date']
        born = row['Birthdate']
        age = appCreatedDate.year - born.year - (1 if (appCreatedDate.month, appCreatedDate.day) < (born.month, born.day) else 0) # jjj
        return age

In [13]:
def Final_cleaning(df):
    # Replace NaN with Unknown
    df.update(df[['School_Decision',
                  'Candidate_Decision',
                  'Closed_File_Disposition',
                  'Admit_Country',
                  'Admit_State/Province',
                  'Admit_City',
                  'Entrance_Type',
                  'Source_Code',
                  'National_Student_Status',
                  'Stream',
                  'Program_Of_Interest',
                  'Applying_for_Financial_Aid',
                  'Canada_Status',
                  'Country_of_Citizenship',
                  'Gender']].fillna('Unknown'))

    # Include only fields to be analyzed
    df = df.drop(['Birthdate',
                  'Admit_Street',
                  'Longitude',
                  'Latitude',
                  'Application_Created_Date',
                  'Date_File_Closed',
                  'GPA',
                  'Admit_GPA',
                  'Import_Date',
                  'AQ_Candidacy_ID',
                  '#_of_non-inquiry_Undergrad_Apps',
                  'Last_Name',
                  'SIS_ID',
                  'First_Name',
                  'Contact_ID', 
                  'Source_Code_Category',
                  'File_Forwarded_Deferred_Application',
                  'Closed_File_Disposition',
                  'Candidate_Decision',
                  'School_Decision',
                  'Date_Inquiry',
                  'Date_App_Started',
                  'Date_App_Submitted',
                  'Date_App_Complete',
                  'Date_Admit',
                  'Date_Deposit',
                  'Date_Pre-Enrolled',
                  'Date_Paid',
                  'Date_Enrolled',
                  'Days_at_Inquiry',
                  'Days_at_App_Started',
                  'Days_at_App_Submitted',
                  'Days_at_App_Complete',
                  'Days_at_Admit',
                  'Days_at_Deposit',
                  'Days_at_Pre-Enrolled',
                  'Days_at_Paid',
                  'Application_Number',
                  'Application_ID'
                 ], 
                   axis = 1)
    return df

In [14]:
def Balance_historic_data(df):
    # Balance the historic data between enrolled and non-enrolled
    df['Highest_Stage_Reached'] = df['Highest_Stage_Reached'].map({'App Submitted': 0,
                                                                   'Admit': 0,
                                                                   'App Started': 0,
                                                                   'App Complete': 0,
                                                                   'Deposit': 0,
                                                                   'Pre-Enrolled': 0,
                                                                   'Paid': 0,
                                                                   'Enrolled': 1})

    df_historic = df[df.Term != 'Fall 2018']
    df_current = df[df.Term == 'Fall 2018']
    df_enrolled = df_historic[df_historic.Highest_Stage_Reached == 1]
    df_nonenrolled = df_historic[df_historic.Highest_Stage_Reached == 0]
    
    df_nonenrolled = df_nonenrolled.sample(len(df_enrolled.index))
    df_historic = df_nonenrolled.append(df_enrolled)
    df = df_historic.append(df_current)
    return df

In [15]:
def Encode(df):
    historic = len(df[df.Term != 'Fall 2018'].index)
    current = len(df[df.Term == 'Fall 2018'].index)

    df = pd.get_dummies(df)
    
    df_historic, df_current = df.head(historic), df.tail(current)
    df_current = df_current.drop('Highest_Stage_Reached', axis = 1)
    df_target = df_historic['Highest_Stage_Reached']
    df_historic = df_historic.drop('Highest_Stage_Reached', axis = 1)
    
    X_train, X_test, y_train, y_test = train_test_split(df_historic, df_target, test_size = 0.2, random_state = 1)
    
    return X_train, X_test, y_train, y_test, df_current


In [16]:
def fit_predict_score(estimator, X_train, X_test, y_train, y_test, df_current):

    clf = estimator(probability=True) if 'probability' in estimator().get_params().keys() else estimator()
    clf.fit(X_train, y_train)

    Individual_prediction = clf.predict_proba(df_current) if 'predict_proba' in dir(estimator) else None
    Current_prediction = clf.predict(df_current)
    Enrolled_prediction = collections.Counter(Current_prediction)

    prediction = clf.predict(X_test)
    a = []
    b = []
    c = []
    for i in range(5):
        accuracy = accuracy_score(y_test, prediction)
        ra_score = roc_auc_score(y_test, prediction)
        f1_Score = f1_score(y_test, prediction)

        a.append(accuracy)
        accuracy_grouping = np.array(a)
        b.append(ra_score)
        ra_score_grouping = np.array(b)
        c.append(f1_Score)
        f1_Score_grouping = np.array(c)

    CV_Score = cross_val_score(clf, X_train, y_train, cv = 5)

    return Individual_prediction, Enrolled_prediction, f1_Score_grouping, accuracy_grouping, ra_score_grouping, CV_Score

In [17]:
df = Load_data('ERx_Jul_5_NoSpring.csv')
df['Distance_From_TWU'] = df.apply(Haversine, axis = 1)
df = Drop_dupes(df)
df = Dates_to_datetime(df)

In [18]:
# jjj made global at the top. stages = ['Inquiry','App_Started','App_Submitted','App_Complete','Admit','Deposit','Pre-Enrolled','Paid']
for s in stages[:-1]: # jjj
    newcolumn = 'Days_at_' + s
    df[newcolumn] = df.apply(Calculate_days_at_stage, stage=s, stages=stages, algo1=Get_relevant_stages, axis = 1)

In [19]:
df = StageDates_month_day(df)
df['Merge_GPA'] = df.apply(Merge_GPA, axis = 1)
df['Term_season'] = df.apply(Term_season, axis = 1)
df['Age'] = df.apply(Calculate_age, axis = 1)

In [20]:
df.sample(9)

Unnamed: 0,SIS_ID,Application_ID,Application_Created_Date,Contact_ID,First_Name,Last_Name,Latitude,Longitude,Gender,#_of_non-inquiry_Undergrad_Apps,...,Days_at_App_Started,Days_at_App_Submitted,Days_at_App_Complete,Days_at_Admit,Days_at_Deposit,Days_at_Pre-Enrolled,Days_at_Paid,Merge_GPA,Term_season,Age
5006,560120.0,a0H1500000JhDus,2016-09-22,0031500001nYzIU,Shelby,Gingrich,49.189461,-123.149712,Female,2.0,...,0,-1,-1,-1,-1,-1,-1,3.0,Fall,18
12787,553036.0,a0H1C00000SJJln,2017-10-03,0031500001nYyC8,Anne-Marie,Sievu,49.15243,-122.76872,Female,2.0,...,0,0,0,0,0,105,0,1.01,Fall,-1
1659,493023.0,a0H1500000Jh5gA,2016-09-21,0031500001mTjxQ,Quinn,Brandt,,,Female,2.0,...,0,-1,-1,-1,-1,-1,-1,-1.0,Fall,20
12664,574135.0,a0H1500000JhFra,2016-09-22,0031500001nZ1gK,Alexander,Shevalev,49.28816,-123.11658,Male,1.0,...,0,0,6,11,7,17,0,3.3,Fall,21
7258,559698.0,a0H1500000JhDq5,2016-09-22,0031500001nYzE1,Daniel,Kitenge Mushalame,,,Female,1.0,...,0,-1,-1,-1,-1,-1,-1,-1.0,Fall,27
7609,536885.0,a0H1500000JhAdy,2016-09-22,0031500001nYwZR,Clara,Lai,,,Female,1.0,...,0,33,0,-1,-1,-1,-1,4.3,Fall,19
5833,465239.0,a0H1500000Jh3Vi,2016-09-21,0031500001mTh4K,Chloe,Heuchert,,,Female,2.0,...,0,64,13,90,11,103,0,3.0,Fall,20
8068,491771.0,a0H1500000Jh5Rv,2016-09-21,0031500001mTjkY,Michael,Li,,,Male,1.0,...,0,79,0,-1,-1,-1,-1,4.3,Fall,20
2895,538208.0,a0H1500000JhAtQ,2016-09-22,0031500001nYwn0,Allison,Cortes,,,Female,1.0,...,0,56,0,-1,-1,-1,-1,3.7,Fall,20


In [21]:
df_view_results = Final_cleaning(df)
df_view_results = Balance_historic_data(df_view_results)

In [22]:
#df_view_results = df_view_results.sort_values(by=['Term','Application_ID'])
df_view_results.tail(9)

Unnamed: 0,Gender,Term,Country_of_Citizenship,Canada_Status,Aboriginal_Student,Applying_for_Financial_Aid,Program_Of_Interest,Stream,National_Student_Status,Highest_Stage_Reached,Source_Code,Entrance_Type,Admit_City,Admit_State/Province,Admit_Country,Current_Region,Distance_From_TWU,Merge_GPA,Term_season,Age
15601,Male,Fall 2018,China,Visitor VISA,0,Yes,Art and Design,Unknown,International,0,PSIBC,New,Salmon Arm,British Columbia,Canada,UG BC Revelstoke Golden,296.388534,2.3,Fall,18
15602,Female,Fall 2018,China,Unknown,0,No,Pre-Engineering,Unknown,International,0,Unknown,New,Pitt Meadows,British Columbia,Canada,UG North of Fraser,10.184348,-1.0,Fall,19
15603,Male,Fall 2018,Canada,Unknown,0,Yes,Nursing,Unknown,Canadian,0,Stealth Application,New,Brantford,Ontario,Canada,UG Ontario,3283.345382,-1.0,Fall,17
15609,Female,Fall 2018,Canada,Unknown,0,Yes,Worship Arts,Unknown,Canadian,0,Unknown,Transfer,Gloucester,Ontario,Canada,UG Transfer Canada East Fall,-1.0,3.36,Fall,22
15610,Female,Fall 2018,Canada,Unknown,0,No,Business Administration (B.B.A.),International Business,Canadian,0,PSIBC,New,Chilliwack,British Columbia,Canada,UG Eastern Fraser Valley,56.906737,4.3,Fall,17
15612,Female,Fall 2018,Canada,Unknown,0,Yes,Biology,Unknown,Canadian,0,Stealth Application,New,Brooks,Alberta,Canada,UG Alberta,782.701864,4.3,Fall,17
15613,Female,Fall 2018,Canada,Unknown,0,Yes,Nursing,Unknown,Canadian,0,Stealth Application,Transfer,Aurora,Ontario,Canada,UG Transfer Canada East Fall,3301.626071,3.7,Fall,18
15615,Female,Fall 2018,Canada,Unknown,0,No,Nursing,Unknown,Canadian,0,Unknown,Transfer,Ottawa,Ontario,Canada,UG Transfer Canada East Fall,3506.637171,2.0,Fall,26
15616,Female,Fall 2018,Canada,Unknown,0,Yes,Nursing,Unknown,Canadian,0,Dan Bremnes 2017,New,Grand Prairie,Alberta,Canada,UG Alberta,-1.0,4.0,Fall,16


In [23]:
X_train, X_test, y_train, y_test, df_current = Encode(df_view_results)

In [24]:
df_current.tail(9)

Unnamed: 0,Aboriginal_Student,Distance_From_TWU,Merge_GPA,Age,Gender_Female,Gender_M,Gender_Male,Gender_Unknown,Term_Fall 2014,Term_Fall 2015,...,Current_Region_UG Vancouver,Current_Region_UG Washington,Current_Region_UG Washington North,Current_Region_UG Washington North Transfer,Current_Region_UG Washington South,Current_Region_UG Washington South Transfer,Current_Region_UG Washington Transfer,Current_Region_UG Western USA,Current_Region_UG Western USA Transfer,Term_season_Fall
15601,0,296.388534,2.3,18,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
15602,0,10.184348,-1.0,19,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
15603,0,3283.345382,-1.0,17,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
15609,0,-1.0,3.36,22,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
15610,0,56.906737,4.3,17,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
15612,0,782.701864,4.3,17,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
15613,0,3301.626071,3.7,18,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
15615,0,3506.637171,2.0,26,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
15616,0,-1.0,4.0,16,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [25]:
X_train.sample(9)

Unnamed: 0,Aboriginal_Student,Distance_From_TWU,Merge_GPA,Age,Gender_Female,Gender_M,Gender_Male,Gender_Unknown,Term_Fall 2014,Term_Fall 2015,...,Current_Region_UG Vancouver,Current_Region_UG Washington,Current_Region_UG Washington North,Current_Region_UG Washington North Transfer,Current_Region_UG Washington South,Current_Region_UG Washington South Transfer,Current_Region_UG Washington Transfer,Current_Region_UG Western USA,Current_Region_UG Western USA Transfer,Term_season_Fall
11414,0,-1.0,3.3,20,0,0,1,0,1,0,...,1,0,0,0,0,0,0,0,0,1
15189,0,-1.0,-1.0,-1,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1
4887,0,3648.777522,-1.0,20,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5862,0,-1.0,4.0,20,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
12276,0,26.648209,4.0,18,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7071,0,-1.0,3.94,24,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1910,0,-1.0,-1.0,20,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
3633,0,21.517484,3.0,19,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5153,0,-1.0,4.3,21,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [26]:
estimator = SVC
Individual_prediction, Enrolled_prediction, f1_Score_grouping, accuracy_grouping, ra_score_grouping, CV_Score = fit_predict_score(estimator, X_train, X_test, y_train, y_test, df_current)

In [107]:
indiv = pd.DataFrame(Individual_prediction[:,1])
#indiv.head(9)
current = df_view_results[df.Term == 'Fall 2018']
current.sample(9)

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Application_ID,Gender,Term,Country_of_Citizenship,Canada_Status,Aboriginal_Student,Applying_for_Financial_Aid,Program_Of_Interest,Stream,National_Student_Status,...,Source_Code,Entrance_Type,Admit_City,Admit_State/Province,Admit_Country,Current_Region,Distance_From_TWU,Merge_GPA,Term_season,Age
6628,a0H1C00000crDgr,Male,Fall 2018,Canada,Unknown,0,Unknown,Game Development,Unknown,Canadian,...,Unknown,New,Abbotsford,British Columbia,Canada,UG Abbotsford,-1.0,2.7,Fall,18
1356,a0H1C00000XvYgD,Male,Fall 2018,Pakistan,Unknown,0,No,Business Administration (B.B.A.),Unknown,International,...,Unknown,New,Sohar,Unknown,Oman,UG International Students,-1.0,2.7,Fall,18
4871,a0H1C00000UdLuj,Male,Fall 2018,Canada,Unknown,0,No,Business Administration (B.A.),Unknown,Canadian,...,Stealth Application,New,Surrey,British Columbia,Canada,UG Surrey 1,13.157022,3.0,Fall,17
14111,a0H1500000N3lEi,Male,Fall 2018,Canada,Unknown,0,No,Nursing,Unknown,Canadian,...,PSIBC,Transfer,Burnaby,British Columbia,Canada,UG Burnaby,25.799895,4.0,Fall,17
6310,a0H1C00000VoBjB,Female,Fall 2018,Canada,Unknown,0,Yes,Sport and Leisure Management,Unknown,Canadian,...,Stealth Application,New,Oakville,Ontario,Canada,UG Ontario,3309.092688,4.0,Fall,17
11661,a0H1C00000Ud0Dq,Unknown,Fall 2018,Canada,Unknown,0,Yes,Pre-Medicine,Unknown,Canadian,...,Stealth Application,New,Blind Bay,British Columbia,Canada,UG BC Revelstoke Golden,301.159057,-1.0,Fall,17
12770,a0H1500000Q6WyK,Male,Fall 2018,Canada,Unknown,0,No,Nursing,Unknown,Canadian,...,Stealth Application,Transfer,Surrey,British Columbia,Canada,UG Surrey 1,11.630703,3.7,Fall,18
12539,a0H1500000N1v09,Female,Fall 2018,Canada,Unknown,0,No,Nursing,Unknown,Canadian,...,PSIBC,New,Surrey,British Columbia,Canada,UG Surrey 1,15.20775,3.0,Fall,16
12721,a0H1C00000XwxyM,Female,Fall 2018,Canada,Unknown,0,Yes,Psychology,Unknown,Canadian,...,Unknown,Transfer,Calgry,Alberta,Canada,UG Transfer Canada East Fall,636.370775,1.91,Fall,19


In [108]:
print(str(estimator),
    "\nNumber of students predicted to enroll: ", Enrolled_prediction,
    "\nf1 Score: %0.2f (+/- %0.2f)" % (f1_Score_grouping.mean(), f1_Score_grouping.std() * 2),
    "\nAccuracy of prediction: %0.2f (+/- %0.2f)" % (accuracy_grouping.mean(), accuracy_grouping.std() * 2),
    "\nROC AUC score: %0.2f (+/- %0.2f)" % (ra_score_grouping.mean(), ra_score_grouping.std() * 2),
    "\nCV_Accuracy: %0.2f (+/- %0.2f)" % (CV_Score.mean(), CV_Score.std() * 2),
    "\n" + "-"*80)

<class 'sklearn.naive_bayes.BernoulliNB'> 
Number of students predicted to enroll:  Counter({0: 1978, 1: 1414}) 
f1 Score: 0.75 (+/- 0.00) 
Accuracy of prediction: 0.73 (+/- 0.00) 
ROC AUC score: 0.73 (+/- 0.00) 
CV_Accuracy: 0.73 (+/- 0.03) 
--------------------------------------------------------------------------------


In [109]:
for estimator in [BernoulliNB, GaussianNB, LinearSVC, LogisticRegression, SVC]:
    Individual_prediction, Enrolled_prediction, f1_Score_grouping, accuracy_grouping, ra_score_grouping, CV_Score = fit_predict_score(estimator, X_train, X_test, y_train, y_test, df_current)
    print(str(estimator),
        "\nNumber of students predicted to enroll: ", Enrolled_prediction,
        "\nf1 Score: %0.2f (+/- %0.2f)" % (f1_Score_grouping.mean(), f1_Score_grouping.std() * 2),
        "\nAccuracy of prediction: %0.2f (+/- %0.2f)" % (accuracy_grouping.mean(), accuracy_grouping.std() * 2),
        "\nROC AUC score: %0.2f (+/- %0.2f)" % (ra_score_grouping.mean(), ra_score_grouping.std() * 2),
        "\nCV_Accuracy: %0.2f (+/- %0.2f)" % (CV_Score.mean(), CV_Score.std() * 2),
        "\n" + "-"*80)

<class 'sklearn.naive_bayes.BernoulliNB'> 
Number of students predicted to enroll:  Counter({0: 1978, 1: 1414}) 
f1 Score: 0.75 (+/- 0.00) 
Accuracy of prediction: 0.73 (+/- 0.00) 
ROC AUC score: 0.73 (+/- 0.00) 
CV_Accuracy: 0.73 (+/- 0.03) 
--------------------------------------------------------------------------------
<class 'sklearn.naive_bayes.GaussianNB'> 
Number of students predicted to enroll:  Counter({1: 2215, 0: 1177}) 
f1 Score: 0.71 (+/- 0.00) 
Accuracy of prediction: 0.63 (+/- 0.00) 
ROC AUC score: 0.62 (+/- 0.00) 
CV_Accuracy: 0.63 (+/- 0.04) 
--------------------------------------------------------------------------------
<class 'sklearn.svm.classes.LinearSVC'> 
Number of students predicted to enroll:  Counter({1: 2323, 0: 1069}) 
f1 Score: 0.77 (+/- 0.00) 
Accuracy of prediction: 0.73 (+/- 0.00) 
ROC AUC score: 0.73 (+/- 0.00) 
CV_Accuracy: 0.74 (+/- 0.02) 
--------------------------------------------------------------------------------
<class 'sklearn.linear_model.lo