# Using Machine Learning to Predict School Dropout in India

We import the required packages

In [None]:
import pandas as pd
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 1500)

import numpy as np
import os
import time
import sys
import pickle

We mount the Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Then we set the working directories

In [None]:
# Root folder of the project (placeholder path: edit before running)
wdir = os.path.join('/your','directory','to','the','project','folder')
# Folder the input data files are read from
input_dir = os.path.join(wdir,'data','input')
# Folder for outputs.
# NOTE(review): the name looks like a typo for `output_dir`; kept as-is because other cells may reference it
outut_dir = os.path.join(wdir,'data','output')

## Individual Level Data

We read the India Human Development Survey data. This dataset contains both waves (2005 and 2011-2012) of the IHDS. Among various topics, it covers education and has extensive information about current students. It is also possible to link information from the school and village surveys.

In [None]:
# Load the individual-level data (feather file) from the input folder
data_ind_raw = pd.read_feather(os.path.join(input_dir,'ind_data_2.feather'))

### Data Cleaning and Variable Selection

This data has many interesting variables, but only a small subset is relevant for our problem given that this information should be available to school principals. We select interesting variables and rename them so that their meaning can be inferred directly from the variable name.

In [None]:
# Maps the raw column codes of the individual-level data to descriptive names.
# Its keys are also used to select which raw columns are kept.
ind_rename_dict = {'HHBASE':'hh_id',
                   'HHSPLITID':'hh_split_id',
                   'PBASE':'person_id',
                   'STATEID':'state_id',
                   'DISTID':'district_id',
                   'PSUID':'village_id',
                   'PWAVES':'waves_present',
                   'SURVEY':'wave',
                   'URBAN4':'area_type',
                   'METRO6':'largest_6_metros',
                   'RO3':'sex', 
                   'RO5':'age', 
                   'RO9':'father_id',
                   'RO10':'mother_id',
                   'ED5':'enrolled',
                   'ED6':'completed_edu_years',
                   'ED7':'ever_repeated',
                   'CS4':'school_type',
                   'CS5':'school_distance',
                   'CS9':'year_eng_taught',
                   'CS10':'school_hrs_week',
                   'CS11':'hw_hrs_week',
                   'CS12':'pvt_tuitions_hrs_week',
                   'CS13':'days_absent',
                   'CS14Y':'mid_day_meal',
                   'CS21':'free_books',
                   'CS22':'govt_school_fees',
                   'CS23':'free_uniform',
                   'CS24Y':'scholarship',
                   'CS25':'spent_school_fees',
                   'CS26_27':'spent_other',
                   'CS28':'spent_pvt_tuitions',
                   'CH15':'average_student',
                   'CH17':'num_of_repeats',
                   'CH18':'ever_praised',
                   'CH19':'ever_beaten'
                  }

In [None]:
# Keep only the raw columns listed in the renaming dictionary (in dictionary order),
# then give them their descriptive names.
data_ind = data_ind_raw.loc[:, list(ind_rename_dict)]
data_ind_renamed = data_ind.rename(columns=ind_rename_dict)

We create a unique identifier for each individual and we create two new variables to link respondents to their parents.

In [None]:
# Store the household and person ids as strings so they can be concatenated into a key.
# Going through int first drops any decimal part (presumably the raw ids are floats — verify);
# note that astype(int) raises if an id is missing.
data_ind_renamed['hh_id'] = data_ind_renamed['hh_id'].astype(int).astype(str)
data_ind_renamed['person_id'] = data_ind_renamed['person_id'].astype(int).astype(str)

In [None]:
# Build the individual key and the keys pointing to each respondent's parents by
# concatenating the household id with the person / father / mother id.
# NOTE(review): the ids are joined without a separator, so e.g. household '11' + person '1'
# and household '1' + person '11' would give the same key — confirm the id format rules this out.
data_ind_renamed['respid'] = data_ind_renamed['hh_id'] + data_ind_renamed['person_id']
data_ind_renamed['father_respid'] = data_ind_renamed['hh_id'] + data_ind_renamed['father_id'].astype(str)
data_ind_renamed['mother_respid'] = data_ind_renamed['hh_id'] + data_ind_renamed['mother_id'].astype(str)

In [None]:
# Clean the parent keys: remove the 'nan' left by missing parent ids and the '.0' suffix
# left by converting a float id to a string (presumably ids look like '3.0' — verify).
# The pattern is a raw string and regex=True is explicit: since pandas 2.0 str.replace
# treats the pattern as a literal by default, so the escaped '\.0' would never match.
data_ind_renamed['father_respid'] = (data_ind_renamed['father_respid']
                                     .str.replace('nan','',regex=False)
                                     .str.replace(r'\.0','',regex=True))
data_ind_renamed['mother_respid'] = (data_ind_renamed['mother_respid']
                                     .str.replace('nan','',regex=False)
                                     .str.replace(r'\.0','',regex=True))

We now recode categorical variables, assigning integer values to each category. While recoding the data in this way we also create a dictionary to keep track of the mapping between categories and integer values.

In [None]:
def recode_cats(data):

    '''
        Returns a copy of the data frame where categorical columns have been converted
        to integer codes, and a dictionary that maps categories to those codes.
        The code of a category is its position in the column's `cat.categories`.
        Missing values stay missing (NaN), which makes the column float; columns
        without missing values are int64. Non-categorical columns are left untouched
        and the input data frame is not modified.
        Arguments:
            data is a dataframe
        Returns:
            a dataframe and a dictionary of the form {column: {category: code}}
    '''

    recode_dict = {}
    recoded = data.copy()

    for col in data.select_dtypes(include='category').columns:
        categories = data[col].cat.categories
        recode_dict[col] = dict(zip(categories,range(len(categories))))

        # The category codes are exactly the positions stored in recode_dict.
        # Reading them directly avoids DataFrame.replace on categorical data,
        # which is deprecated and breaks when categories and codes overlap.
        codes = data[col].cat.codes.astype('int64')
        # pandas encodes a missing category as -1: turn it back into NaN
        recoded[col] = codes.where(codes >= 0)

    return (recoded,recode_dict)

In [None]:
# Recode the individual-level data; recode_ind_dict keeps the category -> integer mapping per column
data_ind_recoded, recode_ind_dict = recode_cats(data_ind_renamed)

We won't be using information from the second wave (at least for the moment). We thus subset the data to keep only observations from wave 1. The idea is that we cannot build a model to predict dropout using information from the future (relative to the time of prediction).

In [None]:
# Keep the rows whose recoded `wave` is 0, i.e. the first category of the original column
# (presumably the 2005 wave — verify against recode_ind_dict['wave']).
# .copy() gives an independent frame rather than a view on data_ind_recoded.
data_ind_w1 = data_ind_recoded[data_ind_recoded.wave == 0].copy()

## Linking Parents' Info

As I mentioned before, we wish to merge individual information with parents' information which should prove useful to model each kid's social and economic background. To create the combined dataframe, we first create two separate dataframes for mothers and fathers, then, using the unique ids we generated in the previous step, we can merge all individuals with their parents (if they are in the data).

In [None]:
# Columns of a parent's own record that are attached to the child's row
parents_features = ['respid','age','completed_edu_years']

# Left-join the wave-1 data with itself: the parent's record is looked up through the
# child's father_respid, and its columns are renamed so they do not clash with the child's.
# Children whose father is not in the data keep NaN in the father_* columns.
data_ind_w1 = data_ind_w1.merge(
    data_ind_w1[parents_features].rename(
        columns={'respid':'father_respid',
                 'age':'father_age',
                 'completed_edu_years':'father_edu'}
    ),
    how='left',
    on='father_respid',
)

# Same lookup for the mother
data_ind_w1 = data_ind_w1.merge(
    data_ind_w1[parents_features].rename(
        columns={'respid':'mother_respid',
                 'age':'mother_age',
                 'completed_edu_years':'mother_edu'}
    ),
    how='left',
    on='mother_respid',
)

## Who's at Risk of Dropping Out?

Only kids who were still in education in the first wave are at risk of dropping out. We are thus not interested in individuals who were out of education in the first wave, and can drop them from the sample.

In [None]:
# Keep the wave-1 rows whose recoded `enrolled` equals 1
# (presumably the code for "currently enrolled" — verify against recode_ind_dict['enrolled'])
students_data = data_ind_w1[data_ind_w1.enrolled == 1]

The only information from wave 2 we want to preserve is whether the respondent is still enrolled in education and how many years they have completed. This information allows us to identify respondents who dropped out in the period between waves and to determine at what grade they did so.

In [None]:
# From the rows with wave == 1 keep only the key, the enrolment status and the completed
# years of education, renamed so they can sit next to the wave-1 columns.
enrolled_w2 = (
    data_ind_recoded
    .loc[data_ind_recoded.wave == 1, ['respid','enrolled','completed_edu_years']]
    .rename(columns={'enrolled':'enrolled_w2',
                     'completed_edu_years':'final_edu'})
)

In [None]:
# Inner join (merge's default): students without a matching wave-2 record are dropped
students_data = students_data.merge(enrolled_w2,on='respid')

To construct our target variable, we need to know whether students enrolled in wave 1 left education before wave 2 and when they left. We thus drop observations for which this information is not available. 

In [None]:
# Drop students whose wave-2 enrolment status or completed years of education is missing
students_data = students_data.dropna(subset=['enrolled_w2','final_edu'])

## Household Level Data

We have a set of variables for the kids, and their parents. There is however, a third set of variables which may prove relevant, information on the kid's household. This information is stored in a different dataset, which we thus need to import, clean, and then merge.

In [None]:
# Load the household-level data (feather file) from the input folder
data_hh_raw = pd.read_feather(os.path.join(input_dir,'house_data_1.feather'))

We again select just a subset of the available variables and give them intuitive names.

In [None]:
# Maps the raw column codes of the household-level data to descriptive names.
# Its keys are also used to select which raw columns are kept.
hh_rename_dict = {'HHBASE':'hh_id',
                  'XGROUPS6':'caste_religion',
                  'XID14':'main_income_source',
                  'XID15':'years_in_place',
                  'XDB5':'total_debt',
                  'XCI7S':'confidence_schools',
                  'XASSETS5':'std_of_living_quint',
                  'XINCOME5':'income_quint',
                  'XNPERSONS':'hh_size',
                  'XNCHILDM':'boys_0_14',
                  'XNCHILDF':'girls_0_14',
                  'XNTEENM':'boys_15_21',
                  'XNTEENF':'girls_15_21',
                  'XNELDERM':'men_over_60',
                  'XNELDERF':'women_over_60',
                  'XNWKSALARY':'num_emp_with_salary',
                  'XCG1':'owns_house',
                  'XCG4':'owns_bicycle',
                  'XCGVEHICLE':'owns_vehicle',
                  'XCG5':'owns_sewing_mac',
                  'XCG6':'owns_generator',
                  'XCG7':'owns_mixer',
                  'XCG8':'owns_motor_cycle',
                  'XCGMOTORV':'owns_motor_vehicle',
                  'XCGTV':'owns_tv',
                  'XCG11':'owns_air_cooler', 
                  'XCG12':'owns_watch', 
                  'XCG13':'owns_electric_fan',
                  'XCG14':'owns_chair_table',
                  'XCG15':'owns_cot',
                  'XCG16':'owns_telephone',
                  'XCG17':'owns_mobile_phone',
                  'XCG18':'owns_fridge',
                  'XCG19':'owns_pressure_cooker',
                  'XCG23':'owns_washing_mac',
                  'XCG24':'owns_computer',
                  'XCG26':'owns_credit_card',
                  'XCG28':'owns_two_clothes',
                  'XCG29':'owns_footwear'
                 }

In [None]:
# Keep only the raw household columns listed in the renaming dictionary and rename them
data_hh = data_hh_raw.loc[:, list(hh_rename_dict)]
data_hh_renamed = data_hh.rename(columns=hh_rename_dict)
# Same string format as the hh_id of the individual-level data, so the two can be joined
data_hh_renamed['hh_id'] = data_hh_renamed['hh_id'].astype(int).astype(str)

There are some households with multiple records, we thus remove duplicates.

In [None]:
# Drop rows that repeat the same household id, standard-of-living quintile, income quintile and size.
# NOTE(review): rows sharing an hh_id but differing in one of the other three columns are all kept,
# so hh_id may still not be unique — confirm, since a later merge on hh_id would then duplicate students.
data_hh_renamed = data_hh_renamed.drop_duplicates(['hh_id','std_of_living_quint','income_quint','hh_size'])

We recode the categorical variables with numeric values and store the mapping into a dictionary as before.

In [None]:
# Recode the household data; recode_hh_dict keeps the category -> integer mapping per column
data_hh_recoded, recode_hh_dict = recode_cats(data_hh_renamed)

Finally, we merge the household data with the students' data

In [None]:
# Left join: every student is kept, with NaN household columns when the household is not found
students_data = students_data.merge(data_hh_recoded, on='hh_id', how='left')

## Building the Target Variable

We are finally ready to build the target variable: whether a kid left education before grade 9.

In [None]:
# Difference between the wave-1 and the wave-2 enrolment codes: it is 1 when `enrolled` is 1
# and `enrolled_w2` is 0 (presumably "enrolled in wave 1, not enrolled in wave 2" — verify the coding)
students_data['left_edu'] = students_data['enrolled'] - students_data['enrolled_w2'] 

In [None]:
# Earlier version of the mask, based on the years of education completed at wave 1.
# NOTE(review): not used by any cell in view; the mask built from `final_edu` is the one applied.
left_before_9th_old = (students_data['left_edu'] == 1) & (students_data['completed_edu_years'] <9)

In [None]:
# True for students with left_edu == 1 whose wave-2 completed years of education are below 9
left_before_9th = (students_data['left_edu'] == 1) & (students_data['final_edu'] <9)

In [None]:
# Target variable: keeps left_edu (which is 1 there) where the mask holds and is 0 everywhere else
students_data['left_before_9th'] = students_data['left_edu'].where(left_before_9th,0)

## Selecting Predictors and Cleaning the Data
We select only some of the predictors, based on which variables seem relevant and which ones are likely to be available to school principals. You'll also see a list of additional household predictors. I have tried adding those to the models but the performance did not significantly improve. In the interest of parsimony, I decided to drop them.

In [None]:
# Predictors measured on the student
ind_predictors = ['state_id','area_type','largest_6_metros','sex','age',
                  'ever_repeated','school_type','school_distance','year_eng_taught',
                  'school_hrs_week','hw_hrs_week','pvt_tuitions_hrs_week','days_absent',
                  'mid_day_meal','free_books','govt_school_fees','free_uniform','scholarship',
                  'spent_school_fees','spent_other','spent_pvt_tuitions','average_student',
                  'num_of_repeats','ever_praised','ever_beaten']

# Predictors taken from the parents' records
parent_predictors = ['father_age','father_edu','mother_age','mother_edu']

# Predictors measured on the household
hh_predictors = ['std_of_living_quint','income_quint','hh_size','caste_religion','main_income_source',
                 'years_in_place','total_debt','confidence_schools','boys_0_14','girls_0_14',
                 'boys_15_21','girls_15_21','men_over_60','women_over_60','num_emp_with_salary']

# Additional household predictors (asset ownership); not included in `predictors` below
add_hh_predictors = ['owns_house','owns_bicycle','owns_vehicle','owns_sewing_mac',
                     'owns_generator','owns_mixer','owns_motor_cycle','owns_motor_vehicle',
                     'owns_tv','owns_air_cooler','owns_watch','owns_electric_fan',
                     'owns_chair_table','owns_cot','owns_telephone','owns_mobile_phone',
                     'owns_fridge','owns_pressure_cooker','owns_washing_mac','owns_computer',
                     'owns_credit_card','owns_two_clothes','owns_footwear']

# Full list of features given to the models
predictors = ind_predictors + parent_predictors + hh_predictors

# Recoded columns that are nevertheless excluded from the list of categorical columns
cat_exceptions = ['year_eng_taught']

The preprocessing function performs the following operations on the data:

1.   It standardises continuous features using the mean and the standard deviation for the training set;
2.   It fills NA for continuous features with the training set mean;
3.   It fills NA for categorical features with the value 9999;
4.   It applies one-hot-encoding to all categorical features.



In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

def preprocess(data,cat_cols,test=False,scaler=None):
    
    '''
        Returns a preprocessed copy of the data and the scaler used to standardise it.
        Non-categorical columns are standardised and their missing values are set to 0
        (the mean after standardisation); categorical columns have missing values set
        to 9999 and are then one-hot encoded. The row index of the result is reset.
        On the training set a new StandardScaler is fitted and the resulting column
        layout is stored on it as `prepro_columns_`. On the test set the scaler passed
        in is applied and, if it carries `prepro_columns_`, the result is aligned to
        that layout: dummy columns for categories unseen in training are dropped and
        dummy columns missing from the test set are added, filled with 0. Without this
        the training and test matrices can end up with different columns.
        Arguments:
            data is a dataframe
            cat_cols is a list-like object listing categorical columns in data
            test is a boolean indicating whether the data passed is from the test set
            scaler is the scaler returned for the training set, needed only for the test set
        Returns:
            a dataframe with the preprocessed features
            the scaler
        Raises:
            ValueError if test is True and no scaler is passed
    '''
    
    if test and scaler is None:
        raise ValueError('a fitted scaler is required when test=True')

    cat_cols = list(cat_cols)
    non_cat_cols = [col for col in data.columns if col not in cat_cols]
    
    cat_features = data[cat_cols]
    non_cat_features = data[non_cat_cols]
    
    if test:
        non_cat_features_st = pd.DataFrame(scaler.transform(non_cat_features))
    else:
        scaler = StandardScaler()
        non_cat_features_st = pd.DataFrame(scaler.fit_transform(non_cat_features))
        
    non_cat_features_st.columns = non_cat_features.columns
    # After standardisation 0 is the training mean
    non_cat_features_st = non_cat_features_st.fillna(0)
    
    # Missing categories get their own dummy column through the 9999 placeholder
    cat_features = cat_features.fillna(9999)
    dummy_cat_features = pd.get_dummies(cat_features,columns=cat_cols).reset_index(drop=True)

    prepro_data = pd.concat([non_cat_features_st,dummy_cat_features],axis=1)

    if test:
        # Scalers fitted elsewhere carry no layout: the data is then returned unaligned
        train_columns = getattr(scaler,'prepro_columns_',None)
        if train_columns is not None:
            prepro_data = prepro_data.reindex(columns=train_columns,fill_value=0)
    else:
        scaler.prepro_columns_ = list(prepro_data.columns)
        
    return (prepro_data,scaler)

We import some additional modules from sklearn and create the preprocessed training and test sets using a 0.9 - 0.1 split.

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.dummy import DummyClassifier
from sklearn import tree
from sklearn import svm

X = students_data.loc[:,predictors]
# Select the target by name: picking "the last column" silently changes meaning
# as soon as another column is appended to students_data.
y = students_data['left_before_9th']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Categorical predictors are the ones that were recoded from a categorical column, minus
# the explicit exceptions. The list follows the column order of X_train: going through a
# set would make the order (and so the layout of the dummy columns) vary between sessions.
recoded_cols = set(recode_ind_dict) | set(recode_hh_dict)
cat_cols = [col for col in X_train.columns
            if col in recoded_cols and col not in cat_exceptions]
X_train_pp, scaler = preprocess(X_train, cat_cols)
X_test_pp, _ = preprocess(X_test, cat_cols, test=True, scaler=scaler)

The last thing we need to do is to build a performance dictionary to store the performance metrics for all the models.

In [None]:
# Filled by the evaluation cells: one entry per model name
perform_dict = {}

In [None]:
def find_treshold(clf,X,y,is_nn=False,min_recall=0.7,max_recall=0.72,max_iter=10000):

  '''
      Searches for a decision threshold on the score of class 0 such that the recall
      obtained on (X, y) lies in [min_recall, max_recall). An observation is predicted
      as 1 when its class-0 score is not above the threshold. The threshold starts at 0
      and moves by an adaptive step: up while the recall is too low, down when too high.
      Progress is printed every 10 iterations.
      Arguments:
          clf is a fitted classifier exposing predict_proba (not used when is_nn is True)
          X are the features, y the true labels
          is_nn is a boolean; when True the scores come from the global `model`, applied
              to X moved to the global `device` (X must then be a torch tensor)
          min_recall, max_recall delimit the target recall interval
          max_iter is the maximum number of search steps
      Returns:
          the threshold. If the interval is not reached within max_iter steps (for
          instance when the scores take only a few distinct values) a message is
          printed and the threshold whose recall was closest to the interval is returned.
  '''

  recall = 0
  threshold = 0
  change = 0.00001
  i = 0
  update = 1

  # The scores do not depend on the threshold: compute them once, not at every step
  if is_nn:
    scores = model(X.to(device))
  else:
    prob_negative = clf.predict_proba(X)[:,0]

  # Best candidate so far, used only if the search runs out of iterations
  closest_threshold = threshold
  closest_recall = recall
  closest_gap = float('inf')

  while not (recall >= min_recall and recall < max_recall):

    if i >= max_iter:
      print(f'Target recall not reached in {max_iter} iterations, returning the closest threshold found')
      print(f'Final Threshold: {closest_threshold}, Final Recall: {closest_recall}')
      return closest_threshold

    # Grow the step while the recall barely moves, shrink it when it jumps
    if abs(update) < 0.001:
      change *= 1.1
    if abs(update) > 0.005:
      change *= 0.9

    if recall - min_recall > 0:
      threshold -= change
    else:
      threshold += change

    if is_nn:
      y_pred = torch.where(scores[:,0] > threshold, 0, 1).cpu().detach().numpy()
    else:
      y_pred = np.where(prob_negative > threshold, 0, 1)

    update = recall_score(y, y_pred) - recall
    recall += update
    i +=1

    # Distance of the current recall from the target interval (0 when inside it)
    gap = max(min_recall - recall, recall - max_recall, 0)
    if gap < closest_gap:
      closest_gap = gap
      closest_threshold = threshold
      closest_recall = recall

    if i%10 == 9:
      print(f'Threshold: {threshold}, Recall: {recall}')

  print(f'Final Threshold: {threshold}, Final Recall: {recall}')
  return threshold


## Defining the Baseline

Because this is a binary classification task, a decision tree seems a good model to use as the baseline. I tuned the tree's depth with cross validation and selected a value of 7 and, given this value, I select the cost-complexity-pruning alpha parameter using the `cost_complexity_pruning_path` function in sklearn and 5-fold cross validation.

In [None]:
def automatic_dt_pruning(dt_classifier, X, y):
    """
    Returns the pruning parameter (i.e., ccp_alpha) with the highest cross-validated score

    The candidate values are the effective alphas of the classifier's cost-complexity
    pruning path on (X, y). Each candidate is scored with 5-fold cross validation using
    the classifier's default score. When several candidates tie, the first one on the
    path is returned.

    Args:
        dt_classifier           : An Sklearn DecisionTreeClassifier (e.g., created by "tree.DecisionTreeClassifier(criterion='entropy')")
        X (Pandas.DataFrame)    : Input Features
        y (Pandas.Series)       : Labels

    Returns:
        best_ccp_alpha : Tuned pruning parameter with the highest cross-validated score

    Notes:
        1. dt_classifier is modified: on return its ccp_alpha is the last candidate tried,
           so the caller has to set the returned value explicitly.
        2. The global numpy random seed is set to 42.
    """

    np.random.seed(42)
    ccp_alphas = dt_classifier.cost_complexity_pruning_path(X, y)['ccp_alphas']
    # Rounding error can leave the smallest alphas slightly below zero,
    # and a negative ccp_alpha is rejected by the classifier.
    ccp_alphas = np.clip(ccp_alphas, 0, None)
    print(f'Trying {len(ccp_alphas)} values for alpha')

    scores = []
    for ccp_alpha in ccp_alphas:
        dt_classifier.set_params(ccp_alpha=ccp_alpha)
        scores.append(cross_val_score(dt_classifier, X, y, cv=5).mean())

    return ccp_alphas[np.argmax(scores)]

In [None]:
clf = DecisionTreeClassifier(random_state=0, max_depth=7, class_weight='balanced')
# The search has to run here: ccp_alpha is used right below and is not defined anywhere else,
# so with this line commented out a fresh run stops with a NameError.
ccp_alpha = automatic_dt_pruning(clf, X_train_pp, y_train)
print(f'The best value for ccp alpha is: {ccp_alpha}')
# The search leaves the last candidate on clf: set the selected value before fitting
clf.set_params(ccp_alpha=ccp_alpha)
clf.fit(X_train_pp,y_train)

In [None]:
# Threshold on the class-0 probability that brings the recall into find_treshold's target band.
# NOTE(review): the threshold is tuned on the test set itself, so the test recall is fixed by
# construction — confirm this is intended rather than tuning on training/validation data.
threshold = find_treshold(clf,X_test_pp,y_test)
# Only the prediction step is timed
start_time = time.time()
# Predict 1 unless the class-0 probability is above the threshold
y_train_pred = np.where(clf.predict_proba(X_train_pp)[:,0] > threshold, 0, 1)
y_test_pred = np.where(clf.predict_proba(X_test_pp)[:,0] > threshold, 0, 1)
prediction_time = round(time.time() - start_time,3)

In [None]:
# Size of the pickled model, in MB
p = pickle.dumps(clf)
model_size = sys.getsizeof(p)/(1024**2)

# ROC AUC measures how well the scores rank the observations, so it is computed from the
# predicted probability of class 1; on thresholded 0/1 labels it reduces to balanced accuracy.
y_train_score = clf.predict_proba(X_train_pp)[:,1]
y_test_score = clf.predict_proba(X_test_pp)[:,1]

accuracy_train = round(accuracy_score(y_train, y_train_pred),3)
precision_train = round(precision_score(y_train, y_train_pred),3)
recall_train = round(recall_score(y_train, y_train_pred),3)
f1_score_train = round(f1_score(y_train, y_train_pred),3)
roc_auc_train = round(roc_auc_score(y_train, y_train_score),3)
# Share of observations predicted as positive, in percent
pct_positive_train = round((sum(y_train_pred)/len(y_train_pred))*100,3)

accuracy_test = round(accuracy_score(y_test, y_test_pred),3)
precision_test = round(precision_score(y_test, y_test_pred),3)
recall_test = round(recall_score(y_test, y_test_pred),3)
f1_score_test = round(f1_score(y_test, y_test_pred),3)
roc_auc_test = round(roc_auc_score(y_test, y_test_score),3)
pct_positive_test = round((sum(y_test_pred)/len(y_test_pred))*100,3)

# Store the metrics under the model's name
perform_dict['Decision Tree'] = {'Training':{'Accuracy':accuracy_train,
                                             'Recall':recall_train,
                                             'Precision':precision_train,
                                             'F1-Score':f1_score_train,
                                             'ROC AUC':roc_auc_train,
                                             'Pct. Positive':pct_positive_train},
                                 'Test':{'Accuracy':accuracy_test,
                                         'Recall':recall_test,
                                         'Precision':precision_test,
                                         'F1-Score':f1_score_test,
                                         'ROC AUC':roc_auc_test,
                                         'Pct. Positive':pct_positive_test},
                                 'Model':{'Prediction Time':prediction_time,
                                          'Model Size':model_size}}

print(f'Model Prediction Time: {prediction_time} Seconds')
if model_size > 1:
  print(f'Model Size: {round(model_size,3)} MB')
else:
  print(f'Model Size: {round(model_size*1024)} KB')

print(f'We are classifying {pct_positive_test}% of the test observations as positive.\n')

print(f'Test Accuracy: {accuracy_test}, Test F1 Score: {f1_score_test}')
print(f'Test Precision: {precision_test}, Test Recall: {recall_test}')
print(f'Test Area Under the ROC Curve: {roc_auc_test}\n')

print(f'We are classifying {pct_positive_train}% of the training observations as positive.\n')

print(f'Train Accuracy: {accuracy_train}, Train F1 Score: {f1_score_train}')
print(f'Train Precision: {precision_train}, Train Recall: {recall_train}')

print(f'Train Area Under the ROC Curve: {roc_auc_train}')

## Exploring Different Models

The first alternative algorithm we tried is AdaBoost with 500 logistic regressions as base estimators.

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# The base learner is passed positionally: its keyword was renamed from `base_estimator`
# to `estimator` in recent scikit-learn releases, while its position (first argument)
# did not change, so this form works with both.
clf = AdaBoostClassifier(LogisticRegression(),
                         n_estimators=500, random_state=0)

clf.fit(X_train_pp,y_train)

In [None]:
# Threshold on the class-0 probability that brings the recall into find_treshold's target band.
# NOTE(review): the threshold is tuned on the test set itself, so the test recall is fixed by
# construction — confirm this is intended rather than tuning on training/validation data.
threshold = find_treshold(clf,X_test_pp,y_test)
# Only the prediction step is timed
start_time = time.time()
# Predict 1 unless the class-0 probability is above the threshold
y_train_pred = np.where(clf.predict_proba(X_train_pp)[:,0] > threshold, 0, 1)
y_test_pred = np.where(clf.predict_proba(X_test_pp)[:,0] > threshold, 0, 1)
prediction_time = round(time.time() - start_time,3)

In [None]:
# Size of the pickled model, in MB
p = pickle.dumps(clf)
model_size = sys.getsizeof(p)/(1024**2)

# ROC AUC measures how well the scores rank the observations, so it is computed from the
# predicted probability of class 1; on thresholded 0/1 labels it reduces to balanced accuracy.
y_train_score = clf.predict_proba(X_train_pp)[:,1]
y_test_score = clf.predict_proba(X_test_pp)[:,1]

accuracy_train = round(accuracy_score(y_train, y_train_pred),3)
precision_train = round(precision_score(y_train, y_train_pred),3)
recall_train = round(recall_score(y_train, y_train_pred),3)
f1_score_train = round(f1_score(y_train, y_train_pred),3)
roc_auc_train = round(roc_auc_score(y_train, y_train_score),3)
# Share of observations predicted as positive, in percent
pct_positive_train = round((sum(y_train_pred)/len(y_train_pred))*100,3)

accuracy_test = round(accuracy_score(y_test, y_test_pred),3)
precision_test = round(precision_score(y_test, y_test_pred),3)
recall_test = round(recall_score(y_test, y_test_pred),3)
f1_score_test = round(f1_score(y_test, y_test_pred),3)
roc_auc_test = round(roc_auc_score(y_test, y_test_score),3)
pct_positive_test = round((sum(y_test_pred)/len(y_test_pred))*100,3)

# Store the metrics under the model's name
perform_dict['AdaBoost'] = {'Training':{'Accuracy':accuracy_train,
                                        'Recall':recall_train,
                                        'Precision':precision_train,
                                        'F1-Score':f1_score_train,
                                        'ROC AUC':roc_auc_train,
                                        'Pct. Positive':pct_positive_train},
                            'Test':{'Accuracy':accuracy_test,
                                    'Recall':recall_test,
                                    'Precision':precision_test,
                                    'F1-Score':f1_score_test,
                                    'ROC AUC':roc_auc_test,
                                    'Pct. Positive':pct_positive_test},
                            'Model':{'Prediction Time':prediction_time,
                                     'Model Size':model_size}}

print(f'Model Prediction Time: {prediction_time} Seconds')
if model_size > 1:
  print(f'Model Size: {round(model_size,3)} MB')
else:
  print(f'Model Size: {round(model_size*1024)} KB')

print(f'We are classifying {pct_positive_test}% of the test observations as positive.\n')

print(f'Test Accuracy: {accuracy_test}, Test F1 Score: {f1_score_test}')
print(f'Test Precision: {precision_test}, Test Recall: {recall_test}')
print(f'Test Area Under the ROC Curve: {roc_auc_test}\n')

print(f'We are classifying {pct_positive_train}% of the training observations as positive.\n')

print(f'Train Accuracy: {accuracy_train}, Train F1 Score: {f1_score_train}')
print(f'Train Precision: {precision_train}, Train Recall: {recall_train}')

print(f'Train Area Under the ROC Curve: {roc_auc_train}')

We then tried a Random Forest classifier with 1000 base estimators, each with a max depth of 3. Random Forests have been successful in the past for this type of task.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 1000 trees of depth at most 3; class_weight='balanced' reweights the classes
# inversely to their frequency in the training labels
clf = RandomForestClassifier(max_depth=3,class_weight='balanced',
                             n_estimators=1000, random_state=0)

clf.fit(X_train_pp,y_train)

In [None]:
# Threshold on the class-0 probability that brings the recall into find_treshold's target band.
# NOTE(review): the threshold is tuned on the test set itself, so the test recall is fixed by
# construction — confirm this is intended rather than tuning on training/validation data.
threshold = find_treshold(clf,X_test_pp,y_test)
# Only the prediction step is timed
start_time = time.time()
# Predict 1 unless the class-0 probability is above the threshold
y_train_pred = np.where(clf.predict_proba(X_train_pp)[:,0] > threshold, 0, 1)
y_test_pred = np.where(clf.predict_proba(X_test_pp)[:,0] > threshold, 0, 1)
prediction_time = round(time.time() - start_time,3)

In [None]:
# Size of the pickled model, in MB
p = pickle.dumps(clf)
model_size = sys.getsizeof(p)/(1024**2)

# ROC AUC measures how well the scores rank the observations, so it is computed from the
# predicted probability of class 1; on thresholded 0/1 labels it reduces to balanced accuracy.
y_train_score = clf.predict_proba(X_train_pp)[:,1]
y_test_score = clf.predict_proba(X_test_pp)[:,1]

accuracy_train = round(accuracy_score(y_train, y_train_pred),3)
precision_train = round(precision_score(y_train, y_train_pred),3)
recall_train = round(recall_score(y_train, y_train_pred),3)
f1_score_train = round(f1_score(y_train, y_train_pred),3)
roc_auc_train = round(roc_auc_score(y_train, y_train_score),3)
# Share of observations predicted as positive, in percent
pct_positive_train = round((sum(y_train_pred)/len(y_train_pred))*100,3)

accuracy_test = round(accuracy_score(y_test, y_test_pred),3)
precision_test = round(precision_score(y_test, y_test_pred),3)
recall_test = round(recall_score(y_test, y_test_pred),3)
f1_score_test = round(f1_score(y_test, y_test_pred),3)
roc_auc_test = round(roc_auc_score(y_test, y_test_score),3)
pct_positive_test = round((sum(y_test_pred)/len(y_test_pred))*100,3)

# Store the metrics under the model's name
perform_dict['Random Forest'] = {'Training':{'Accuracy':accuracy_train,
                                             'Recall':recall_train,
                                             'Precision':precision_train,
                                             'F1-Score':f1_score_train,
                                             'ROC AUC':roc_auc_train,
                                             'Pct. Positive':pct_positive_train},
                                 'Test':{'Accuracy':accuracy_test,
                                         'Recall':recall_test,
                                         'Precision':precision_test,
                                         'F1-Score':f1_score_test,
                                         'ROC AUC':roc_auc_test,
                                         'Pct. Positive':pct_positive_test},
                                 'Model':{'Prediction Time':prediction_time,
                                          'Model Size':model_size}}

print(f'Model Prediction Time: {prediction_time} Seconds')
if model_size > 1:
  print(f'Model Size: {round(model_size,3)} MB')
else:
  print(f'Model Size: {round(model_size*1024)} KB')

print(f'We are classifying {pct_positive_test}% of the test observations as positive.\n')

print(f'Test Accuracy: {accuracy_test}, Test F1 Score: {f1_score_test}')
print(f'Test Precision: {precision_test}, Test Recall: {recall_test}')
print(f'Test Area Under the ROC Curve: {roc_auc_test}\n')

print(f'We are classifying {pct_positive_train}% of the training observations as positive.\n')

print(f'Train Accuracy: {accuracy_train}, Train F1 Score: {f1_score_train}')
print(f'Train Precision: {precision_train}, Train Recall: {recall_train}')

print(f'Train Area Under the ROC Curve: {roc_auc_train}')

The final standard ML model is a Stacking Classifier with two layers:

1.   The first layer has two learners, both Random Forests with 250 base estimators.
2.   The second layer has two learners, both Random Forests with 250 base estimators.

The final estimator of the second layer is a Logistic Regression.


In [None]:
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier

# Create Learners per layer.
# The two forests of a layer get different seeds: with the same parameters AND the same
# random_state they would be the same model, and the second one would add nothing.
layer_one_estimators = [
                        ('rf_1', RandomForestClassifier(n_estimators=250, random_state=42,class_weight='balanced')),
                        ('rf_2', RandomForestClassifier(n_estimators=250, random_state=43,class_weight='balanced')),
                       ]
layer_two_estimators = [
                        ('rf_3', RandomForestClassifier(n_estimators=250, random_state=42,class_weight='balanced')),
                        ('rf_4', RandomForestClassifier(n_estimators=250, random_state=43,class_weight='balanced')),
                       ]
# Second layer: its two forests are combined by a logistic regression
layer_two = StackingClassifier(estimators=layer_two_estimators, 
                               final_estimator=LogisticRegression(class_weight='balanced'))

# Create the final model: the second layer is the final estimator of the first layer
clf = StackingClassifier(estimators=layer_one_estimators, final_estimator=layer_two)

clf.fit(X_train_pp,y_train)

In [None]:
# Threshold on the class-0 probability that brings the recall into find_treshold's target band.
# NOTE(review): the threshold is tuned on the test set itself, so the test recall is fixed by
# construction — confirm this is intended rather than tuning on training/validation data.
threshold = find_treshold(clf,X_test_pp,y_test)
# Only the prediction step is timed
start_time = time.time()
# Predict 1 unless the class-0 probability is above the threshold
y_train_pred = np.where(clf.predict_proba(X_train_pp)[:,0] > threshold, 0, 1)
y_test_pred = np.where(clf.predict_proba(X_test_pp)[:,0] > threshold, 0, 1)
prediction_time = round(time.time() - start_time,3)

In [None]:
# Size of the pickled model, in MB
p = pickle.dumps(clf)
model_size = sys.getsizeof(p)/(1024**2)

# ROC AUC measures how well the scores rank the observations, so it is computed from the
# predicted probability of class 1; on thresholded 0/1 labels it reduces to balanced accuracy.
y_train_score = clf.predict_proba(X_train_pp)[:,1]
y_test_score = clf.predict_proba(X_test_pp)[:,1]

accuracy_train = round(accuracy_score(y_train, y_train_pred),3)
precision_train = round(precision_score(y_train, y_train_pred),3)
recall_train = round(recall_score(y_train, y_train_pred),3)
f1_score_train = round(f1_score(y_train, y_train_pred),3)
roc_auc_train = round(roc_auc_score(y_train, y_train_score),3)
# Share of observations predicted as positive, in percent
pct_positive_train = round((sum(y_train_pred)/len(y_train_pred))*100,3)

accuracy_test = round(accuracy_score(y_test, y_test_pred),3)
precision_test = round(precision_score(y_test, y_test_pred),3)
recall_test = round(recall_score(y_test, y_test_pred),3)
f1_score_test = round(f1_score(y_test, y_test_pred),3)
roc_auc_test = round(roc_auc_score(y_test, y_test_score),3)
pct_positive_test = round((sum(y_test_pred)/len(y_test_pred))*100,3)

# Store the metrics under the model's name
perform_dict['Stacking Classifier'] = {'Training':{'Accuracy':accuracy_train,
                                                   'Recall':recall_train,
                                                   'Precision':precision_train,
                                                   'F1-Score':f1_score_train,
                                                   'ROC AUC':roc_auc_train,
                                                   'Pct. Positive':pct_positive_train},
                                       'Test':{'Accuracy':accuracy_test,
                                               'Recall':recall_test,
                                               'Precision':precision_test,
                                               'F1-Score':f1_score_test,
                                               'ROC AUC':roc_auc_test,
                                               'Pct. Positive':pct_positive_test},
                                       'Model':{'Prediction Time':prediction_time,
                                                'Model Size':model_size}}

print(f'Model Prediction Time: {prediction_time} Seconds')
if model_size > 1:
  print(f'Model Size: {round(model_size,3)} MB')
else:
  print(f'Model Size: {round(model_size*1024)} KB')

print(f'We are classifying {pct_positive_test}% of the test observations as positive.\n')

print(f'Test Accuracy: {accuracy_test}, Test F1 Score: {f1_score_test}')
print(f'Test Precision: {precision_test}, Test Recall: {recall_test}')
print(f'Test Area Under the ROC Curve: {roc_auc_test}\n')

print(f'We are classifying {pct_positive_train}% of the training observations as positive.\n')

print(f'Train Accuracy: {accuracy_train}, Train F1 Score: {f1_score_train}')
print(f'Train Precision: {precision_train}, Train Recall: {recall_train}')

print(f'Train Area Under the ROC Curve: {roc_auc_train}')

## What About Neural Networks?
The final competitor I have considered is a 4-layer neural network with ReLU activation functions and dropout applied after each hidden layer.

In [None]:
# Import torch, torchvision libraries
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Import numpy for some computation
import numpy as np

# Import matplotlib for plotting
import matplotlib.pyplot as plt
 
# Run on the first CUDA GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
import math

def init_weights(net):
    """
    Re-initialize, in place, the parameters of every module inside ``net``.

    - ``nn.Conv2d``: Kaiming-normal weights.
    - ``nn.BatchNorm2d``: weight set to ones, bias set to zeros.
    - ``nn.Linear``: Xavier-uniform weights; the bias (when present) is drawn
      uniformly from ``[-1/sqrt(k), 1/sqrt(k)]`` where ``k`` is the size of
      the weight's second dimension.

    Any other module type is left untouched. Nothing is returned.

    Usage: net = Model()
           init_weights(net)
    """
    for layer in net.modules():
        if isinstance(layer, nn.Conv2d):
            nn.init.kaiming_normal_(layer.weight)
            continue

        if isinstance(layer, nn.BatchNorm2d):
            nn.init.ones_(layer.weight)
            nn.init.zeros_(layer.bias)
            continue

        if not isinstance(layer, nn.Linear):
            continue

        nn.init.xavier_uniform_(layer.weight)
        if layer.bias is None:
            continue
        # Bias range shrinks with the width of the layer's input
        bound = 1. / math.sqrt(layer.weight.size(1))
        nn.init.uniform_(layer.bias, -bound, bound)


In [None]:
class Net(nn.Module):
    """
    Fully connected classifier with two output classes.

    Three hidden linear layers (input_size -> input_size -> 50 -> 25), each
    followed by ReLU and dropout (p=0.5), then a 2-unit linear layer whose
    output is passed through a softmax over dimension 1.
    """

    def __init__(self, input_size):
        super().__init__()
        # Layers are declared in the order they are applied in forward()
        self.fc1 = nn.Linear(in_features=input_size, out_features=input_size)
        self.fc2 = nn.Linear(in_features=input_size, out_features=50)
        self.fc3 = nn.Linear(in_features=50, out_features=25)
        self.fc4 = nn.Linear(in_features=25, out_features=2)
        self.soft = nn.Softmax(dim=1)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        """
        Return the softmax-normalized class scores for ``x``.

        ``x`` is expected to be a float batch of shape (batch, input_size).
        """
        for hidden in (self.fc1, self.fc2, self.fc3):
            x = self.dropout(F.relu(hidden(x)))
        return self.soft(self.fc4(x))

    def num_flat_features(self, x):
        """Return the product of all dimensions of ``x`` except the first (batch) one."""
        count = 1
        for dim in x.shape[1:]:
            count *= dim
        return count


# The network's input width is the size of X_train_pp's second dimension
model = Net(X_train_pp.shape[1])

In [None]:
from torch.utils.data import TensorDataset, DataLoader

# Convert the preprocessed feature matrices to torch tensors
tensor_x_train = torch.Tensor(np.array(X_train_pp))
tensor_x_test = torch.Tensor(np.array(X_test_pp))

# Labels are converted the same way and then cast to 64-bit integers
tensor_y_train = torch.Tensor(np.array(y_train)).long()
tensor_y_test = torch.Tensor(np.array(y_test)).long()

# Pair features with labels so a DataLoader can batch them together
my_train_dataset = TensorDataset(tensor_x_train, tensor_y_train)
my_test_dataset = TensorDataset(tensor_x_test, tensor_y_test)

In [None]:
def load_data(data, batch_size=600):
    """Wrap ``data`` in a DataLoader that yields shuffled batches of ``batch_size`` items."""
    loader_options = {"batch_size": batch_size, "shuffle": True}
    return DataLoader(data, **loader_options)

In [None]:
import torch.optim as optim

def train(args, model, data):
    """
    @Brief: train ``model`` in place on ``data`` with mini-batch SGD.
        - Re-initialize the weights with ``init_weights``.
        - Move the model to the first CUDA GPU when available (CPU otherwise)
          and switch it to training mode.
        - For ``args.epochs`` epochs, iterate over the shuffled mini-batches
          produced by ``load_data``: forward pass, weighted cross-entropy
          loss, backward pass, optimizer step.
        - Print the mean loss of the last 50 mini-batches every 50 batches.
        No validation pass and no checkpointing are performed.

    @Inputs:
        args: object exposing ``learning_rate``, ``momentum``, ``epochs`` and
            ``weights`` (per-class loss weights tensor, or None). A
            ``momentum`` of None is treated as 0, i.e. plain SGD.
        model: torch module mapping a float batch to one score per class.
        data: dataset of (inputs, labels) pairs, as accepted by ``load_data``.
    @Output:
        No return value. ``model`` is updated in place and left in training
        mode; call ``model.eval()`` before predicting.
    """

    init_weights(model)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    # Dropout must be active while fitting, even if the caller left the model
    # in evaluation mode (e.g. when this cell is re-run after model.eval()).
    model.train()

    # Initialize data loaders
    trainloader = load_data(data)

    # The class weights have to live on the same device as the model outputs
    class_weights = args.weights
    if class_weights is not None:
        class_weights = class_weights.to(device)

    # NOTE(review): CrossEntropyLoss expects unnormalized logits, while the Net
    # defined in this notebook already applies a softmax in forward(), so the
    # loss effectively sees a second softmax. Left unchanged because the
    # thresholding code relies on probability outputs -- confirm the intent.
    criterion = torch.nn.CrossEntropyLoss(reduction='mean', weight=class_weights)

    # optim.SGD rejects momentum=None, so fall back to plain SGD in that case
    momentum = 0.0 if args.momentum is None else args.momentum
    optimizer = optim.SGD(model.parameters(), lr=args.learning_rate, momentum=momentum)

    report_every = 50  # number of mini-batches between two loss printouts

    for epoch in range(args.epochs):  # loop over the dataset multiple times

        running_loss = 0.0
        for i, batch in enumerate(trainloader):
            # each batch is a pair [inputs, labels]
            inputs, labels = batch
            inputs = inputs.to(device).float()
            labels = labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % report_every == report_every - 1:
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / report_every))
                running_loss = 0.0

    print('Finished Training')


In [None]:
class Args:
    """
    Plain container for the training hyper-parameters read by ``train``.

    learning_rate and momentum are handed to the SGD optimizer, epochs is the
    number of passes over the training data, and weights is the per-class
    weight passed to the cross-entropy loss (None for unweighted).
    """

    def __init__(self, learning_rate=0.01, epochs=500, momentum=None, weights=None):
        settings = (
            ('learning_rate', learning_rate),
            ('momentum', momentum),
            ('epochs', epochs),
            ('weights', weights),
        )
        for name, value in settings:
            setattr(self, name, value)

# Each class is weighted by the inverse of its share of the training labels,
# so the less frequent class weighs more in the loss

prop_positive = sum(y_train)/len(y_train)
weights = torch.tensor(
    [1/(1-prop_positive), 1/prop_positive],
    device=device,
)
args = Args(weights=weights, momentum=0.9, epochs=500, learning_rate=0.015)

In [None]:
# Fit the network on the training dataset, then switch it to evaluation mode
# so that dropout is no longer applied when predicting
train(args, model, my_train_dataset)
model.eval()

In [None]:
# NOTE(review): the decision threshold is chosen using the test split
# (tensor_x_test / y_test) -- confirm this is intended, since tuning on the
# test split can make the reported test metrics optimistic.
threshold = find_treshold(model,tensor_x_test,y_test,is_nn=True)
start_time = time.time()
# Inference only: no gradients are needed, so skip autograd bookkeeping
with torch.no_grad():
    test_scores = model(tensor_x_test.to(device))
    train_scores = model(tensor_x_train.to(device))
# Column 0 holds the class-0 score: predict 0 when it exceeds the threshold, 1 otherwise
y_test_pred = torch.where(test_scores[:,0] > threshold, 0, 1).cpu().numpy()
# Must be named y_train_pred: the metrics cell reads y_train_pred, and the
# previous name (y_train_test) left it holding the preceding model's predictions.
y_train_pred = torch.where(train_scores[:,0] > threshold, 0, 1).cpu().numpy()
prediction_time = round(time.time() - start_time,3)

In [None]:
from torchsummary import summary
# Take the input width from the model's first layer instead of hard-coding the
# feature count, so the summary keeps working when the preprocessing changes
summary(model,input_size=(1, model.fc1.in_features))

In [None]:
# Size of the pickled model, expressed in MB
p = pickle.dumps(model)
model_size = sys.getsizeof(p) / (1024 ** 2)

# Training-split metrics, each rounded to three decimals
accuracy_train, precision_train, recall_train, f1_score_train, roc_auc_train = (
    round(score_fn(y_train, y_train_pred), 3)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score)
)
pct_positive_train = round((sum(y_train_pred) / len(y_train_pred)) * 100, 3)

# Test-split metrics, computed the same way
accuracy_test, precision_test, recall_test, f1_score_test, roc_auc_test = (
    round(score_fn(y_test, y_test_pred), 3)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score)
)
pct_positive_test = round((sum(y_test_pred) / len(y_test_pred)) * 100, 3)

# Store everything under the network's own key for the final comparison
perform_dict['Neural Network'] = {
    'Training': {
        'Accuracy': accuracy_train,
        'Recall': recall_train,
        'Precision': precision_train,
        'F1-Score': f1_score_train,
        'ROC AUC': roc_auc_train,
        'Pct. Positive': pct_positive_train,
    },
    'Test': {
        'Accuracy': accuracy_test,
        'Recall': recall_test,
        'Precision': precision_test,
        'F1-Score': f1_score_test,
        'ROC AUC': roc_auc_test,
        'Pct. Positive': pct_positive_test,
    },
    'Model': {
        'Prediction Time': prediction_time,
        'Model Size': model_size,
    },
}

# Model footprint: model_size is in MB; values up to 1 MB are printed in KB.
print(f'Model Prediction Time: {prediction_time} Seconds')
print(f'Model Size: {round(model_size,3)} MB' if model_size > 1 else f'Model Size: {round(model_size*1024)} KB')

# Test-split report (one line per message, same layout as separate prints).
print(
    f'We are classifying {pct_positive_test}% of the test observations as positive.\n',
    f'Test Accuracy: {accuracy_test}, Test F1 Score: {f1_score_test}',
    f'Test Precision: {precision_test}, Test Recall: {recall_test}',
    f'Test Area Under the ROC Curve: {roc_auc_test}\n',
    sep='\n',
)

# Training-split report.
print(
    f'We are classifying {pct_positive_train}% of the training observations as positive.\n',
    f'Train Accuracy: {accuracy_train}, Train F1 Score: {f1_score_train}',
    f'Train Precision: {precision_train}, Train Recall: {recall_train}',
    f'Train Area Under the ROC Curve: {roc_auc_train}',
    sep='\n',
)

Finally, we write the performance dictionary to a JSON file.

In [None]:
import json

# Serialize the collected metrics and save them in the working directory
with open('performance.json', mode='w') as fp:
    fp.write(json.dumps(perform_dict))