In [1]:
import cf1 as cf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
%matplotlib inline
import plotly
import plotly.figure_factory as ff
from plotly.offline import *
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn import metrics
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_regression,mutual_info_regression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import confusion_matrix


pd.set_option('display.max_columns', 300)

In [6]:
df = cf.import_it('../data-files/HSLS/hsls_17_student_pets_sr_v1_0.csv')
df = cf.clean_it(df)

In [7]:
df = df[(df.target == 1) | (df.target == 0)]

In [71]:
#create df for subgroups

female = df[df.female == 1]
black = df[df.black == 1]
hispanic = df[df.hispanic == 1]

#select features for modeling
features = ['S2SCLUB', 'S2SCOMPETE', 'S2SSUMMERPRG', 'S2SGROUP', 'S2STUTORED', 'S2SPERSON1', 'S2SPERSON2', 
               'S2SLEARN', 'S2SBORN', 'S2SUSELIFE', 'S2SUSECLG', 'S2SUSEJOB', 'S2SENJOYING', 'S2SWASTE', 'S2SBORING', 
               'S2STESTS', 'S2STEXTBOOK', 'S2SSKILLS', 'S2STCHTREAT', 'S2STCHINTRST', 
               'S2STCHEASY', 'S2STCHTHINK', 'S2STCHGIVEUP',
               'S2SASSEXCL', 'S2SENJOYS', 'S2SCHALLENGE', 'S2SHSREQ', 'S2SCLGADM', 
               'S2SCLGSUCC', 'S2SCAREER', 'S2SCNSLREC', 'S2STCHRREC', 'S2SPARREC', 'S2SFAMREC', 
               'S2SEMPREC', 'S2SFRIEND', 'S2SDOWELL', 'S2SASSIGNED', 'X2STU30OCC_STEM1', 'took_science_2012',  
            'bio', 'chem', 'enviro', 'physics', 'engineering', 'compsci', 'misc_class', 'public', 'black', 'hispanic', 'asian', 'ai_an', 'multiple_race', 'nh_pi' , 'X2POVERTY185', 'underrep']

In [12]:
female.shape

(6009, 110)

In [13]:
black.shape

(1004, 110)

In [14]:
hispanic.shape

(1486, 110)

### Female Students

In [72]:
#train/test split
X = female.drop(columns = 'target', axis =1)
X = X[features]
y = female['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 20)

In [73]:
#run baseline
lr_female_base = LogisticRegression(max_iter = 10000, random_state = 20)

lr_female_base.fit(X_train, y_train)

trainpred_lr_female_base = lr_female_base.predict(X_train)
testpred_lr_female_base = lr_female_base.predict(X_test)


In [74]:
female_dict = {}
female_dict['Initial_LogisticRegression'] = {'train_accuracy': metrics.accuracy_score(y_train, trainpred_lr_female_base),
                                      'test_accuracy': metrics.accuracy_score(y_test, testpred_lr_female_base),
                                      'train_precision':metrics.precision_score(y_train, trainpred_lr_female_base),
                                      'test_precision':metrics.precision_score(y_test, testpred_lr_female_base),
                                      'train_recall':metrics.recall_score(y_train, trainpred_lr_female_base),
                                      'test_recall':metrics.recall_score(y_test, testpred_lr_female_base),
                                      'train_f1':metrics.f1_score(y_train, trainpred_lr_female_base),
                                      'test_f1':metrics.f1_score(y_test, testpred_lr_female_base) 
                                          }

female_dict['Initial_LogisticRegression']

{'train_accuracy': 0.8508425213230705,
 'test_accuracy': 0.8469217970049917,
 'train_precision': 0.6132075471698113,
 'test_precision': 0.44680851063829785,
 'train_recall': 0.16993464052287582,
 'test_recall': 0.11731843575418995,
 'train_f1': 0.26612077789150457,
 'test_f1': 0.18584070796460175}

In [75]:
##run gridsearch
#instantiate classifier
clf = LogisticRegression()

#select hyperparameter values to test
param_grid = {
    
    'max_iter': [125, 130, 135, 150],
    'C': [0.65, 0.66, 0.67, 0.68, 0.69, 0.7, 0.71, 0.72, 0.73], 
    'class_weight': [None, 'balanced']
}

#run gridsearch with hyperparamters
gs_LR = GridSearchCV(clf, param_grid, cv=5, scoring = 'f1', n_jobs = -1, verbose = 2)
#fit the model to the train set
gs_LR.fit(X_train, y_train)
#get the best parameters
gs_LR.best_params_

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   24.0s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:   47.4s finished


{'C': 0.69, 'class_weight': 'balanced', 'max_iter': 125}

In [76]:
lr_female_gs = LogisticRegression(C = 0.66, class_weight = 'balanced', max_iter = 125, random_state = 20)

lr_female_gs.fit(X_train, y_train)

trainpred_lr_female_gs = lr_female_gs.predict(X_train)
testpred_lr_female_gs = lr_female_gs.predict(X_test)

In [77]:
female_dict['GS_LogisticRegression'] = {'train_accuracy': metrics.accuracy_score(y_train, trainpred_lr_female_gs),
                                      'test_accuracy': metrics.accuracy_score(y_test, testpred_lr_female_gs),
                                      'train_precision':metrics.precision_score(y_train, trainpred_lr_female_gs),
                                      'test_precision':metrics.precision_score(y_test, testpred_lr_female_gs),
                                      'train_recall':metrics.recall_score(y_train, trainpred_lr_female_gs),
                                      'test_recall':metrics.recall_score(y_test, testpred_lr_female_gs),
                                      'train_f1':metrics.f1_score(y_train, trainpred_lr_female_gs),
                                      'test_f1':metrics.f1_score(y_test, testpred_lr_female_gs) 
                                          }

female_dict['GS_LogisticRegression']

{'train_accuracy': 0.7079259413355523,
 'test_accuracy': 0.6788685524126455,
 'train_precision': 0.3117265763111373,
 'test_precision': 0.26096997690531176,
 'train_recall': 0.6915032679738562,
 'test_recall': 0.6312849162011173,
 'train_f1': 0.42973192526401294,
 'test_f1': 0.369281045751634}

In [79]:
coefs = dict(zip(list(features), list(lr_female_gs.coef_[0])))
sorted_coef = sorted(coefs.items(), key=lambda kv: kv[1])
sorted_coef.reverse()
#coef_df = pd.DataFrame(data = sorted_dict, columns = ['feature', 'coefficient_value'])
#coef_df
sorted_coef

[('engineering', 1.0921259279641178),
 ('asian', 1.0350069495321625),
 ('physics', 0.7477033939041822),
 ('S2SCAREER', 0.4880740039669265),
 ('S2SCLUB', 0.48748480013492546),
 ('compsci', 0.44757604933703893),
 ('S2SPERSON1', 0.43722716575541737),
 ('S2SSUMMERPRG', 0.40270322618349214),
 ('S2SENJOYS', 0.3425651970380277),
 ('X2STU30OCC_STEM1', 0.3375336927920853),
 ('multiple_race', 0.3099931931337489),
 ('S2SPERSON2', 0.30600932298123906),
 ('chem', 0.2965243159492922),
 ('S2STEXTBOOK', 0.2944854104725567),
 ('nh_pi', 0.2383664076102029),
 ('S2STCHGIVEUP', 0.23108574880541433),
 ('ai_an', 0.20943622655215677),
 ('hispanic', 0.1984329615299881),
 ('S2SCOMPETE', 0.18978185383243323),
 ('S2SDOWELL', 0.17892702903591928),
 ('S2SUSEJOB', 0.15970566991865495),
 ('S2SFAMREC', 0.14455111631003317),
 ('S2SPARREC', 0.10411029282559807),
 ('black', 0.08988487710295782),
 ('S2STCHEASY', 0.08632086305103095),
 ('enviro', 0.07812139953453993),
 ('S2STESTS', 0.06274155292941234),
 ('bio', 0.05961097

In [80]:
female.S2SEMPREC.value_counts()

0    5865
1     144
Name: S2SEMPREC, dtype: int64

#### Decision Tree

In [52]:
#instantiate classifier
dtc_gs = DecisionTreeClassifier(random_state = 20)

#create parameters for the gridsearch to search through
param_dict={'max_depth': range(3,8),
            'criterion': ['gini', 'entropy'],
            'splitter': ['random', 'best'],
            'max_features': ['auto', 'sqrt', 'log2']}

#create the gridsearch decision tree and fit it to the data to determine which will produce the best f1 score
grid_tree=GridSearchCV(dtc_gs, 
                       param_dict, 
                       cv=10, 
                       scoring='f1', 
                       verbose=1, 
                       n_jobs=-1)

grid_tree.fit(X_train,y_train)



Fitting 10 folds for each of 60 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:   16.5s finished


GridSearchCV(cv=10, estimator=DecisionTreeClassifier(random_state=20),
             n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': range(3, 8),
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'splitter': ['random', 'best']},
             scoring='f1', verbose=1)

In [56]:
grid_tree.best_estimator_

DecisionTreeClassifier(criterion='entropy', max_depth=6, max_features='auto',
                       random_state=20)

In [57]:
pred_dtcgs_test = grid_tree.best_estimator_.predict(X_test)
pred_dtcgs_train = grid_tree.best_estimator_.predict(X_train)

In [58]:
female_dict['GridSearch_DecisionTree'] = {'train_accuracy': metrics.accuracy_score(y_train, pred_dtcgs_train),
                                  'test_accuracy': metrics.accuracy_score(y_test, pred_dtcgs_test),
                                  'train_precision':metrics.precision_score(y_train, pred_dtcgs_train),
                                  'test_precision':metrics.precision_score(y_test, pred_dtcgs_test),
                                  'train_recall':metrics.recall_score(y_train, pred_dtcgs_train),
                                  'test_recall':metrics.recall_score(y_test, pred_dtcgs_test),
                                  'train_f1':metrics.f1_score(y_train, pred_dtcgs_train),
                                  'test_f1':metrics.f1_score(y_test, pred_dtcgs_test)}
female_dict['GridSearch_DecisionTree']

{'train_accuracy': 0.848346161847306,
 'test_accuracy': 0.8502495840266223,
 'train_precision': 0.6216216216216216,
 'test_precision': 0.4838709677419355,
 'train_recall': 0.12026143790849673,
 'test_recall': 0.08379888268156424,
 'train_f1': 0.2015334063526835,
 'test_f1': 0.14285714285714288}

In [136]:
from sklearn.ensemble import BaggingClassifier
ds = DecisionTreeClassifier(criterion='entropy',max_depth=6)
bag = BaggingClassifier(base_estimator = LogisticRegression(C = 0.66, class_weight = 'balanced', max_iter = 125, random_state = 20), max_samples=1.0,bootstrap=True, n_estimators = 5)
bag.fit(X_train, y_train)

BaggingClassifier(base_estimator=LogisticRegression(C=0.66,
                                                    class_weight='balanced',
                                                    max_iter=125,
                                                    random_state=20),
                  n_estimators=5)

In [137]:
pred_train = bag.predict(X_train)
pred_test = bag.predict(X_test)

In [138]:
metrics.f1_score(y_train, pred_train)

0.43424317617866004

In [139]:
metrics.f1_score(y_test, pred_test)

0.3733333333333333