In [45]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
%matplotlib inline
import plotly
import plotly.figure_factory as ff
from plotly.offline import *
from sklearn import metrics
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif, f_regression, mutual_info_regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, make_scorer
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier


# Let wide DataFrames render up to 300 columns before pandas truncates the display.
pd.options.display.max_columns = 300

In [46]:
# Column groups to read from the raw survey file.

# Demographic / school-context columns.
demo_features = ['X2SEX', 'X2RACE', 'X2DUALLANG', 'X2POVERTY185', 'X2SESQ5_U', 'X2CONTROL', 'X2LOCALE', 'X2REGION']


# Student survey items plus the two X4RFDGMJ* columns (X4RFDGMJSTEM is later renamed to `target`).
# NOTE: 'S2SUSELIFE', 'S2SUSECLG', 'S2SUSEJOB' and 'S2APSCIENCE' are listed twice below; the list
# itself is left untouched and the repeats are removed when cols_list is built.
mvp_features = ['X2STU30OCC_STEM1', 'X2STUEDEXPCT', 'X2S2SSPR12', 'S2SPERSON1', 'S2SPERSON2', 
               'S2SLEARN', 'S2SBORN', 'S2SUSELIFE', 'S2SUSECLG', 'S2SUSEJOB', 
               'S2SSPR12', 'S2LIFES12', 'S2BIO1S12', 'S2BIO2S12', 'S2APBIOS12', 
               'S2IBIOS12', 'S2ANATOMYS12', 'S2OTHBIOS12', 'S2CHEM1S12', 'S2CHEM2S12', 'S2APCHEM12', 
               'S2IBCHEM12', 'S2EARTHS12', 'S2APENVS12', 'S2OTHENVS12', 'S2PHYSIC1S12', 
               'S2PHYSIC2S12', 'S2APPHYSIC12', 'S2IBPHYSIC12', 'S2PHYSS12', 'S2TECHS12', 'S2OTHPHYS12', 
               'S2INTGS1S12', 'S2INTGS2S12', 'S2GENS12', 'S2COMPAPP12', 'S2COMPPROG12', 
               'S2APCOMPSCI12', 'S2IBTECH12', 'S2OTHCOMP12', 'S2ENGINEER12', 'S2OTHS12', 'S2OTHS12SP', 
               'S2HISCIENCE12', 'S2APSCIENCE', 'S2IBSCIENCE', 'S2STOOKBEFORE', 'S2SENJOYS', 'S2SCHALLENGE', 'S2SHSREQ', 'S2SCLGADM', 
               'S2SCLGSUCC', 'S2SCAREER', 'S2SCNSLREC', 'S2STCHRREC', 'S2SPARREC', 'S2SFAMREC', 
               'S2SEMPREC', 'S2SFRIEND', 'S2SDOWELL', 'S2SASSIGNED', 'S2STCHTREAT', 'S2STCHINTRST', 
               'S2STCHEASY', 'S2STCHTHINK', 'S2STCHGIVEUP', 'S2SENJOYING', 'S2SWASTE', 'S2SBORING', 
               'S2SUSELIFE', 'S2SUSECLG', 'S2SUSEJOB', 'S2STESTS', 'S2STEXTBOOK', 'S2SSKILLS', 
               'S2SASSEXCL', 'S2APSCIENCE', 'S2HSPLAN', 'S2SUBMITPLAN', 'S2SCLUB', 
            'S2SCOMPETE', 'S2SSUMMERPRG', 'S2SGROUP', 'S2STUTORED', 'X4RFDGMJ123', 'X4RFDGMJSTEM']

# Parent / family background columns.
family_features = ['X2PAR1EDU', 'X2PAR1OCC_STEM1', 
            'X2PAR1RACE', 'X2PAR2EDU', 'X2PAR2OCC_STEM1', 'X2PAR2RACE', 'X2PARPATTERN', 
            'X2MOMEDU', 'X2MOMOCC_STEM1', 'X2MOMRACE', 'X2DADEDU', 'X2DADOCC_STEM1', 
            'X2DADRACE']

# Every column to load, in first-seen order with repeats removed
# (dict.fromkeys keeps insertion order and drops later duplicates).
cols_list = list(dict.fromkeys(demo_features + mvp_features + family_features))



In [47]:
# Read only the columns named in cols_list and rename X4RFDGMJSTEM to `target`.
df = pd.read_csv(
    '../data-files/HSLS/hsls_17_student_pets_sr_v1_0.csv',
    usecols=cols_list,
).rename(columns={'X4RFDGMJSTEM': 'target'})

In [48]:
#drop rows with non-response to S2SLEARN (and many other features) (likely dropped from study)
# .copy() gives df its own data, so the column assignments below write to df itself
# instead of to a slice of the unfiltered frame (which raises SettingWithCopyWarning).
df = df[df['S2SLEARN'] != -8].copy()

#create dummy variables for races
# Each dummy is 1 when X2RACE is one of the listed codes, otherwise 0.
# Codes 4 and 5 are both folded into `hispanic`.
race_dummy_codes = {
    'ai_an': [1],
    'asian': [2],
    'black': [3],
    'hispanic': [4, 5],
    'multiple_race': [6],
    'nh_pi': [7],
    'white': [8],
}
for dummy_name, race_codes in race_dummy_codes.items():
    df[dummy_name] = np.where(df['X2RACE'].isin(race_codes), 1, 0)

# 1 when X2SEX == 2, otherwise 0.
df['female'] = np.where(df['X2SEX'] == 2, 1, 0)

In [49]:
# Other demographic columns available: 'X2DUALLANG', 'X2POVERTY185', 'X2SESQ5_U', 'X2CONTROL', 'X2LOCALE', 'X2REGION'

# Public/private school indicators derived from X2CONTROL:
# `private` is 1 where the code is 2, `public` is 1 where the code is 1; every other value gives 0.
df['private'] = df['X2CONTROL'].eq(2).astype('int64')
df['public'] = df['X2CONTROL'].eq(1).astype('int64')

In [50]:
#compile all subchoices of STEM domains into yes/no
# Assign the recoded column back instead of calling replace(inplace=True) on `df.<column>`:
# the in-place form mutates an intermediate Series and is not guaranteed to update df
# (it does not under pandas Copy-on-Write).
# NOTE(review): any code not listed in the mapping (e.g. 2 or 3, if present) is left unchanged,
# so the column may not be strictly 0/1 afterwards -- confirm against the codebook.
df['X2STU30OCC_STEM1'] = df['X2STU30OCC_STEM1'].replace({-9: 0, 9: 0, 4: 1, 5: 1, 6: 1})


In [51]:
# Science-course indicator columns that get the same missing-value treatment.
classes = ['S2SSPR12', 'S2LIFES12', 'S2BIO1S12', 'S2BIO2S12', 'S2APBIOS12', 
               'S2IBIOS12', 'S2ANATOMYS12', 'S2OTHBIOS12', 'S2CHEM1S12', 'S2CHEM2S12', 'S2APCHEM12', 
               'S2IBCHEM12', 'S2EARTHS12', 'S2APENVS12', 'S2OTHENVS12', 'S2PHYSIC1S12', 
               'S2PHYSIC2S12', 'S2APPHYSIC12', 'S2IBPHYSIC12', 'S2PHYSS12', 'S2TECHS12', 'S2OTHPHYS12', 
               'S2INTGS1S12', 'S2INTGS2S12', 'S2GENS12', 'S2COMPAPP12', 'S2COMPPROG12', 
               'S2APCOMPSCI12', 'S2IBTECH12', 'S2OTHCOMP12', 'S2ENGINEER12', 'S2OTHS12', 'S2APSCIENCE', 'S2IBSCIENCE']

#impute 'no' for items that are missing or were skipped due to not taking a science class
# Codes -9 and -7 become 0. The result is assigned back to df rather than using
# df[col].replace(..., inplace=True), which mutates an intermediate Series and is not
# guaranteed to update df (it does not under pandas Copy-on-Write).
df[classes] = df[classes].replace({-9: 0, -7: 0})


In [52]:
# Check the recode above: the counts should list only the values 0 and 1.
df.S2SSPR12.value_counts()

1    16192
0     4402
Name: S2SSPR12, dtype: int64

In [53]:
#impute unknown with 'no' for if participating in science activity

clubs_cols = ['S2SCLUB', 'S2SCOMPETE', 'S2SSUMMERPRG', 'S2SGROUP', 'S2STUTORED']

# Code -9 becomes 0. Assigning the result back (instead of replace(inplace=True) on df[col])
# guarantees df itself is updated, including under pandas Copy-on-Write.
df[clubs_cols] = df[clubs_cols].replace({-9: 0})

In [54]:
# Reduce both plan items to binary flags: 1 where the response equals 1, 0 for anything else.
for plan_col in ('S2HSPLAN', 'S2SUBMITPLAN'):
    df[plan_col] = np.where(df[plan_col] == 1, 1, 0)

In [55]:
# Flag students counted as underrepresented in STEM: X2SEX == 2, or any of the
# ai_an / black / hispanic / multiple_race / nh_pi dummies set to 1.
underrep_race_cols = ['ai_an', 'black', 'hispanic', 'multiple_race', 'nh_pi']
is_underrep = df['X2SEX'].eq(2) | df[underrep_race_cols].eq(1).any(axis=1)
df['underrep'] = np.where(is_underrep, 1, 0)
                         

In [56]:
#group HS science classes into broader subjects

# Subject flag -> the course columns that feed it. A subject flag is 1 when at least
# one of its course columns equals 1, otherwise 0.
# NOTE(review): S2TECHS12, S2INTGS2S12 and S2OTHS12 are recoded together with the other
# course columns earlier in the notebook but are not assigned to any subject here --
# confirm that is intentional.
subject_groups = {
    'bio': ['S2LIFES12', 'S2BIO1S12', 'S2BIO2S12', 'S2APBIOS12',
            'S2IBIOS12', 'S2ANATOMYS12', 'S2OTHBIOS12'],
    'chem': ['S2CHEM1S12', 'S2CHEM2S12', 'S2APCHEM12', 'S2IBCHEM12'],
    'enviro': ['S2EARTHS12', 'S2APENVS12', 'S2OTHENVS12'],
    'physics': ['S2PHYSIC1S12', 'S2PHYSIC2S12', 'S2APPHYSIC12',
                'S2IBPHYSIC12', 'S2PHYSS12'],
    'engineering': ['S2ENGINEER12'],
    'compsci': ['S2COMPAPP12', 'S2COMPPROG12', 'S2APCOMPSCI12',
                'S2IBTECH12', 'S2OTHCOMP12'],
    'misc_class': ['S2OTHPHYS12', 'S2INTGS1S12', 'S2GENS12'],
}

for subject, course_cols in subject_groups.items():
    took_any = df[course_cols].eq(1).any(axis=1)
    df[subject] = np.where(took_any, 1, 0)


In [57]:
# took_science_2012 is 1 when S2STOOKBEFORE is 1 or any of the subject flags is 1, else 0.
science_evidence_cols = ['S2STOOKBEFORE', 'bio', 'chem', 'enviro',
                         'physics', 'engineering', 'compsci', 'misc_class']
any_science = df[science_evidence_cols].eq(1).any(axis=1)
df['took_science_2012'] = np.where(any_science, 1, 0)


In [58]:
#impute 'no' for items that are missing or were skipped due to not taking a science class

why_science = ['S2SENJOYS', 'S2SCHALLENGE', 'S2SHSREQ', 'S2SCLGADM', 
               'S2SCLGSUCC', 'S2SCAREER', 'S2SCNSLREC', 'S2STCHRREC', 'S2SPARREC', 'S2SFAMREC', 
               'S2SEMPREC', 'S2SFRIEND', 'S2SDOWELL', 'S2SASSIGNED']

# Codes -9 and -7 become 0. The recoded columns are assigned back to df; calling
# replace(inplace=True) on df[col] mutates an intermediate Series and is not guaranteed
# to update df (it does not under pandas Copy-on-Write).
df[why_science] = df[why_science].replace({-9: 0, -7: 0})

In [59]:
# Recode both parent occupation columns: -7 -> 0 and 4/5/6 -> 1. Codes 0 and 1 already
# have their final value (the original mapping listed them as no-ops), and any other code
# is left unchanged.
# The result is assigned back to df instead of using replace(inplace=True) on `df.<column>`,
# which is not guaranteed to update df under pandas Copy-on-Write.
# (A bare value_counts() call that used to open this cell was removed: it was not the
# cell's last expression, so its result was never displayed.)
parent_occ_cols = ['X2PAR1OCC_STEM1', 'X2PAR2OCC_STEM1']
df[parent_occ_cols] = df[parent_occ_cols].replace({-7: 0, 4: 1, 5: 1, 6: 1})

# parent_stem is 1 when either parent's recoded occupation column equals 1.
df['parent_stem'] = np.where(df[parent_occ_cols].eq(1).any(axis=1), 1, 0)

In [60]:
# Collapse both parent education columns onto a 0-3 scale:
# 1 and -8 -> 0, 2/3/4 -> 1, 5 -> 2, 6/7 -> 3; codes not listed are left unchanged.
# The recoded columns are assigned back to df instead of using replace(inplace=True) on
# `df.<column>`, which is not guaranteed to update df under pandas Copy-on-Write.
# NOTE: the mapping's keys and values overlap, so this cell is not idempotent --
# running it twice recodes already-recoded values.
parent_edu_cols = ['X2PAR1EDU', 'X2PAR2EDU']
df[parent_edu_cols] = df[parent_edu_cols].replace(
    {1: 0, 2: 1, 3: 1, 4: 1, 5: 2, 6: 3, 7: 3, -8: 0}
)

# One flag per collapsed level; a flag is 1 when either parent is at that level,
# so more than one flag can be set for the same student.
for flag_name, edu_level in (('parent_hs', 1), ('parent_bach', 2), ('parent_adv', 3)):
    df[flag_name] = np.where(df[parent_edu_cols].eq(edu_level).any(axis=1), 1, 0)

In [61]:
#create dataframe with target variable
# Keep only rows whose target is 0 or 1. .copy() makes modeling_df independent of df,
# so later edits to modeling_df neither raise SettingWithCopyWarning nor depend on
# whether pandas returned a view.
modeling_df = df[df['target'].isin([0, 1])].copy()



In [65]:
likert_cols = ['S2SPERSON1', 'S2SPERSON2', 
               'S2SLEARN', 'S2SBORN', 'S2SUSELIFE', 'S2SUSECLG', 'S2SUSEJOB', 
                'S2STCHTREAT', 'S2STCHINTRST', 
               'S2STCHEASY', 'S2STCHTHINK', 'S2STCHGIVEUP', 'S2SENJOYING', 'S2SWASTE', 'S2SBORING', 
               'S2STESTS', 'S2STEXTBOOK', 'S2SSKILLS', 
               'S2SASSEXCL']

#change likert questions to agree/disagree
# 2 -> 1 and 3, 4, -9, -7 -> 0; a response of 1 is already the "agree" value and is kept.
# Take an explicit copy first so the assignment below writes to modeling_df itself and not
# to a slice of df, then assign the recoded columns back. Calling replace(inplace=True) on
# modeling_df[col] mutates an intermediate Series and is not guaranteed to update the frame.
modeling_df = modeling_df.copy()
modeling_df[likert_cols] = modeling_df[likert_cols].replace({2: 1, 3: 0, 4: 0, -9: 0, -7: 0})

## Female Only - LogReg Gridsearch

In [66]:
# Predictor columns, in the order they are fed to the model.
# NOTE(review): X2POVERTY185 is used as loaded, with no recoding in this notebook section --
# confirm it carries no negative missing-value codes.
features = [
    'S2SCLUB', 'S2SCOMPETE', 'S2SSUMMERPRG', 'S2SGROUP', 'S2STUTORED',
    'S2SPERSON1', 'S2SPERSON2',
    'S2SLEARN', 'S2SBORN', 'S2SUSELIFE', 'S2SUSECLG', 'S2SUSEJOB',
    'S2SENJOYING', 'S2SWASTE', 'S2SBORING',
    'S2STESTS', 'S2STEXTBOOK', 'S2SSKILLS',
    'S2STCHTREAT', 'S2STCHINTRST', 'S2STCHEASY', 'S2STCHTHINK', 'S2STCHGIVEUP',
    'S2SASSEXCL',
    'S2SENJOYS', 'S2SCHALLENGE', 'S2SHSREQ', 'S2SCLGADM', 'S2SCLGSUCC', 'S2SCAREER',
    'S2SCNSLREC', 'S2STCHRREC', 'S2SPARREC', 'S2SFAMREC', 'S2SEMPREC', 'S2SFRIEND',
    'S2SDOWELL', 'S2SASSIGNED',
    'X2STU30OCC_STEM1', 'took_science_2012',
    'bio', 'chem', 'enviro', 'physics', 'engineering', 'compsci', 'misc_class',
    'public',
    'black', 'hispanic', 'asian', 'ai_an', 'multiple_race', 'nh_pi',
    'X2POVERTY185',
    'parent_hs', 'parent_bach', 'parent_adv', 'parent_stem',
]

# Restrict to rows flagged female, then split into the predictor matrix and the target.
female_df = modeling_df.loc[modeling_df['female'] == 1]
y = female_df['target']
X = female_df[features]

In [67]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=20,
)

In [68]:
# 5-fold grid search over logistic-regression settings, scored by F1.
clf = LogisticRegression()

param_grid = dict(
    max_iter=[75, 100, 250, 500, 1000],
    C=[0.0001, 0.001, 0.01, 1, 100, 1000, 5000],
    class_weight=[None, 'balanced', {1: 2, 0: 1}, {1: 10, 0: 1}],
)

gs_female = GridSearchCV(
    clf,
    param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,   # use every available core
    verbose=2,
)
gs_female.fit(X_train, y_train)

# Last expression of the cell: show the winning parameter combination.
gs_female.best_params_

Fitting 5 folds for each of 140 candidates, totalling 700 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 378 tasks      | elapsed:   19.7s
[Parallel(n_jobs=-1)]: Done 581 tasks      | elapsed:   35.8s
[Parallel(n_jobs=-1)]: Done 700 out of 700 | elapsed:   45.5s finished


{'C': 1, 'class_weight': 'balanced', 'max_iter': 75}

In [69]:
# Refit a logistic regression with the parameters chosen by the grid search.
# The values come straight from gs_female.best_params_ (C, class_weight, max_iter) rather
# than being typed in by hand, so this cell cannot drift out of sync when the search is re-run.
lr_female = LogisticRegression(random_state=15, **gs_female.best_params_)
lr_female.fit(X_train, y_train)

# Class predictions on both splits, used for the metrics below.
lr_female_train_pred = lr_female.predict(X_train)
lr_female_test_pred = lr_female.predict(X_test)

In [70]:
# Results registry: model label -> {"<split>_<metric>": score}.
metric_dict = {}

female_scorers = {
    'accuracy': metrics.accuracy_score,
    'precision': metrics.precision_score,
    'recall': metrics.recall_score,
    'f1': metrics.f1_score,
}
female_splits = (
    ('train', y_train, lr_female_train_pred),
    ('test', y_test, lr_female_test_pred),
)

# For each metric, record the train score followed by the test score.
metric_dict['Female_LogisticRegression'] = {
    f'{split_name}_{metric_name}': score_fn(y_true, y_pred)
    for metric_name, score_fn in female_scorers.items()
    for split_name, y_true, y_pred in female_splits
}

In [71]:
# Display the metrics recorded so far.
metric_dict

{'Female_LogisticRegression': {'train_accuracy': 0.714582899937591,
  'test_accuracy': 0.6880199667221298,
  'train_precision': 0.31923764145324596,
  'test_precision': 0.2677725118483412,
  'train_recall': 0.7006535947712418,
  'test_recall': 0.6312849162011173,
  'train_f1': 0.43862520458265136,
  'test_f1': 0.3760399334442595}}

In [72]:
# Map each feature name to its fitted coefficient, then rank the pairs from the largest
# coefficient to the smallest. Note that sorted_dict is a list of (feature, coefficient) tuples.
coef_female = {name: weight for name, weight in zip(features, lr_female.coef_[0])}
ascending_pairs = sorted(coef_female.items(), key=lambda pair: pair[1])
sorted_dict = ascending_pairs[::-1]

In [73]:
# Display the (feature, coefficient) pairs, largest coefficient first.
sorted_dict

[('engineering', 1.1124337392923342),
 ('asian', 0.9778224697536554),
 ('physics', 0.7218370178487042),
 ('S2SCAREER', 0.4831400139974168),
 ('S2SCLUB', 0.4714230119362303),
 ('compsci', 0.4418433279506661),
 ('parent_adv', 0.4330536912127937),
 ('S2SPERSON1', 0.4205770609537247),
 ('X2STU30OCC_STEM1', 0.36359975201278927),
 ('S2SENJOYS', 0.3528284637169755),
 ('S2SSUMMERPRG', 0.34640087669882225),
 ('multiple_race', 0.32053263120627573),
 ('parent_bach', 0.3094320866793575),
 ('S2STEXTBOOK', 0.2928893394640343),
 ('S2SPERSON2', 0.29096698730324116),
 ('chem', 0.2850982435586292),
 ('hispanic', 0.26201757238836115),
 ('S2STCHGIVEUP', 0.22505879958342348),
 ('nh_pi', 0.22447163491678776),
 ('ai_an', 0.2019526189922496),
 ('S2SCOMPETE', 0.2006303666742882),
 ('S2SDOWELL', 0.17018162615675814),
 ('S2SFAMREC', 0.16414558168787471),
 ('black', 0.16326100212003727),
 ('S2SUSEJOB', 0.15467376130810812),
 ('public', 0.148474262698256),
 ('S2STCHEASY', 0.10053571946267935),
 ('enviro', 0.087292

### K Best to Address Overfitting

In [91]:
# Keep the 10 features with the highest univariate F-score against the target.
# The target is a 0/1 class label, so the classification scorer (f_classif, ANOVA F-test)
# is used instead of f_regression, which is meant for continuous targets. For a two-class
# target the two F-statistics agree, so the selected columns are expected to be unchanged.
# The selector is fitted on the training split only, so the test split does not influence
# which features are kept.
selector = SelectKBest(f_classif, k = 10)
selector.fit(X_train, y_train)

# Names of the columns the selector kept (displayed as the cell output).
selected_columns = X_train.columns[selector.get_support()]
selected_columns

Index(['S2SCLUB', 'S2SCOMPETE', 'S2SPERSON1', 'S2SPERSON2', 'S2SENJOYS',
       'S2SCHALLENGE', 'S2SCAREER', 'S2SDOWELL', 'X2STU30OCC_STEM1', 'asian'],
      dtype='object')

In [93]:
# Same 5-fold, F1-scored grid search as before, fitted on the selected columns only.
clf = LogisticRegression()

param_grid = {
    'max_iter': [75, 100, 250, 500, 1000],
    'C': [0.0001, 0.001, 0.01, 1, 100, 1000, 5000],
    'class_weight': [None, 'balanced', {1: 2, 0: 1}, {1: 10, 0: 1}],
}

gs_kbfemale = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1,
    verbose=2,
)
X_train_selected = X_train[selected_columns]
gs_kbfemale.fit(X_train_selected, y_train)

# Last expression of the cell: show the winning parameter combination.
gs_kbfemale.best_params_

Fitting 5 folds for each of 140 candidates, totalling 700 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 492 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done 700 out of 700 | elapsed:   14.6s finished


{'C': 1, 'class_weight': 'balanced', 'max_iter': 75}

In [94]:
# Refit on the selected columns with the parameters chosen by the k-best grid search.
# Reading them from gs_kbfemale.best_params_ (C, class_weight, max_iter) keeps this cell
# in sync with the search instead of relying on hand-copied values.
lr_kbest = LogisticRegression(random_state = 15, **gs_kbfemale.best_params_)

lr_kbest.fit(X_train[selected_columns], y_train)

# Class predictions on both splits, restricted to the same selected columns.
kbest_train_pred = lr_kbest.predict(X_train[selected_columns])
kbest_test_pred= lr_kbest.predict(X_test[selected_columns])

In [95]:
# Record accuracy / precision / recall / F1 for the k-best model, train score then test
# score for each metric, under its own label in metric_dict.
kbest_scores = {}
for metric_name, score_fn in (
    ('accuracy', metrics.accuracy_score),
    ('precision', metrics.precision_score),
    ('recall', metrics.recall_score),
    ('f1', metrics.f1_score),
):
    kbest_scores['train_' + metric_name] = score_fn(y_train, kbest_train_pred)
    kbest_scores['test_' + metric_name] = score_fn(y_test, kbest_test_pred)

metric_dict['KBEST_LogisticRegression'] = kbest_scores

In [96]:
# Display the metrics for both models side by side.
metric_dict

{'Female_LogisticRegression': {'train_accuracy': 0.714582899937591,
  'test_accuracy': 0.6880199667221298,
  'train_precision': 0.31923764145324596,
  'test_precision': 0.2677725118483412,
  'train_recall': 0.7006535947712418,
  'test_recall': 0.6312849162011173,
  'train_f1': 0.43862520458265136,
  'test_f1': 0.3760399334442595},
 'KBEST_LogisticRegression': {'train_accuracy': 0.6971083836072395,
  'test_accuracy': 0.6780366056572379,
  'train_precision': 0.30132259919493964,
  'test_precision': 0.2647058823529412,
  'train_recall': 0.6849673202614379,
  'test_recall': 0.6536312849162011,
  'train_f1': 0.4185303514376997,
  'test_f1': 0.3768115942028986}}