In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
%matplotlib inline
import plotly
import plotly.figure_factory as ff
from plotly.offline import *
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn import metrics
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_regression,mutual_info_regression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier



pd.set_option('display.max_columns', 300)

In [7]:
demo_features = ['X2SEX', 'X2RACE', 'X2DUALLANG', 'X2POVERTY185', 'X2SESQ5_U', 'X2CONTROL', 'X2LOCALE', 'X2REGION']


mvp_features = ['X2STU30OCC_STEM1', 'X2STUEDEXPCT', 'X2S2SSPR12', 'S2SPERSON1', 'S2SPERSON2', 
               'S2SLEARN', 'S2SBORN', 'S2SUSELIFE', 'S2SUSECLG', 'S2SUSEJOB', 
               'S2SSPR12', 'S2LIFES12', 'S2BIO1S12', 'S2BIO2S12', 'S2APBIOS12', 
               'S2IBIOS12', 'S2ANATOMYS12', 'S2OTHBIOS12', 'S2CHEM1S12', 'S2CHEM2S12', 'S2APCHEM12', 
               'S2IBCHEM12', 'S2EARTHS12', 'S2APENVS12', 'S2OTHENVS12', 'S2PHYSIC1S12', 
               'S2PHYSIC2S12', 'S2APPHYSIC12', 'S2IBPHYSIC12', 'S2PHYSS12', 'S2TECHS12', 'S2OTHPHYS12', 
               'S2INTGS1S12', 'S2INTGS2S12', 'S2GENS12', 'S2COMPAPP12', 'S2COMPPROG12', 
               'S2APCOMPSCI12', 'S2IBTECH12', 'S2OTHCOMP12', 'S2ENGINEER12', 'S2OTHS12', 'S2OTHS12SP', 
               'S2HISCIENCE12', 'S2APSCIENCE', 'S2IBSCIENCE', 'S2STOOKBEFORE', 'S2SENJOYS', 'S2SCHALLENGE', 'S2SHSREQ', 'S2SCLGADM', 
               'S2SCLGSUCC', 'S2SCAREER', 'S2SCNSLREC', 'S2STCHRREC', 'S2SPARREC', 'S2SFAMREC', 
               'S2SEMPREC', 'S2SFRIEND', 'S2SDOWELL', 'S2SASSIGNED', 'S2STCHTREAT', 'S2STCHINTRST', 
               'S2STCHEASY', 'S2STCHTHINK', 'S2STCHGIVEUP', 'S2SENJOYING', 'S2SWASTE', 'S2SBORING', 
               'S2SUSELIFE', 'S2SUSECLG', 'S2SUSEJOB', 'S2STESTS', 'S2STEXTBOOK', 'S2SSKILLS', 
               'S2SASSEXCL', 'S2APSCIENCE', 'S2HSPLAN', 'S2SUBMITPLAN', 'S2SCLUB', 
            'S2SCOMPETE', 'S2SSUMMERPRG', 'S2SGROUP', 'S2STUTORED', 'X4RFDGMJ123', 'X4RFDGMJSTEM']

family_features = ['X2PAR1EDU', 'X2PAR1OCC_STEM1', 
            'X2PAR1RACE', 'X2PAR2EDU', 'X2PAR2OCC_STEM1', 'X2PAR2RACE', 'X2PARPATTERN', 
            'X2MOMEDU', 'X2MOMOCC_STEM1', 'X2MOMRACE', 'X2DADEDU', 'X2DADOCC_STEM1', 
            'X2DADRACE']

cols_list = demo_features + mvp_features



In [8]:
df = pd.read_csv('data-files/HSLS/hsls_17_student_pets_sr_v1_0.csv', usecols = cols_list)
df.rename(columns = {'X4RFDGMJSTEM': 'target'}, inplace = True)

In [9]:
#drop rows with non-response to S2SLEARN (and many other features) (likely dropped from study)
df = df[df['S2SLEARN'] != -8]

#create dummy variables for races
df['ai_an'] = np.where(df['X2RACE'] == 1, 1, 0)
df['asian'] = np.where(df['X2RACE'] == 2, 1, 0)
df['black'] = np.where(df['X2RACE'] == 3, 1, 0)
df['hispanic'] = np.where((df['X2RACE'] == 4) | (df['X2RACE'] == 5), 1, 0)
df['multiple_race'] = np.where(df['X2RACE'] == 6, 1, 0)
df['nh_pi'] = np.where(df['X2RACE'] == 7, 1, 0)
df['white'] = np.where(df['X2RACE'] == 8, 1, 0)


In [10]:
#'X2DUALLANG', 'X2POVERTY185', 'X2SESQ5_U', 'X2CONTROL', 'X2LOCALE', 'X2REGION'

#create dummy for public/private school
df['private'] = [1 if x == 2 else 0 for x in df['X2CONTROL']]
df['public'] = [1 if x == 1 else 0 for x in df['X2CONTROL']]

In [11]:
#compile all subchoices of STEM domains into yes/no
df.X2STU30OCC_STEM1.replace({-9:0, 9:0, 4:1, 5:1, 6:1}, inplace = True)


In [12]:
classes = ['S2SSPR12', 'S2LIFES12', 'S2BIO1S12', 'S2BIO2S12', 'S2APBIOS12', 
               'S2IBIOS12', 'S2ANATOMYS12', 'S2OTHBIOS12', 'S2CHEM1S12', 'S2CHEM2S12', 'S2APCHEM12', 
               'S2IBCHEM12', 'S2EARTHS12', 'S2APENVS12', 'S2OTHENVS12', 'S2PHYSIC1S12', 
               'S2PHYSIC2S12', 'S2APPHYSIC12', 'S2IBPHYSIC12', 'S2PHYSS12', 'S2TECHS12', 'S2OTHPHYS12', 
               'S2INTGS1S12', 'S2INTGS2S12', 'S2GENS12', 'S2COMPAPP12', 'S2COMPPROG12', 
               'S2APCOMPSCI12', 'S2IBTECH12', 'S2OTHCOMP12', 'S2ENGINEER12', 'S2OTHS12', 'S2APSCIENCE', 'S2IBSCIENCE']

#impute 'no' for items that are missing or were skipped due to not taking a science class
for col in classes:
    df[col].replace({-9:0, -7:0}, inplace= True)


In [13]:
df.S2SSPR12.value_counts()

1    16192
0     4402
Name: S2SSPR12, dtype: int64

In [14]:
#impute unknown with 'no' for if participating in science activity

clubs_cols = ['S2SCLUB', 'S2SCOMPETE', 'S2SSUMMERPRG', 'S2SGROUP', 'S2STUTORED']

for col in clubs_cols:
    df[col].replace({-9:0}, inplace = True)

In [15]:
df.S2HSPLAN = np.where(df['S2HSPLAN'] == 1, 1, 0)
df.S2SUBMITPLAN = np.where(df['S2SUBMITPLAN'] == 1, 1, 0)

In [16]:
#create class for underrepresented group in STEM (women, black, american indian, hispanic, pacific islander)

df['underrep'] = np.where((df['X2SEX'] == 2) |
                          (df['ai_an'] == 1) |
                          (df['black'] == 1) |
                          (df['hispanic'] == 1) |
                          (df['multiple_race'] == 1) |
                          (df['nh_pi'] == 1), 1, 0)
                         

In [17]:
#group HS science classes into broader subjects

df['bio'] = np.where((df['S2LIFES12'] == 1) |
                      (df['S2BIO1S12'] == 1) |
                      (df['S2BIO2S12'] == 1) |
                      (df['S2APBIOS12'] == 1) |
                      (df['S2IBIOS12'] == 1) |
                     (df['S2ANATOMYS12'] == 1) |
                      (df['S2OTHBIOS12'] == 1), 1, 0)
               

df['chem'] = np.where((df['S2CHEM1S12'] == 1) |
                      (df['S2CHEM2S12'] == 1) |
                      (df['S2APCHEM12'] == 1) |
                      (df['S2IBCHEM12'] == 1), 1, 0)
              

df['enviro'] = np.where((df['S2EARTHS12'] == 1) |
                        (df['S2EARTHS12'] == 1) |
                        (df['S2APENVS12'] == 1) |
                        (df['S2OTHENVS12'] == 1), 1, 0)
                        
df['physics'] = np.where((df['S2PHYSIC1S12'] == 1) |
                         (df['S2PHYSIC2S12'] == 1) |
                         (df['S2APPHYSIC12'] == 1) |
                         (df['S2IBPHYSIC12'] == 1) |
                         (df['S2PHYSS12'] == 1), 1, 0)
                         
df['engineering'] = np.where((df['S2ENGINEER12'] == 1), 1, 0)


df['compsci'] = np.where((df['S2COMPAPP12'] == 1) |
                         (df['S2COMPPROG12'] == 1) |
                         (df['S2APCOMPSCI12'] == 1) |
                         (df['S2IBTECH12'] == 1) |
                         (df['S2OTHCOMP12'] == 1), 1, 0)

df['misc_class'] = np.where((df['S2OTHPHYS12'] == 1) |
                            (df['S2INTGS1S12'] == 1) |
                            (df['S2GENS12'] == 1), 1, 0)


In [18]:
#create column for students who took science earlier in the year (but don't now)
df['took_science_2012'] = np.where((df['S2STOOKBEFORE'] == 1) |
                                   (df['bio'] == 1) |
                                   (df['chem'] == 1) |
                                   (df['enviro'] == 1) |
                                   (df['physics'] == 1) |
                                   (df['engineering'] == 1) |
                                   (df['compsci'] == 1) |
                                   (df['misc_class'] == 1), 1, 0)


In [19]:
df.took_science_2012.value_counts()

1    18051
0     2543
Name: took_science_2012, dtype: int64

In [20]:
#impute 'no' for items that are missing or were skipped due to not taking a science class

why_science = ['S2SENJOYS', 'S2SCHALLENGE', 'S2SHSREQ', 'S2SCLGADM', 
               'S2SCLGSUCC', 'S2SCAREER', 'S2SCNSLREC', 'S2STCHRREC', 'S2SPARREC', 'S2SFAMREC', 
               'S2SEMPREC', 'S2SFRIEND', 'S2SDOWELL', 'S2SASSIGNED']

for col in why_science:
    df[col].replace({-9:0, -7:0}, inplace= True)

In [21]:
#create dataframe with target variable
modeling_df = df[(df.target == 0) | (df.target == 1)]



In [50]:
likert_cols = ['S2SPERSON1', 'S2SPERSON2', 
               'S2SLEARN', 'S2SBORN', 'S2SUSELIFE', 'S2SUSECLG', 'S2SUSEJOB', 
                'S2STCHTREAT', 'S2STCHINTRST', 
               'S2STCHEASY', 'S2STCHTHINK', 'S2STCHGIVEUP', 'S2SENJOYING', 'S2SWASTE', 'S2SBORING', 
               'S2STESTS', 'S2STEXTBOOK', 'S2SSKILLS', 
               'S2SASSEXCL']

#change likert questions to agree/disagree
for col in likert_cols:
   modeling_df[col].replace({2:1, 3:0, 4:0, -9:0, -7:0}, inplace = True)

In [51]:
modeling_df.S2SSPR12.value_counts()

1    9381
0    1593
Name: S2SSPR12, dtype: int64

In [52]:
modeling_df.groupby(by = ['target', 'underrep']).took_science_2012.value_counts(normalize = True)

target  underrep  took_science_2012
0       0         1                    0.913410
                  0                    0.086590
        1         1                    0.916964
                  0                    0.083036
1       0         1                    0.955943
                  0                    0.044057
        1         1                    0.945307
                  0                    0.054693
Name: took_science_2012, dtype: float64

In [53]:
modeling_df.underrep.value_counts(normalize = False)

1    7519
0    3455
Name: underrep, dtype: int64

In [54]:
underrep_df = modeling_df[modeling_df['underrep'] == 1]


### Initial Model - ALL

In [55]:
X = modeling_df.drop(columns = 'target', axis =1)
y = modeling_df['target']

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 20)

In [122]:
features = ['S2SCLUB', 'S2SCOMPETE', 'S2SSUMMERPRG', 'S2SGROUP', 'S2STUTORED', 'S2SPERSON1', 'S2SPERSON2', 
               'S2SLEARN', 'S2SBORN', 'S2SUSELIFE', 'S2SUSECLG', 'S2SUSEJOB', 'S2SENJOYING', 'S2SWASTE', 'S2SBORING', 
               'S2STESTS', 'S2STEXTBOOK', 'S2SSKILLS', 'S2STCHTREAT', 'S2STCHINTRST', 
               'S2STCHEASY', 'S2STCHTHINK', 'S2STCHGIVEUP',
               'S2SASSEXCL', 'S2SENJOYS', 'S2SCHALLENGE', 'S2SHSREQ', 'S2SCLGADM', 
               'S2SCLGSUCC', 'S2SCAREER', 'S2SCNSLREC', 'S2STCHRREC', 'S2SPARREC', 'S2SFAMREC', 
               'S2SEMPREC', 'S2SFRIEND', 'S2SDOWELL', 'S2SASSIGNED', 'X2STU30OCC_STEM1', 'took_science_2012', 'bio', 'chem', 'enviro', 'physics', 'engineering', 'compsci', 'misc_class'
            ]
                            

lr_fsm = LogisticRegression(max_iter = 10000, C = 1, random_state = 20, class_weight = 'balanced')
lr_fsm.fit(X_train[features], y_train)

train_pred = lr_fsm.predict(X_train[features])
test_pred = lr_fsm.predict(X_test[features])



In [123]:
coef = dict(zip(list(features), list(lr_fsm.coef_[0])))
sorted_dict1 = sorted(coef.items(), key=lambda kv: kv[1])
sorted_dict1.reverse()
sorted_dict1

[('engineering', 1.0530413965122318),
 ('compsci', 0.7398188694031352),
 ('S2SSUMMERPRG', 0.5111679102922543),
 ('physics', 0.4451942437440215),
 ('X2STU30OCC_STEM1', 0.44478214905452795),
 ('S2SPERSON1', 0.4059532227924043),
 ('S2SUSEJOB', 0.3946540478818828),
 ('S2SENJOYS', 0.3249237149135436),
 ('S2SCOMPETE', 0.2958431562285791),
 ('chem', 0.2870885687669366),
 ('S2SCLUB', 0.2729853820558432),
 ('S2SCAREER', 0.2674560475840533),
 ('S2SPERSON2', 0.24438135191053853),
 ('S2STESTS', 0.1365956994006197),
 ('S2STCHEASY', 0.131065316219976),
 ('S2STEXTBOOK', 0.12682369374079586),
 ('S2SFRIEND', 0.11924791095299672),
 ('S2SPARREC', 0.11896412632789195),
 ('S2SCHALLENGE', 0.06779376591578067),
 ('S2SDOWELL', 0.05687073303476584),
 ('S2SUSELIFE', 0.037533517361460024),
 ('S2SUSECLG', 0.03568733922924389),
 ('enviro', 0.03214016604979138),
 ('S2SLEARN', 0.026782502451821424),
 ('S2SCLGSUCC', 0.008262881259143559),
 ('S2SBORING', 0.006220766892793077),
 ('bio', 0.005634254225364532),
 ('S2SEMP

In [124]:
metric_dict = {}
metric_dict['LogisticRegression'] = {'train_accuracy': metrics.accuracy_score(y_train, train_pred),
                                      'test_accuracy': metrics.accuracy_score(y_test, test_pred),
                                      'train_precision':metrics.precision_score(y_train, train_pred),
                                      'test_precision':metrics.precision_score(y_test, test_pred),
                                      'train_recall':metrics.recall_score(y_train, train_pred),
                                      'test_recall':metrics.recall_score(y_test, test_pred),
                                      'train_f1':metrics.f1_score(y_train, train_pred),
                                      'test_f1':metrics.f1_score(y_test, test_pred)}

In [125]:
metric_dict

{'LogisticRegression': {'train_accuracy': 0.6668185442533319,
  'test_accuracy': 0.6706150341685649,
  'train_precision': 0.37962962962962965,
  'test_precision': 0.38261851015801357,
  'train_recall': 0.6829985301322881,
  'test_recall': 0.658252427184466,
  'train_f1': 0.48800980220549633,
  'test_f1': 0.48394004282655245}}

### Initial - Underrep

In [138]:
X_ur = underrep_df.drop(columns = 'target', axis =1)
y_ur = underrep_df['target']

In [145]:
Xu_train, Xu_test, yu_train, yu_test = train_test_split(X_ur, y_ur, test_size = 0.2, random_state = 20)

In [146]:
lr_ur_fsm = LogisticRegression(max_iter = 10000, C = 10000, random_state = 20, class_weight = 'balanced')
lr_ur_fsm.fit(Xu_train[features], yu_train)

train_ur_pred = lr_ur_fsm.predict(Xu_train[features])
test_ur_pred = lr_ur_fsm.predict(Xu_test[features])


In [147]:
coef = dict(zip(list(features), list(lr_ur_fsm.coef_[0])))
sorted_dict1 = sorted(coef.items(), key=lambda kv: kv[1])
sorted_dict1.reverse()
sorted_dict1

[('engineering', 0.9970508744933627),
 ('compsci', 0.5890869033359162),
 ('S2SSUMMERPRG', 0.5425614099769563),
 ('physics', 0.5020014594064346),
 ('S2SCLUB', 0.4283271338077947),
 ('X2STU30OCC_STEM1', 0.4007775311740715),
 ('S2SCAREER', 0.3766571887239843),
 ('S2SPERSON1', 0.35045965185882283),
 ('S2SPERSON2', 0.28312566061974115),
 ('S2SENJOYS', 0.2621139668490354),
 ('S2SCOMPETE', 0.24587950480382367),
 ('S2SUSELIFE', 0.23812154095380902),
 ('chem', 0.2066207546331234),
 ('S2SUSEJOB', 0.18605859374454345),
 ('S2STEXTBOOK', 0.18018923396336145),
 ('S2SCHALLENGE', 0.17155252634061954),
 ('S2STESTS', 0.09086802426314511),
 ('enviro', 0.057412915532880644),
 ('S2SDOWELL', 0.05620718364592987),
 ('S2SCLGSUCC', 0.05312050445947541),
 ('S2STCHGIVEUP', 0.041654780680799385),
 ('S2SBORN', 0.03617879800995662),
 ('S2SASSEXCL', 0.0315522564764834),
 ('bio', 0.03133814312344868),
 ('S2SPARREC', 0.02446124146812763),
 ('took_science_2012', 0.010177301131754652),
 ('S2SWASTE', 0.008421447665224294

In [142]:
metric_dict['URLogisticRegression'] = {'train_accuracy': metrics.accuracy_score(yu_train, train_ur_pred),
                                      'test_accuracy': metrics.accuracy_score(yu_test, test_ur_pred),
                                      'train_precision':metrics.precision_score(yu_train, train_ur_pred),
                                      'test_precision':metrics.precision_score(yu_test, test_ur_pred),
                                      'train_recall':metrics.recall_score(yu_train, train_ur_pred),
                                      'test_recall':metrics.recall_score(yu_test, test_ur_pred),
                                      'train_f1':metrics.f1_score(yu_train, train_ur_pred),
                                      'test_f1':metrics.f1_score(yu_test, test_ur_pred)}

In [143]:
metric_dict

{'LogisticRegression': {'train_accuracy': 0.6668185442533319,
  'test_accuracy': 0.6706150341685649,
  'train_precision': 0.37962962962962965,
  'test_precision': 0.38261851015801357,
  'train_recall': 0.6829985301322881,
  'test_recall': 0.658252427184466,
  'train_f1': 0.48800980220549633,
  'test_f1': 0.48394004282655245},
 'GS_LogisticRegression': {'train_accuracy': 0.666704636063333,
  'test_accuracy': 0.6701594533029612,
  'train_precision': 0.3795262728015246,
  'test_precision': 0.38218714768883877,
  'train_recall': 0.6829985301322881,
  'test_recall': 0.658252427184466,
  'train_f1': 0.487924396219811,
  'test_f1': 0.48359486447931527},
 'GS_URLogisticRegression': {'train_accuracy': 0.6731504571903575,
  'test_accuracy': 0.6549202127659575,
  'train_precision': 0.3138592750533049,
  'test_precision': 0.2801358234295416,
  'train_recall': 0.6733760292772186,
  'test_recall': 0.6346153846153846,
  'train_f1': 0.42815590459569514,
  'test_f1': 0.38869257950530034},
 'URLogisticR

### Overrep

In [67]:
overrep_df = modeling_df[modeling_df['underrep'] == 0]

X_or = overrep_df.drop(columns = 'target', axis =1)
y_or = overrep_df['target']

Xo_train, Xo_test, yo_train, yo_test = train_test_split(X_or, y_or, test_size = 0.2, random_state = 20)


In [68]:
lr_or_fsm = LogisticRegression(max_iter = 10000, C = 10000, random_state = 20, class_weight = 'balanced')
lr_or_fsm.fit(Xo_train[features], yo_train)

train_or_pred = lr_or_fsm.predict(Xo_train[features])
test_or_pred = lr_or_fsm.predict(Xo_test[features])


In [69]:
coef = dict(zip(list(features), list(lr_or_fsm.coef_[0])))
sorted_dict1 = sorted(coef.items(), key=lambda kv: kv[1])
sorted_dict1.reverse()
sorted_dict1

[('engineering', 0.9141941837290813),
 ('X2STU30OCC_STEM1', 0.7890787613546657),
 ('compsci', 0.7424814557238434),
 ('misc_class', 0.4895197359444009),
 ('chem', 0.48720534070259564),
 ('physics', 0.4754496936859682),
 ('S2SUSEJOB', 0.41877807174489423),
 ('S2SCLUB', 0.407959327067503),
 ('S2SSUMMERPRG', 0.3416796163099416),
 ('S2SCOMPETE', 0.33409883775641386),
 ('S2SEMPREC', 0.30959623289334653),
 ('S2SCAREER', 0.3005712522145897),
 ('S2SPERSON1', 0.2565461654877844),
 ('S2SDOWELL', 0.25365375739854706),
 ('S2SENJOYS', 0.22902158460148012),
 ('S2SPERSON2', 0.22772217319388816),
 ('enviro', 0.16481204387152013),
 ('S2SPARREC', 0.1485614106161944),
 ('S2SCHALLENGE', 0.12017622777655056),
 ('S2STCHINTRST', 0.09789913784133536),
 ('S2SUSECLG', 0.08747514729127265),
 ('bio', 0.06157812382429152),
 ('S2SLEARN', 0.057400729964577596),
 ('S2SENJOYING', 0.05579211677375349),
 ('S2SHSREQ', 0.03373219066761498),
 ('S2STESTS', 0.027075672930474026),
 ('S2SBORN', 0.014113602864822912),
 ('S2STCHR

In [70]:
metric_dict['ORLogisticRegression'] = {'train_accuracy': metrics.accuracy_score(yo_train, train_or_pred),
                                      'test_accuracy': metrics.accuracy_score(yo_test, test_or_pred),
                                      'train_precision':metrics.precision_score(yo_train, train_or_pred),
                                      'test_precision':metrics.precision_score(yo_test, test_or_pred),
                                      'train_recall':metrics.recall_score(yo_train, train_or_pred),
                                      'test_recall':metrics.recall_score(yo_test, test_or_pred),
                                      'train_f1':metrics.f1_score(yo_train, train_or_pred),
                                      'test_f1':metrics.f1_score(yo_test, test_or_pred)}

In [71]:
metric_dict

{'LogisticRegression': {'train_accuracy': 0.666704636063333,
  'test_accuracy': 0.6701594533029612,
  'train_precision': 0.3795262728015246,
  'test_precision': 0.38218714768883877,
  'train_recall': 0.6829985301322881,
  'test_recall': 0.658252427184466,
  'train_f1': 0.487924396219811,
  'test_f1': 0.48359486447931527},
 'URLogisticRegression': {'train_accuracy': 0.6729842061512884,
  'test_accuracy': 0.6555851063829787,
  'train_precision': 0.31356655290102387,
  'test_precision': 0.28061224489795916,
  'train_recall': 0.6724611161939615,
  'test_recall': 0.6346153846153846,
  'train_f1': 0.42769857433808556,
  'test_f1': 0.3891509433962264},
 'ORLogisticRegression': {'train_accuracy': 0.6877713458755427,
  'test_accuracy': 0.6816208393632417,
  'train_precision': 0.546408393866021,
  'test_precision': 0.5084745762711864,
  'train_recall': 0.6922290388548057,
  'test_recall': 0.6666666666666666,
  'train_f1': 0.6107352277852954,
  'test_f1': 0.576923076923077}}

## Decision Tree - Initial

In [72]:
dtc_all = DecisionTreeClassifier(random_state = 20)

param_dict={'max_depth': range(1,10),
            'criterion': ['gini', 'entropy'],
            'splitter': ['random', 'best'],
            'max_features': ['auto', 'sqrt', 'log2']}

grid_tree=GridSearchCV(dtc_all, 
                       param_dict, 
                       cv=10, 
                       scoring='f1', 
                       verbose=1, 
                       n_jobs=-1)

In [73]:
grid_tree.fit(X_train[features],y_train)

Fitting 10 folds for each of 108 candidates, totalling 1080 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done 576 tasks      | elapsed:   17.1s
[Parallel(n_jobs=-1)]: Done 1073 out of 1080 | elapsed:   23.8s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 1080 out of 1080 | elapsed:   23.9s finished


GridSearchCV(cv=10, estimator=DecisionTreeClassifier(random_state=20),
             n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': range(1, 10),
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'splitter': ['random', 'best']},
             scoring='f1', verbose=1)

In [74]:

dtc_importance = grid_tree.best_estimator_.feature_importances_
feature_importance = dict(zip(features, dtc_importance))
sorted_dict = sorted(feature_importance.items(), key=lambda kv: kv[1])
sorted_dict.reverse()
sorted_dict

[('S2SENJOYS', 0.19834269849943173),
 ('S2SUSEJOB', 0.14191150206068948),
 ('S2SPERSON2', 0.0867829447252782),
 ('X2STU30OCC_STEM1', 0.060497793969734405),
 ('compsci', 0.04312007300366041),
 ('S2SASSIGNED', 0.02957971704615505),
 ('S2SENJOYING', 0.023274191793686453),
 ('S2STCHEASY', 0.021066562866945308),
 ('S2SCLUB', 0.020432104996044424),
 ('S2SFRIEND', 0.01984322511781452),
 ('physics', 0.018510917733312857),
 ('chem', 0.01809851029058051),
 ('S2SCHALLENGE', 0.015892189104113247),
 ('S2SLEARN', 0.015618739337125487),
 ('engineering', 0.015589923561951031),
 ('S2SHSREQ', 0.014888772829768013),
 ('S2SCOMPETE', 0.014066304204481562),
 ('S2SWASTE', 0.013732179476094376),
 ('S2STESTS', 0.01314660115847262),
 ('S2SUSELIFE', 0.012778605011925013),
 ('S2STCHTREAT', 0.012722185980080946),
 ('S2STEXTBOOK', 0.012473624134255214),
 ('S2SPERSON1', 0.01227899173437178),
 ('S2STCHINTRST', 0.011832443930538667),
 ('S2SCLGSUCC', 0.011827006361587624),
 ('S2SPARREC', 0.011517533818307052),
 ('S2SGR

In [75]:

y_pred_dtc_test = grid_tree.best_estimator_.predict(X_test[features])
y_pred_dtc_train = grid_tree.best_estimator_.predict(X_train[features])

In [76]:
metric_dict['DecisionTree'] = {'train_accuracy': metrics.accuracy_score(y_train, y_pred_dtc_train),
                                  'test_accuracy': metrics.accuracy_score(y_test, y_pred_dtc_test),
                                  'train_precision':metrics.precision_score(y_train, y_pred_dtc_train),
                                  'test_precision':metrics.precision_score(y_test, y_pred_dtc_test),
                                  'train_recall':metrics.recall_score(y_train, y_pred_dtc_train),
                                  'test_recall':metrics.recall_score(y_test, y_pred_dtc_test),
                                  'train_f1':metrics.f1_score(y_train, y_pred_dtc_train),
                                  'test_f1':metrics.f1_score(y_test, y_pred_dtc_test)}


In [77]:
metric_dict

{'LogisticRegression': {'train_accuracy': 0.666704636063333,
  'test_accuracy': 0.6701594533029612,
  'train_precision': 0.3795262728015246,
  'test_precision': 0.38218714768883877,
  'train_recall': 0.6829985301322881,
  'test_recall': 0.658252427184466,
  'train_f1': 0.487924396219811,
  'test_f1': 0.48359486447931527},
 'URLogisticRegression': {'train_accuracy': 0.6729842061512884,
  'test_accuracy': 0.6555851063829787,
  'train_precision': 0.31356655290102387,
  'test_precision': 0.28061224489795916,
  'train_recall': 0.6724611161939615,
  'test_recall': 0.6346153846153846,
  'train_f1': 0.42769857433808556,
  'test_f1': 0.3891509433962264},
 'ORLogisticRegression': {'train_accuracy': 0.6877713458755427,
  'test_accuracy': 0.6816208393632417,
  'train_precision': 0.546408393866021,
  'test_precision': 0.5084745762711864,
  'train_recall': 0.6922290388548057,
  'test_recall': 0.6666666666666666,
  'train_f1': 0.6107352277852954,
  'test_f1': 0.576923076923077},
 'DecisionTree': {'tr

## Grid Search LogREG

In [126]:
clf = LogisticRegression()

param_grid = {
    
    'max_iter': [5000, 10000, 50000, 100000],
    'C': [0.00001, 0.001, 1, 100, 10000, 1000000]
}

gs_LR = GridSearchCV(clf, param_grid, cv=3)
gs_LR.fit(X_train[features], y_train)

gs_LR.best_params_

{'C': 100, 'max_iter': 5000}

In [127]:
lr_gs = LogisticRegression(max_iter = 5000, C = 100, random_state = 20, class_weight = 'balanced')
lr_gs.fit(X_train[features], y_train)

gs_train_pred = lr_gs.predict(X_train[features])
gs_test_pred = lr_gs.predict(X_test[features])

In [166]:
coef_gs = dict(zip(list(features), list(lr_gs.coef_[0])))
sorted_dict = sorted(coef_gs.items(), key=lambda kv: kv[1])
sorted_dict.reverse()
sorted_dict

[('engineering', 1.083561331889614),
 ('compsci', 0.7467732254469444),
 ('S2SSUMMERPRG', 0.5169793747115837),
 ('physics', 0.4480204142502414),
 ('X2STU30OCC_STEM1', 0.4450900345516614),
 ('S2SPERSON1', 0.4063139185191107),
 ('S2SUSEJOB', 0.3970252196693254),
 ('S2SENJOYS', 0.32631602214472305),
 ('S2SCOMPETE', 0.2966996991752658),
 ('chem', 0.28973128344888266),
 ('S2SCLUB', 0.27370477697571577),
 ('S2SCAREER', 0.2678601774903595),
 ('S2SPERSON2', 0.24400427613920647),
 ('S2STESTS', 0.13741994894349133),
 ('S2STCHEASY', 0.13181836188570004),
 ('S2STEXTBOOK', 0.12707023638312526),
 ('S2SFRIEND', 0.11984549475552844),
 ('S2SPARREC', 0.11934217801255906),
 ('S2SCHALLENGE', 0.06720106504110034),
 ('S2SDOWELL', 0.05609060783309761),
 ('S2SUSELIFE', 0.03685089674655132),
 ('S2SUSECLG', 0.035525370669604824),
 ('enviro', 0.03438122467446653),
 ('S2SLEARN', 0.026642677359111643),
 ('S2SCLGSUCC', 0.007933574316279014),
 ('bio', 0.00732335097922541),
 ('S2SBORING', 0.006398617194787702),
 ('S2S

In [129]:

metric_dict['GS_LogisticRegression'] = {'train_accuracy': metrics.accuracy_score(y_train, gs_train_pred),
                                      'test_accuracy': metrics.accuracy_score(y_test, gs_test_pred),
                                      'train_precision':metrics.precision_score(y_train, gs_train_pred),
                                      'test_precision':metrics.precision_score(y_test, gs_test_pred),
                                      'train_recall':metrics.recall_score(y_train, gs_train_pred),
                                      'test_recall':metrics.recall_score(y_test, gs_test_pred),
                                      'train_f1':metrics.f1_score(y_train, gs_train_pred),
                                      'test_f1':metrics.f1_score(y_test, gs_test_pred)}

In [130]:
metric_dict

{'LogisticRegression': {'train_accuracy': 0.6668185442533319,
  'test_accuracy': 0.6706150341685649,
  'train_precision': 0.37962962962962965,
  'test_precision': 0.38261851015801357,
  'train_recall': 0.6829985301322881,
  'test_recall': 0.658252427184466,
  'train_f1': 0.48800980220549633,
  'test_f1': 0.48394004282655245},
 'GS_LogisticRegression': {'train_accuracy': 0.666704636063333,
  'test_accuracy': 0.6701594533029612,
  'train_precision': 0.3795262728015246,
  'test_precision': 0.38218714768883877,
  'train_recall': 0.6829985301322881,
  'test_recall': 0.658252427184466,
  'train_f1': 0.487924396219811,
  'test_f1': 0.48359486447931527}}

### GridSearch UR

In [133]:
clf = LogisticRegression()

param_grid = {
    
    'max_iter': [5000, 10000, 50000, 100000],
    'C': [0.00001, 0.001, 1, 100, 10000, 1000000]
}

gs_LR_ur = GridSearchCV(clf, param_grid, cv=5)
gs_LR_ur.fit(Xu_train[features], yu_train)

gs_LR_ur.best_params_

{'C': 1, 'max_iter': 5000}

In [134]:
lr_ur_gs = LogisticRegression(max_iter = 5000, C = 1, random_state = 20, class_weight = 'balanced')
lr_ur_gs.fit(Xu_train[features], yu_train)

gstrain_ur_pred = lr_ur_gs.predict(Xu_train[features])
gstest_ur_pred = lr_ur_gs.predict(Xu_test[features])


In [165]:
coef_gs_ur = dict(zip(list(features), list(lr_ur_gs.coef_[0])))
sorted_dict2 = sorted(coef_gs_ur.items(), key=lambda kv: kv[1])
sorted_dict2.reverse()
sorted_dict2

[('engineering', 0.9410029971033435),
 ('compsci', 0.5791750254398281),
 ('S2SSUMMERPRG', 0.5330158548167507),
 ('physics', 0.4985073989123022),
 ('S2SCLUB', 0.42448485535483144),
 ('X2STU30OCC_STEM1', 0.4003135875705004),
 ('S2SCAREER', 0.37463004053656945),
 ('S2SPERSON1', 0.35003412614339735),
 ('S2SPERSON2', 0.2833699061110314),
 ('S2SENJOYS', 0.2610872850901317),
 ('S2SCOMPETE', 0.24572817846348055),
 ('S2SUSELIFE', 0.23703494898963667),
 ('chem', 0.20417178597471572),
 ('S2SUSEJOB', 0.18454659124166792),
 ('S2STEXTBOOK', 0.17971592273688886),
 ('S2SCHALLENGE', 0.17146989572067348),
 ('S2STESTS', 0.09058216174606468),
 ('S2SDOWELL', 0.05730663336410187),
 ('enviro', 0.05507824510677774),
 ('S2SCLGSUCC', 0.054380771915705485),
 ('S2STCHGIVEUP', 0.0409908801182111),
 ('S2SBORN', 0.0358763899815931),
 ('S2SASSEXCL', 0.03141293323406447),
 ('bio', 0.030150568348038777),
 ('S2SPARREC', 0.024969074032367783),
 ('took_science_2012', 0.011341975141592029),
 ('S2SGROUP', 0.0082956974147226

In [136]:
metric_dict['GS_URLogisticRegression'] = {'train_accuracy': metrics.accuracy_score(yu_train, gstrain_ur_pred),
                                      'test_accuracy': metrics.accuracy_score(yu_test, gstest_ur_pred),
                                      'train_precision':metrics.precision_score(yu_train, gstrain_ur_pred),
                                      'test_precision':metrics.precision_score(yu_test, gstest_ur_pred),
                                      'train_recall':metrics.recall_score(yu_train, gstrain_ur_pred),
                                      'test_recall':metrics.recall_score(yu_test, gstest_ur_pred),
                                      'train_f1':metrics.f1_score(yu_train, gstrain_ur_pred),
                                      'test_f1':metrics.f1_score(yu_test, gstest_ur_pred)}

In [144]:
metric_dict

{'LogisticRegression': {'train_accuracy': 0.6668185442533319,
  'test_accuracy': 0.6706150341685649,
  'train_precision': 0.37962962962962965,
  'test_precision': 0.38261851015801357,
  'train_recall': 0.6829985301322881,
  'test_recall': 0.658252427184466,
  'train_f1': 0.48800980220549633,
  'test_f1': 0.48394004282655245},
 'GS_LogisticRegression': {'train_accuracy': 0.666704636063333,
  'test_accuracy': 0.6701594533029612,
  'train_precision': 0.3795262728015246,
  'test_precision': 0.38218714768883877,
  'train_recall': 0.6829985301322881,
  'test_recall': 0.658252427184466,
  'train_f1': 0.487924396219811,
  'test_f1': 0.48359486447931527},
 'GS_URLogisticRegression': {'train_accuracy': 0.6731504571903575,
  'test_accuracy': 0.6549202127659575,
  'train_precision': 0.3138592750533049,
  'test_precision': 0.2801358234295416,
  'train_recall': 0.6733760292772186,
  'test_recall': 0.6346153846153846,
  'train_f1': 0.42815590459569514,
  'test_f1': 0.38869257950530034},
 'URLogisticR

## SELECT KBEST

In [169]:
selector = SelectKBest(f_regression, k = 20)
selector.fit(X_train[features], y_train)

SelectKBest(k=20, score_func=<function f_regression at 0x1a1661eea0>)

In [170]:
selected_columns = X_train[features].columns[selector.get_support()]

In [171]:
selected_columns

Index(['S2SCLUB', 'S2SCOMPETE', 'S2SSUMMERPRG', 'S2SPERSON1', 'S2SPERSON2',
       'S2SUSELIFE', 'S2SUSEJOB', 'S2SENJOYING', 'S2STESTS', 'S2STEXTBOOK',
       'S2SSKILLS', 'S2STCHEASY', 'S2SASSEXCL', 'S2SENJOYS', 'S2SCHALLENGE',
       'S2SCLGSUCC', 'S2SCAREER', 'S2SDOWELL', 'X2STU30OCC_STEM1',
       'engineering'],
      dtype='object')

In [172]:
lr_kbest = LogisticRegression(max_iter = 5000, C = 100, class_weight = 'balanced', random_state =20)

lr_kbest.fit(X_train[selected_columns], y_train)
kbest_train_pred = lr_kbest.predict(X_train[selected_columns])
kbest_test_pred= lr_kbest.predict(X_test[selected_columns])

In [174]:
coef_kbest = dict(zip(list(selected_columns), list(lr_kbest.coef_[0])))
sorted_dict3 = sorted(coef_kbest.items(), key=lambda kv: kv[1])
sorted_dict3.reverse()
sorted_dict3

[('engineering', 1.1865222540213665),
 ('S2SSUMMERPRG', 0.4877643896153283),
 ('S2SPERSON1', 0.449199597213497),
 ('X2STU30OCC_STEM1', 0.4486423324199768),
 ('S2SUSEJOB', 0.40475614049677977),
 ('S2SCOMPETE', 0.3557216120400218),
 ('S2SENJOYS', 0.32431899776290896),
 ('S2SPERSON2', 0.28850501210465035),
 ('S2SCLUB', 0.27344724561477474),
 ('S2SCAREER', 0.24285786398751247),
 ('S2STESTS', 0.15092563413900445),
 ('S2STEXTBOOK', 0.10736529852605234),
 ('S2SCHALLENGE', 0.09302531566734479),
 ('S2SUSELIFE', 0.033370913775717916),
 ('S2STCHEASY', 0.0075105151984913715),
 ('S2SDOWELL', 0.005247807771366452),
 ('S2SENJOYING', -0.00562492277282398),
 ('S2SSKILLS', -0.031103737304240257),
 ('S2SCLGSUCC', -0.1027440042281457),
 ('S2SASSEXCL', -0.10847197525944281)]

In [175]:
metric_dict['KBEST_LogisticRegression'] = {'train_accuracy': metrics.accuracy_score(y_train, kbest_train_pred),
                                      'test_accuracy': metrics.accuracy_score(y_test, kbest_test_pred),
                                      'train_precision':metrics.precision_score(y_train, kbest_train_pred),
                                      'test_precision':metrics.precision_score(y_test, kbest_test_pred),
                                      'train_recall':metrics.recall_score(y_train, kbest_train_pred),
                                      'test_recall':metrics.recall_score(y_test, kbest_test_pred),
                                      'train_f1':metrics.f1_score(y_train, kbest_train_pred),
                                      'test_f1':metrics.f1_score(y_test, kbest_test_pred)}

In [176]:
metric_dict

{'LogisticRegression': {'train_accuracy': 0.6668185442533319,
  'test_accuracy': 0.6706150341685649,
  'train_precision': 0.37962962962962965,
  'test_precision': 0.38261851015801357,
  'train_recall': 0.6829985301322881,
  'test_recall': 0.658252427184466,
  'train_f1': 0.48800980220549633,
  'test_f1': 0.48394004282655245},
 'GS_LogisticRegression': {'train_accuracy': 0.666704636063333,
  'test_accuracy': 0.6701594533029612,
  'train_precision': 0.3795262728015246,
  'test_precision': 0.38218714768883877,
  'train_recall': 0.6829985301322881,
  'test_recall': 0.658252427184466,
  'train_f1': 0.487924396219811,
  'test_f1': 0.48359486447931527},
 'GS_URLogisticRegression': {'train_accuracy': 0.6731504571903575,
  'test_accuracy': 0.6549202127659575,
  'train_precision': 0.3138592750533049,
  'test_precision': 0.2801358234295416,
  'train_recall': 0.6733760292772186,
  'test_recall': 0.6346153846153846,
  'train_f1': 0.42815590459569514,
  'test_f1': 0.38869257950530034},
 'URLogisticR