In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
%matplotlib inline
import plotly
import plotly.figure_factory as ff
from plotly.offline import *
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn import metrics
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_regression,mutual_info_regression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier



pd.set_option('display.max_columns', 300)

In [2]:
demo_features = ['X2SEX', 'X2RACE', 'X2DUALLANG', 'X2POVERTY185', 'X2SESQ5_U', 'X2CONTROL', 'X2LOCALE', 'X2REGION']


mvp_features = ['X2STU30OCC_STEM1', 'X2STUEDEXPCT', 'X2S2SSPR12', 'S2SPERSON1', 'S2SPERSON2', 
               'S2SLEARN', 'S2SBORN', 'S2SUSELIFE', 'S2SUSECLG', 'S2SUSEJOB', 
               'S2SSPR12', 'S2LIFES12', 'S2BIO1S12', 'S2BIO2S12', 'S2APBIOS12', 
               'S2IBIOS12', 'S2ANATOMYS12', 'S2OTHBIOS12', 'S2CHEM1S12', 'S2CHEM2S12', 'S2APCHEM12', 
               'S2IBCHEM12', 'S2EARTHS12', 'S2APENVS12', 'S2OTHENVS12', 'S2PHYSIC1S12', 
               'S2PHYSIC2S12', 'S2APPHYSIC12', 'S2IBPHYSIC12', 'S2PHYSS12', 'S2TECHS12', 'S2OTHPHYS12', 
               'S2INTGS1S12', 'S2INTGS2S12', 'S2GENS12', 'S2COMPAPP12', 'S2COMPPROG12', 
               'S2APCOMPSCI12', 'S2IBTECH12', 'S2OTHCOMP12', 'S2ENGINEER12', 'S2OTHS12', 'S2OTHS12SP', 
               'S2HISCIENCE12', 'S2APSCIENCE', 'S2IBSCIENCE', 'S2STOOKBEFORE', 'S2SENJOYS', 'S2SCHALLENGE', 'S2SHSREQ', 'S2SCLGADM', 
               'S2SCLGSUCC', 'S2SCAREER', 'S2SCNSLREC', 'S2STCHRREC', 'S2SPARREC', 'S2SFAMREC', 
               'S2SEMPREC', 'S2SFRIEND', 'S2SDOWELL', 'S2SASSIGNED', 'S2STCHTREAT', 'S2STCHINTRST', 
               'S2STCHEASY', 'S2STCHTHINK', 'S2STCHGIVEUP', 'S2SENJOYING', 'S2SWASTE', 'S2SBORING', 
               'S2SUSELIFE', 'S2SUSECLG', 'S2SUSEJOB', 'S2STESTS', 'S2STEXTBOOK', 'S2SSKILLS', 
               'S2SASSEXCL', 'S2APSCIENCE', 'S2HSPLAN', 'S2SUBMITPLAN', 'S2SCLUB', 
            'S2SCOMPETE', 'S2SSUMMERPRG', 'S2SGROUP', 'S2STUTORED', 'X4RFDGMJ123', 'X4RFDGMJSTEM']

family_features = ['X2PAR1EDU', 'X2PAR1OCC_STEM1', 
            'X2PAR1RACE', 'X2PAR2EDU', 'X2PAR2OCC_STEM1', 'X2PAR2RACE', 'X2PARPATTERN', 
            'X2MOMEDU', 'X2MOMOCC_STEM1', 'X2MOMRACE', 'X2DADEDU', 'X2DADOCC_STEM1', 
            'X2DADRACE']

cols_list = demo_features + mvp_features



In [3]:
df = pd.read_csv('data-files/HSLS/hsls_17_student_pets_sr_v1_0.csv', usecols = cols_list)
df.rename(columns = {'X4RFDGMJSTEM': 'target'}, inplace = True)

In [4]:
#drop rows with non-response to S2SLEARN (and many other features) (likely dropped from study)
df = df[df['S2SLEARN'] != -8]

#create dummy variables for races
df['ai_an'] = np.where(df['X2RACE'] == 1, 1, 0)
df['asian'] = np.where(df['X2RACE'] == 2, 1, 0)
df['black'] = np.where(df['X2RACE'] == 3, 1, 0)
df['hispanic'] = np.where((df['X2RACE'] == 4) | (df['X2RACE'] == 5), 1, 0)
df['multiple_race'] = np.where(df['X2RACE'] == 6, 1, 0)
df['nh_pi'] = np.where(df['X2RACE'] == 7, 1, 0)
df['white'] = np.where(df['X2RACE'] == 8, 1, 0)


In [5]:
#'X2DUALLANG', 'X2POVERTY185', 'X2SESQ5_U', 'X2CONTROL', 'X2LOCALE', 'X2REGION'

#create dummy for public/private school
df['private'] = [1 if x == 2 else 0 for x in df['X2CONTROL']]
df['public'] = [1 if x == 1 else 0 for x in df['X2CONTROL']]

In [6]:
#compile all subchoices of STEM domains into yes/no
df.X2STU30OCC_STEM1.replace({-9:0, 9:0, 4:1, 5:1, 6:1}, inplace = True)


In [7]:
classes = ['S2SSPR12', 'S2LIFES12', 'S2BIO1S12', 'S2BIO2S12', 'S2APBIOS12', 
               'S2IBIOS12', 'S2ANATOMYS12', 'S2OTHBIOS12', 'S2CHEM1S12', 'S2CHEM2S12', 'S2APCHEM12', 
               'S2IBCHEM12', 'S2EARTHS12', 'S2APENVS12', 'S2OTHENVS12', 'S2PHYSIC1S12', 
               'S2PHYSIC2S12', 'S2APPHYSIC12', 'S2IBPHYSIC12', 'S2PHYSS12', 'S2TECHS12', 'S2OTHPHYS12', 
               'S2INTGS1S12', 'S2INTGS2S12', 'S2GENS12', 'S2COMPAPP12', 'S2COMPPROG12', 
               'S2APCOMPSCI12', 'S2IBTECH12', 'S2OTHCOMP12', 'S2ENGINEER12', 'S2OTHS12', 'S2APSCIENCE', 'S2IBSCIENCE']

#impute 'no' for items that are missing or were skipped due to not taking a science class
for col in classes:
    df[col].replace({-9:0, -7:0}, inplace= True)


In [8]:
df.S2SSPR12.value_counts()

1    16192
0     4402
Name: S2SSPR12, dtype: int64

In [9]:
#impute unknown with 'no' for if participating in science activity

clubs_cols = ['S2SCLUB', 'S2SCOMPETE', 'S2SSUMMERPRG', 'S2SGROUP', 'S2STUTORED']

for col in clubs_cols:
    df[col].replace({-9:0}, inplace = True)

In [10]:
df.S2HSPLAN = np.where(df['S2HSPLAN'] == 1, 1, 0)
df.S2SUBMITPLAN = np.where(df['S2SUBMITPLAN'] == 1, 1, 0)

In [11]:
#create class for underrepresented group in STEM (women, black, american indian, hispanic, pacific islander)

df['underrep'] = np.where((df['X2SEX'] == 2) |
                          (df['ai_an'] == 1) |
                          (df['black'] == 1) |
                          (df['hispanic'] == 1) |
                          (df['multiple_race'] == 1) |
                          (df['nh_pi'] == 1), 1, 0)
                         

In [12]:
#group HS science classes into broader subjects

df['bio'] = np.where((df['S2LIFES12'] == 1) |
                      (df['S2BIO1S12'] == 1) |
                      (df['S2BIO2S12'] == 1) |
                      (df['S2APBIOS12'] == 1) |
                      (df['S2IBIOS12'] == 1) |
                     (df['S2ANATOMYS12'] == 1) |
                      (df['S2OTHBIOS12'] == 1), 1, 0)
               

df['chem'] = np.where((df['S2CHEM1S12'] == 1) |
                      (df['S2CHEM2S12'] == 1) |
                      (df['S2APCHEM12'] == 1) |
                      (df['S2IBCHEM12'] == 1), 1, 0)
              

df['enviro'] = np.where((df['S2EARTHS12'] == 1) |
                        (df['S2EARTHS12'] == 1) |
                        (df['S2APENVS12'] == 1) |
                        (df['S2OTHENVS12'] == 1), 1, 0)
                        
df['physics'] = np.where((df['S2PHYSIC1S12'] == 1) |
                         (df['S2PHYSIC2S12'] == 1) |
                         (df['S2APPHYSIC12'] == 1) |
                         (df['S2IBPHYSIC12'] == 1) |
                         (df['S2PHYSS12'] == 1), 1, 0)
                         
df['engineering'] = np.where((df['S2ENGINEER12'] == 1), 1, 0)


df['compsci'] = np.where((df['S2COMPAPP12'] == 1) |
                         (df['S2COMPPROG12'] == 1) |
                         (df['S2APCOMPSCI12'] == 1) |
                         (df['S2IBTECH12'] == 1) |
                         (df['S2OTHCOMP12'] == 1), 1, 0)

df['misc_class'] = np.where((df['S2OTHPHYS12'] == 1) |
                            (df['S2INTGS1S12'] == 1) |
                            (df['S2GENS12'] == 1), 1, 0)


In [13]:
#create column for students who took science earlier in the year (but don't now)
df['took_science_2012'] = np.where((df['S2STOOKBEFORE'] == 1) |
                                   (df['bio'] == 1) |
                                   (df['chem'] == 1) |
                                   (df['enviro'] == 1) |
                                   (df['physics'] == 1) |
                                   (df['engineering'] == 1) |
                                   (df['compsci'] == 1) |
                                   (df['misc_class'] == 1), 1, 0)


In [14]:
df.took_science_2012.value_counts()

1    18051
0     2543
Name: took_science_2012, dtype: int64

In [15]:
#impute 'no' for items that are missing or were skipped due to not taking a science class

why_science = ['S2SENJOYS', 'S2SCHALLENGE', 'S2SHSREQ', 'S2SCLGADM', 
               'S2SCLGSUCC', 'S2SCAREER', 'S2SCNSLREC', 'S2STCHRREC', 'S2SPARREC', 'S2SFAMREC', 
               'S2SEMPREC', 'S2SFRIEND', 'S2SDOWELL', 'S2SASSIGNED']

for col in why_science:
    df[col].replace({-9:0, -7:0}, inplace= True)

In [16]:
#create dataframe with target variable
modeling_df = df[(df.target == 0) | (df.target == 1)]



In [17]:
likert_cols = ['S2SPERSON1', 'S2SPERSON2', 
               'S2SLEARN', 'S2SBORN', 'S2SUSELIFE', 'S2SUSECLG', 'S2SUSEJOB', 
                'S2STCHTREAT', 'S2STCHINTRST', 
               'S2STCHEASY', 'S2STCHTHINK', 'S2STCHGIVEUP', 'S2SENJOYING', 'S2SWASTE', 'S2SBORING', 
               'S2STESTS', 'S2STEXTBOOK', 'S2SSKILLS', 
               'S2SASSEXCL']

#change likert questions to agree/disagree
for col in likert_cols:
   modeling_df[col].replace({2:1, 3:0, 4:0, -9:0, }, inplace = True)

In [18]:
modeling_df.S2SSPR12.value_counts()

1    9381
0    1593
Name: S2SSPR12, dtype: int64

In [19]:
modeling_df.groupby(by = ['target', 'underrep']).took_science_2012.value_counts(normalize = True)

target  underrep  took_science_2012
0       0         1                    0.913410
                  0                    0.086590
        1         1                    0.916964
                  0                    0.083036
1       0         1                    0.955943
                  0                    0.044057
        1         1                    0.945307
                  0                    0.054693
Name: took_science_2012, dtype: float64

In [20]:
modeling_df.underrep.value_counts(normalize = False)

1    7519
0    3455
Name: underrep, dtype: int64

In [21]:
underrep_df = modeling_df[modeling_df['underrep'] == 1]
underrep_df

Unnamed: 0,X2SEX,X2RACE,X2DUALLANG,X2POVERTY185,X2SESQ5_U,X2STU30OCC_STEM1,X2STUEDEXPCT,X2S2SSPR12,X2CONTROL,X2LOCALE,X2REGION,target,X4RFDGMJ123,S2APSCIENCE,S2IBSCIENCE,S2SSPR12,S2LIFES12,S2BIO1S12,S2BIO2S12,S2APBIOS12,S2IBIOS12,S2ANATOMYS12,S2OTHBIOS12,S2CHEM1S12,S2CHEM2S12,S2APCHEM12,S2IBCHEM12,S2EARTHS12,S2APENVS12,S2OTHENVS12,S2PHYSIC1S12,S2PHYSIC2S12,S2APPHYSIC12,S2IBPHYSIC12,S2PHYSS12,S2TECHS12,S2OTHPHYS12,S2INTGS1S12,S2INTGS2S12,S2GENS12,S2COMPAPP12,S2COMPPROG12,S2APCOMPSCI12,S2IBTECH12,S2OTHCOMP12,S2ENGINEER12,S2OTHS12,S2OTHS12SP,S2HISCIENCE12,S2STOOKBEFORE,S2SENJOYS,S2SCHALLENGE,S2SHSREQ,S2SCLGADM,S2SCLGSUCC,S2SCAREER,S2SCNSLREC,S2STCHRREC,S2SPARREC,S2SFAMREC,S2SEMPREC,S2SFRIEND,S2SDOWELL,S2SASSIGNED,S2STCHTREAT,S2STCHINTRST,S2STCHEASY,S2STCHTHINK,S2STCHGIVEUP,S2SENJOYING,S2STEXTBOOK,S2SWASTE,S2SSKILLS,S2STESTS,S2SBORING,S2SASSEXCL,S2HSPLAN,S2SUBMITPLAN,S2SPERSON1,S2SPERSON2,S2SLEARN,S2SBORN,S2SUSELIFE,S2SUSECLG,S2SUSEJOB,S2SCLUB,S2SCOMPETE,S2SSUMMERPRG,S2SGROUP,S2STUTORED,ai_an,asian,black,hispanic,multiple_race,nh_pi,white,private,public,underrep,bio,chem,enviro,physics,engineering,compsci,misc_class,took_science_2012
1,2,8,1,1,2,0,8,1,1,4,1,0,14,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,-5,0,0,0,0,0,0,0,-5,0,0,-5,-5,9,-7,0,1,0,1,1,0,1,1,1,1,0,0,1,0,0,0,1,1,1,0,0,0,0,0,1,1,1,1,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,1,0,0,0,0,0,1
2,2,3,1,0,5,0,12,1,1,2,4,1,8,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,-5,0,0,0,0,0,0,0,-5,0,0,-5,-5,10,-7,0,0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,1
6,2,8,1,0,4,1,8,1,1,4,1,0,14,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-5,0,1,0,0,0,0,0,-5,0,0,-5,-5,23,-7,0,0,1,1,1,1,0,1,1,0,0,1,1,1,0,0,0,1,0,0,1,0,0,1,1,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,1
11,2,8,1,0,5,0,10,1,2,2,2,0,9,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,-5,0,0,0,0,0,0,0,-5,0,0,-5,-5,17,-7,1,1,1,0,0,0,1,1,1,0,0,1,1,1,0,1,1,1,1,0,0,1,0,0,1,0,0,0,1,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,1,0,0,1,0,0,0,1
14,2,8,1,0,5,0,8,0,1,1,3,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-5,0,0,0,0,0,0,0,-5,0,0,-5,-5,-7,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-7,-7,-7,-7,-7,1,1,0,1,1,0,1,0,0,1,1,1,0,1,1,1,0,1,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23479,2,8,1,0,5,0,8,1,1,4,3,0,7,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-5,0,0,0,0,0,0,0,-5,0,0,-5,-5,3,-7,0,0,0,0,0,0,1,1,1,1,0,0,0,1,0,1,1,1,1,1,0,0,0,0,0,1,1,1,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,0,0,0,0,0,0,1
23481,2,8,1,0,5,0,9,1,2,1,2,1,3,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,-5,0,0,0,0,0,0,0,-5,0,0,-5,-5,7,-7,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,1,1,1,1,1,0,0,1,1,0,1,1,0,0,0,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,1,0,0,0,0,0,0,1
23487,2,5,1,0,3,0,10,0,1,3,4,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-5,0,0,0,0,0,0,0,-5,0,0,-5,-5,-7,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-7,-7,-7,-7,-7,1,0,0,1,1,0,1,0,0,1,0,1,0,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,1
23490,2,5,3,0,4,0,13,1,1,1,2,0,21,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,-5,0,0,0,0,0,0,0,-5,0,0,-5,-5,16,-7,0,0,1,1,1,1,0,0,0,0,0,0,1,1,0,1,0,1,1,0,1,0,1,1,0,1,1,0,0,0,1,0,1,1,1,1,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,1


### Initial Model - ALL

In [22]:
X = modeling_df.drop(columns = 'target', axis =1)
y = modeling_df['target']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 20)

In [24]:
features = ['S2SCLUB', 'S2SCOMPETE', 'S2SSUMMERPRG', 'S2SGROUP', 'S2STUTORED', 'S2SPERSON1', 'S2SPERSON2', 
               'S2SLEARN', 'S2SBORN', 'S2SUSELIFE', 'S2SUSECLG', 'S2SUSEJOB', 'S2SENJOYING', 'S2SWASTE', 'S2SBORING', 
               'S2STESTS', 'S2STEXTBOOK', 'S2SSKILLS', 
               'S2SASSEXCL', 'S2SENJOYS', 'S2SCHALLENGE', 'S2SHSREQ', 'S2SCLGADM', 
               'S2SCLGSUCC', 'S2SCAREER', 'S2SCNSLREC', 'S2STCHRREC', 'S2SPARREC', 'S2SFAMREC', 
               'S2SEMPREC', 'S2SFRIEND', 'S2SDOWELL', 'S2SASSIGNED', 'X2STU30OCC_STEM1', 'took_science_2012', 
            'bio', 'chem', 'enviro', 'physics', 'engineering', 'compsci', 'misc_class', 'X2SEX']
                            

lr_fsm = LogisticRegression(max_iter = 10000, C = 10000, random_state = 20, class_weight = 'balanced')
lr_fsm.fit(X_train[features], y_train)

train_pred = lr_fsm.predict(X_train[features])
test_pred = lr_fsm.predict(X_test[features])



In [269]:
coef = dict(zip(list(features), list(lr_fsm.coef_[0])))
sorted_dict1 = sorted(coef.items(), key=lambda kv: kv[1])
sorted_dict1.reverse()
sorted_dict1

[('engineering', 1.0800034284625175),
 ('compsci', 0.749149005655807),
 ('S2SSUMMERPRG', 0.5114422317776263),
 ('X2STU30OCC_STEM1', 0.4439532751689377),
 ('physics', 0.4224062256983904),
 ('S2SPERSON1', 0.41292629090935923),
 ('S2SUSEJOB', 0.3982328953938054),
 ('S2SENJOYS', 0.32533157942213087),
 ('S2SCOMPETE', 0.30201176544514324),
 ('S2SCLUB', 0.27036958671467864),
 ('S2SCAREER', 0.264377261115631),
 ('chem', 0.26320455546142757),
 ('S2SPERSON2', 0.2447810591299161),
 ('S2STESTS', 0.15010864609096158),
 ('S2STEXTBOOK', 0.1330724505107946),
 ('S2SPARREC', 0.1169965979478818),
 ('S2SFRIEND', 0.11477585106806153),
 ('S2SCHALLENGE', 0.05961811928306865),
 ('S2SDOWELL', 0.048807840305977124),
 ('S2SUSELIFE', 0.03386371374909636),
 ('S2SUSECLG', 0.028864228356104322),
 ('S2SLEARN', 0.021286941255627474),
 ('enviro', 0.019332243869965137),
 ('S2SEMPREC', 0.01822512491178271),
 ('S2SBORING', 0.007862841167269263),
 ('S2SCLGSUCC', 0.003725728266928981),
 ('S2SBORN', -0.00027722538543625326),

In [270]:
metric_dict = {}
metric_dict['LogisticRegression'] = {'train_accuracy': metrics.accuracy_score(y_train, train_pred),
                                      'test_accuracy': metrics.accuracy_score(y_test, test_pred),
                                      'train_precision':metrics.precision_score(y_train, train_pred),
                                      'test_precision':metrics.precision_score(y_test, test_pred),
                                      'train_recall':metrics.recall_score(y_train, train_pred),
                                      'test_recall':metrics.recall_score(y_test, test_pred),
                                      'train_f1':metrics.f1_score(y_train, train_pred),
                                      'test_f1':metrics.f1_score(y_test, test_pred)}

### Initial - Underrep

In [272]:
X_ur = underrep_df.drop(columns = 'target', axis =1)
y_ur = underrep_df['target']

In [273]:
Xu_train, Xu_test, yu_train, yu_test = train_test_split(X_ur, y_ur, test_size = 0.2, random_state = 20)

In [284]:
lr_ur_fsm = LogisticRegression(max_iter = 10000, C = 10000, random_state = 20, class_weight = 'balanced')
lr_ur_fsm.fit(Xu_train[features], yu_train)

train_ur_pred = lr_ur_fsm.predict(Xu_train[features])
test_ur_pred = lr_ur_fsm.predict(Xu_test[features])


In [281]:
coef = dict(zip(list(features), list(lr_ur_fsm.coef_[0])))
sorted_dict1 = sorted(coef.items(), key=lambda kv: kv[1])
sorted_dict1.reverse()
sorted_dict1

[('engineering', 1.0069478063139476),
 ('compsci', 0.5925317094154284),
 ('S2SSUMMERPRG', 0.534649804103606),
 ('physics', 0.43336325195311043),
 ('S2SCLUB', 0.42591297871725736),
 ('X2STU30OCC_STEM1', 0.4150947394192837),
 ('S2SCAREER', 0.3731463020937605),
 ('S2SPERSON1', 0.3679358001415783),
 ('S2SPERSON2', 0.2888017421726653),
 ('S2SCOMPETE', 0.25570050057282634),
 ('S2SENJOYS', 0.23859686376098202),
 ('S2SUSELIFE', 0.23748736581418353),
 ('S2SUSEJOB', 0.1809791305812877),
 ('S2STEXTBOOK', 0.1668512834514574),
 ('S2SCHALLENGE', 0.15236417448424344),
 ('chem', 0.1478330708245279),
 ('S2STESTS', 0.09055178964727684),
 ('S2SCLGSUCC', 0.03273523212931524),
 ('S2SASSEXCL', 0.0318194047153499),
 ('S2SBORN', 0.031260714404630935),
 ('S2SPARREC', 0.02487793803996917),
 ('S2SDOWELL', 0.021624987706571955),
 ('S2SWASTE', 0.014501550508275224),
 ('S2SGROUP', 0.013854269936199792),
 ('enviro', 0.004538181305145458),
 ('S2SFRIEND', -0.005210993709941),
 ('S2SBORING', -0.007787029324933782),
 ('

In [282]:
metric_dict['URLogisticRegression'] = {'train_accuracy': metrics.accuracy_score(yu_train, train_ur_pred),
                                      'test_accuracy': metrics.accuracy_score(yu_test, test_ur_pred),
                                      'train_precision':metrics.precision_score(yu_train, train_ur_pred),
                                      'test_precision':metrics.precision_score(yu_test, test_ur_pred),
                                      'train_recall':metrics.recall_score(yu_train, train_ur_pred),
                                      'test_recall':metrics.recall_score(yu_test, test_ur_pred),
                                      'train_f1':metrics.f1_score(yu_train, train_ur_pred),
                                      'test_f1':metrics.f1_score(yu_test, test_ur_pred)}

### Overrep

In [286]:
overrep_df = modeling_df[modeling_df['underrep'] == 0]

X_or = overrep_df.drop(columns = 'target', axis =1)
y_or = overrep_df['target']

Xo_train, Xo_test, yo_train, yo_test = train_test_split(X_or, y_or, test_size = 0.2, random_state = 20)


In [287]:
lr_or_fsm = LogisticRegression(max_iter = 10000, C = 10000, random_state = 20, class_weight = 'balanced')
lr_or_fsm.fit(Xo_train[features], yo_train)

train_or_pred = lr_or_fsm.predict(Xo_train[features])
test_or_pred = lr_or_fsm.predict(Xo_test[features])


In [288]:
coef = dict(zip(list(features), list(lr_or_fsm.coef_[0])))
sorted_dict1 = sorted(coef.items(), key=lambda kv: kv[1])
sorted_dict1.reverse()
sorted_dict1

[('engineering', 0.8991879517434294),
 ('X2STU30OCC_STEM1', 0.7884251810784615),
 ('compsci', 0.7374946980860351),
 ('misc_class', 0.488614000410563),
 ('chem', 0.46008807743137237),
 ('physics', 0.4509259654525843),
 ('S2SUSEJOB', 0.42230533827532146),
 ('S2SCLUB', 0.40804835875837797),
 ('S2SSUMMERPRG', 0.3419779430595667),
 ('S2SCOMPETE', 0.3363554749664277),
 ('S2SCAREER', 0.30012483934301565),
 ('S2SEMPREC', 0.2993959742930527),
 ('S2SPERSON1', 0.26031753136331637),
 ('S2SDOWELL', 0.24095621854593047),
 ('S2SPERSON2', 0.22639342441071664),
 ('S2SENJOYS', 0.21655077610709317),
 ('S2SPARREC', 0.14805476642010468),
 ('enviro', 0.13977292897516425),
 ('S2SCHALLENGE', 0.1169655190176675),
 ('S2SUSECLG', 0.07975729210479916),
 ('S2SENJOYING', 0.07697818480098213),
 ('S2SLEARN', 0.06283512816380846),
 ('bio', 0.035903599414143055),
 ('S2STESTS', 0.02637598237083904),
 ('S2SHSREQ', 0.02482555396136357),
 ('S2SBORN', 0.01503685961630344),
 ('S2STCHRREC', 0.005633699730942853),
 ('S2SFAMREC

In [289]:
metric_dict['ORLogisticRegression'] = {'train_accuracy': metrics.accuracy_score(yo_train, train_or_pred),
                                      'test_accuracy': metrics.accuracy_score(yo_test, test_or_pred),
                                      'train_precision':metrics.precision_score(yo_train, train_or_pred),
                                      'test_precision':metrics.precision_score(yo_test, test_or_pred),
                                      'train_recall':metrics.recall_score(yo_train, train_or_pred),
                                      'test_recall':metrics.recall_score(yo_test, test_or_pred),
                                      'train_f1':metrics.f1_score(yo_train, train_or_pred),
                                      'test_f1':metrics.f1_score(yo_test, test_or_pred)}

{'LogisticRegression': {'train_accuracy': 0.6653377377833466,
  'test_accuracy': 0.675626423690205,
  'train_precision': 0.37809187279151946,
  'test_precision': 0.3892013498312711,
  'train_recall': 0.6815286624203821,
  'test_recall': 0.6718446601941748,
  'train_f1': 0.48636363636363633,
  'test_f1': 0.4928774928774928},
 'URLogisticRegression': {'train_accuracy': 0.6761429758935993,
  'test_accuracy': 0.660904255319149,
  'train_precision': 0.3169164882226981,
  'test_precision': 0.2895622895622896,
  'train_recall': 0.6770356816102471,
  'test_recall': 0.6615384615384615,
  'train_f1': 0.43173862310385064,
  'test_f1': 0.4028103044496487},
 'ORLogisticRegression': {'train_accuracy': 0.6903039073806078,
  'test_accuracy': 0.6845151953690304,
  'train_precision': 0.5498366013071896,
  'test_precision': 0.5121107266435986,
  'train_recall': 0.6881390593047034,
  'test_recall': 0.6577777777777778,
  'train_f1': 0.6112624886466849,
  'test_f1': 0.5758754863813229}}

### Decision Tree - Initial - ALL

In [292]:
dtc_all = DecisionTreeClassifier(random_state = 20)

param_dict={'max_depth': range(1,10),
            'criterion': ['gini', 'entropy'],
            'splitter': ['random', 'best'],
            'max_features': ['auto', 'sqrt', 'log2']}

grid_tree=GridSearchCV(dtc_all, 
                       param_dict, 
                       cv=10, 
                       scoring='f1', 
                       verbose=1, 
                       n_jobs=-1)

In [294]:
grid_tree.fit(X_train[features],y_train)

Fitting 10 folds for each of 108 candidates, totalling 1080 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 656 tasks      | elapsed:   12.2s
[Parallel(n_jobs=-1)]: Done 1080 out of 1080 | elapsed:   18.7s finished


GridSearchCV(cv=10, estimator=DecisionTreeClassifier(random_state=20),
             n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': range(1, 10),
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'splitter': ['random', 'best']},
             scoring='f1', verbose=1)

In [295]:

dtc_importance = grid_tree.best_estimator_.feature_importances_
feature_importance = dict(zip(features, dtc_importance))
sorted_dict = sorted(feature_importance.items(), key=lambda kv: kv[1])
sorted_dict.reverse()
sorted_dict

[('S2SCAREER', 0.18607920493411315),
 ('S2SPERSON1', 0.15185241822149154),
 ('X2STU30OCC_STEM1', 0.06365078111743029),
 ('S2SPERSON2', 0.05139012861186232),
 ('S2SCOMPETE', 0.044278056717184454),
 ('physics', 0.033428006675654806),
 ('S2SENJOYS', 0.0327790969921709),
 ('chem', 0.031698915579143734),
 ('S2SASSIGNED', 0.027536665335924353),
 ('S2SUSEJOB', 0.027388002937777194),
 ('engineering', 0.02550758473129363),
 ('S2STEXTBOOK', 0.017920804672568312),
 ('compsci', 0.017762995340022653),
 ('S2SSUMMERPRG', 0.017388783478810167),
 ('S2SUSELIFE', 0.016344809696645164),
 ('S2SCLGSUCC', 0.015581031976172562),
 ('S2SWASTE', 0.015428978652268934),
 ('S2SHSREQ', 0.014846819870572379),
 ('S2SBORN', 0.014446988998836442),
 ('S2SCHALLENGE', 0.01400070766522031),
 ('S2STCHRREC', 0.013606877128232386),
 ('S2STESTS', 0.012466371270407456),
 ('S2SDOWELL', 0.012321185143575723),
 ('S2SCNSLREC', 0.011208878956633273),
 ('S2SENJOYING', 0.010797663120588462),
 ('S2SPARREC', 0.010705909613990473),
 ('S2S

In [296]:

y_pred_dtc_test = grid_tree.best_estimator_.predict(X_test[features])
y_pred_dtc_train = grid_tree.best_estimator_.predict(X_train[features])

In [297]:
metric_dict['DecisionTree'] = {'train_accuracy': metrics.accuracy_score(y_train, y_pred_dtc_train),
                                  'test_accuracy': metrics.accuracy_score(y_test, y_pred_dtc_test),
                                  'train_precision':metrics.precision_score(y_train, y_pred_dtc_train),
                                  'test_precision':metrics.precision_score(y_test, y_pred_dtc_test),
                                  'train_recall':metrics.recall_score(y_train, y_pred_dtc_train),
                                  'test_recall':metrics.recall_score(y_test, y_pred_dtc_test),
                                  'train_f1':metrics.f1_score(y_train, y_pred_dtc_train),
                                  'test_f1':metrics.f1_score(y_test, y_pred_dtc_test)}


In [298]:
metric_dict

{'LogisticRegression': {'train_accuracy': 0.6653377377833466,
  'test_accuracy': 0.675626423690205,
  'train_precision': 0.37809187279151946,
  'test_precision': 0.3892013498312711,
  'train_recall': 0.6815286624203821,
  'test_recall': 0.6718446601941748,
  'train_f1': 0.48636363636363633,
  'test_f1': 0.4928774928774928},
 'URLogisticRegression': {'train_accuracy': 0.6761429758935993,
  'test_accuracy': 0.660904255319149,
  'train_precision': 0.3169164882226981,
  'test_precision': 0.2895622895622896,
  'train_recall': 0.6770356816102471,
  'test_recall': 0.6615384615384615,
  'train_f1': 0.43173862310385064,
  'test_f1': 0.4028103044496487},
 'ORLogisticRegression': {'train_accuracy': 0.6903039073806078,
  'test_accuracy': 0.6845151953690304,
  'train_precision': 0.5498366013071896,
  'test_precision': 0.5121107266435986,
  'train_recall': 0.6881390593047034,
  'test_recall': 0.6577777777777778,
  'train_f1': 0.6112624886466849,
  'test_f1': 0.5758754863813229},
 'DecisionTree': {'t