In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
%matplotlib inline
import plotly
import plotly.figure_factory as ff
from plotly.offline import *
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn import metrics
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_regression,mutual_info_regression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier



# Let wide survey frames render up to 300 columns instead of being elided.
pd.set_option("display.max_columns", 300)

In [3]:
# Columns describing the student and the school.
demo_features = ['X2SEX', 'X2RACE', 'X2DUALLANG', 'X2POVERTY185', 'X2SESQ5_U', 'X2CONTROL', 'X2LOCALE', 'X2REGION']

# Survey items used for the first models, plus the outcome columns at the end.
# Each name appears exactly once: the earlier version repeated S2SUSELIFE,
# S2SUSECLG, S2SUSEJOB and S2APSCIENCE, which would yield duplicated columns
# whenever this list is used to index a DataFrame.
mvp_features = [
    'X2STU30OCC_STEM1', 'X2STUEDEXPCT', 'X2S2SSPR12',
    'S2SPERSON1', 'S2SPERSON2', 'S2SLEARN', 'S2SBORN', 'S2SUSELIFE', 'S2SUSECLG', 'S2SUSEJOB',
    'S2SSPR12', 'S2LIFES12', 'S2BIO1S12', 'S2BIO2S12', 'S2APBIOS12', 'S2IBIOS12', 'S2ANATOMYS12', 'S2OTHBIOS12',
    'S2CHEM1S12', 'S2CHEM2S12', 'S2APCHEM12', 'S2IBCHEM12',
    'S2EARTHS12', 'S2APENVS12', 'S2OTHENVS12',
    'S2PHYSIC1S12', 'S2PHYSIC2S12', 'S2APPHYSIC12', 'S2IBPHYSIC12', 'S2PHYSS12', 'S2TECHS12', 'S2OTHPHYS12',
    'S2INTGS1S12', 'S2INTGS2S12', 'S2GENS12',
    'S2COMPAPP12', 'S2COMPPROG12', 'S2APCOMPSCI12', 'S2IBTECH12', 'S2OTHCOMP12', 'S2ENGINEER12',
    'S2OTHS12', 'S2OTHS12SP', 'S2HISCIENCE12', 'S2APSCIENCE', 'S2IBSCIENCE', 'S2STOOKBEFORE',
    'S2SENJOYS', 'S2SCHALLENGE', 'S2SHSREQ', 'S2SCLGADM', 'S2SCLGSUCC', 'S2SCAREER', 'S2SCNSLREC',
    'S2STCHRREC', 'S2SPARREC', 'S2SFAMREC', 'S2SEMPREC', 'S2SFRIEND', 'S2SDOWELL', 'S2SASSIGNED',
    'S2STCHTREAT', 'S2STCHINTRST', 'S2STCHEASY', 'S2STCHTHINK', 'S2STCHGIVEUP',
    'S2SENJOYING', 'S2SWASTE', 'S2SBORING', 'S2STESTS', 'S2STEXTBOOK', 'S2SSKILLS', 'S2SASSEXCL',
    'S2HSPLAN', 'S2SUBMITPLAN',
    'S2SCLUB', 'S2SCOMPETE', 'S2SSUMMERPRG', 'S2SGROUP', 'S2STUTORED',
    'X4RFDGMJ123', 'X4RFDGMJSTEM',
]

# Parent / family background columns.
# NOTE(review): this list is not part of cols_list, so these columns are never
# read from the CSV by the load cell -- confirm whether that is intentional.
family_features = ['X2PAR1EDU', 'X2PAR1OCC_STEM1',
            'X2PAR1RACE', 'X2PAR2EDU', 'X2PAR2OCC_STEM1', 'X2PAR2RACE', 'X2PARPATTERN',
            'X2MOMEDU', 'X2MOMOCC_STEM1', 'X2MOMRACE', 'X2DADEDU', 'X2DADOCC_STEM1',
            'X2DADRACE']

# Columns requested from the CSV. dict.fromkeys drops any repeated name while
# keeping first-seen order, so the list stays duplicate-free even if the
# feature lists above overlap in the future.
cols_list = list(dict.fromkeys(demo_features + mvp_features))



In [4]:
# Read only the columns listed in cols_list, then rename X4RFDGMJSTEM to
# `target` as part of the same expression (no in-place mutation).
df = pd.read_csv(
    'data-files/HSLS/hsls_17_student_pets_sr_v1_0.csv',
    usecols=cols_list,
).rename(columns={'X4RFDGMJSTEM': 'target'})

In [5]:
# Drop rows coded -8 on S2SLEARN. Per the original author's note these are
# non-responses that coincide with non-response on many other items (likely
# students who dropped out of the study).
# .copy() detaches the filtered frame from the original so the column
# assignments below do not raise SettingWithCopyWarning.
df = df[df['S2SLEARN'] != -8].copy()

# One 0/1 indicator column per X2RACE code; codes 4 and 5 both map to `hispanic`.
race_codes = {
    'ai_an': [1],
    'asian': [2],
    'black': [3],
    'hispanic': [4, 5],
    'multiple_race': [6],
    'nh_pi': [7],
    'white': [8],
}
for dummy_name, codes in race_codes.items():
    df[dummy_name] = df['X2RACE'].isin(codes).astype(int)


In [6]:
#'X2DUALLANG', 'X2POVERTY185', 'X2SESQ5_U', 'X2CONTROL', 'X2LOCALE', 'X2REGION'

# 0/1 indicators derived from X2CONTROL: code 2 -> private, code 1 -> public.
df['private'] = np.where(df['X2CONTROL'] == 2, 1, 0)
df['public'] = np.where(df['X2CONTROL'] == 1, 1, 0)

In [7]:
# Collapse the STEM sub-domain codes of X2STU30OCC_STEM1 into yes/no:
# -9 and 9 become 0; 4, 5 and 6 become 1; every other code is left as-is.
# The result is assigned back to the column: calling replace(inplace=True) on
# a column accessor is chained assignment, which warns in pandas 2.x and does
# not update `df` at all once Copy-on-Write is active.
df['X2STU30OCC_STEM1'] = df['X2STU30OCC_STEM1'].replace({-9: 0, 9: 0, 4: 1, 5: 1, 6: 1})


In [8]:
# Yes/no items for the science courses a student reported taking.
classes = ['S2SSPR12', 'S2LIFES12', 'S2BIO1S12', 'S2BIO2S12', 'S2APBIOS12',
               'S2IBIOS12', 'S2ANATOMYS12', 'S2OTHBIOS12', 'S2CHEM1S12', 'S2CHEM2S12', 'S2APCHEM12',
               'S2IBCHEM12', 'S2EARTHS12', 'S2APENVS12', 'S2OTHENVS12', 'S2PHYSIC1S12',
               'S2PHYSIC2S12', 'S2APPHYSIC12', 'S2IBPHYSIC12', 'S2PHYSS12', 'S2TECHS12', 'S2OTHPHYS12',
               'S2INTGS1S12', 'S2INTGS2S12', 'S2GENS12', 'S2COMPAPP12', 'S2COMPPROG12',
               'S2APCOMPSCI12', 'S2IBTECH12', 'S2OTHCOMP12', 'S2ENGINEER12', 'S2OTHS12', 'S2APSCIENCE', 'S2IBSCIENCE']

# Impute 'no' (0) for items that are missing (-9) or were skipped because the
# student was not taking a science class (-7).
# Done as one assignment over all columns: df[col].replace(..., inplace=True)
# is chained assignment and does not modify `df` under pandas Copy-on-Write.
df[classes] = df[classes].replace({-9: 0, -7: 0})


In [9]:
# Sanity check: after imputation S2SSPR12 should only contain 0 and 1.
df['S2SSPR12'].value_counts()

1    16192
0     4402
Name: S2SSPR12, dtype: int64

In [10]:
# Science activity participation items: treat unknown (-9) as 'no' (0).
clubs_cols = ['S2SCLUB', 'S2SCOMPETE', 'S2SSUMMERPRG', 'S2SGROUP', 'S2STUTORED']

# Assign the replaced values back; replace(inplace=True) on df[col] is chained
# assignment and is not guaranteed to write through to `df`.
df[clubs_cols] = df[clubs_cols].replace({-9: 0})

In [11]:
# Binarise the two plan items: code 1 stays 1, every other code becomes 0.
for plan_col in ('S2HSPLAN', 'S2SUBMITPLAN'):
    df[plan_col] = np.where(df[plan_col] == 1, 1, 0)

In [12]:
# Flag students belonging to a group under-represented in STEM.
# A student is flagged when X2SEX == 2 (women, per the original note) or when
# any of these race indicators is set: American Indian / Alaska Native, Black,
# Hispanic, multiple races, Native Hawaiian / Pacific Islander.
underrep_race_cols = ['ai_an', 'black', 'hispanic', 'multiple_race', 'nh_pi']
in_underrep_race = df[underrep_race_cols].eq(1).any(axis=1)
df['underrep'] = (df['X2SEX'].eq(2) | in_underrep_race).astype(int)
                         

In [13]:
# Group the individual HS science course items into broader subjects.
# Each subject column is 1 when the student answered 1 to at least one of the
# listed course items, otherwise 0.
# The earlier version tested S2EARTHS12 twice for `enviro` (copy-paste); each
# course is now listed once.
# NOTE(review): S2TECHS12, S2INTGS2S12 and S2OTHS12 are imputed with the other
# course items but belong to no group here (S2INTGS1S12 is in `misc_class`
# while S2INTGS2S12 is not) -- confirm whether they were left out on purpose.
subject_courses = {
    'bio': ['S2LIFES12', 'S2BIO1S12', 'S2BIO2S12', 'S2APBIOS12',
            'S2IBIOS12', 'S2ANATOMYS12', 'S2OTHBIOS12'],
    'chem': ['S2CHEM1S12', 'S2CHEM2S12', 'S2APCHEM12', 'S2IBCHEM12'],
    'enviro': ['S2EARTHS12', 'S2APENVS12', 'S2OTHENVS12'],
    'physics': ['S2PHYSIC1S12', 'S2PHYSIC2S12', 'S2APPHYSIC12',
                'S2IBPHYSIC12', 'S2PHYSS12'],
    'engineering': ['S2ENGINEER12'],
    'compsci': ['S2COMPAPP12', 'S2COMPPROG12', 'S2APCOMPSCI12',
                'S2IBTECH12', 'S2OTHCOMP12'],
    'misc_class': ['S2OTHPHYS12', 'S2INTGS1S12', 'S2GENS12'],
}

for subject, course_cols in subject_courses.items():
    df[subject] = df[course_cols].eq(1).any(axis=1).astype(int)


In [14]:
# Flag students with any recorded science course: either S2STOOKBEFORE == 1
# or at least one of the subject-group indicators is set.
# NOTE(review): the previous comment described this as "took science earlier
# in the year (but don't now)"; the expression does not exclude students who
# are currently taking science.
subject_flags = ['bio', 'chem', 'enviro', 'physics', 'engineering', 'compsci', 'misc_class']
took_any_science = df['S2STOOKBEFORE'].eq(1) | df[subject_flags].eq(1).any(axis=1)
df['took_science_2012'] = took_any_science.astype(int)


In [15]:
# Distribution of the derived science-participation flag.
df['took_science_2012'].value_counts()

1    18051
0     2543
Name: took_science_2012, dtype: int64

In [16]:
# Reasons-for-taking-science items.
why_science = ['S2SENJOYS', 'S2SCHALLENGE', 'S2SHSREQ', 'S2SCLGADM',
               'S2SCLGSUCC', 'S2SCAREER', 'S2SCNSLREC', 'S2STCHRREC', 'S2SPARREC', 'S2SFAMREC',
               'S2SEMPREC', 'S2SFRIEND', 'S2SDOWELL', 'S2SASSIGNED']

# Impute 'no' (0) for items that are missing (-9) or were skipped because the
# student was not taking a science class (-7).
# The result is assigned back instead of using replace(inplace=True) on
# df[col], which is chained assignment and a no-op under Copy-on-Write.
df[why_science] = df[why_science].replace({-9: 0, -7: 0})

In [17]:
# Keep only rows whose target is 0 or 1 (other codes are not usable as labels).
# .copy() makes this an independent frame: it is modified in later cells, and
# mutating a filtered slice raises SettingWithCopyWarning.
modeling_df = df[df['target'].isin([0, 1])].copy()



In [18]:
# Survey items answered on a four-point agreement scale.
likert_cols = ['S2SPERSON1', 'S2SPERSON2',
               'S2SLEARN', 'S2SBORN', 'S2SUSELIFE', 'S2SUSECLG', 'S2SUSEJOB',
                'S2STCHTREAT', 'S2STCHINTRST',
               'S2STCHEASY', 'S2STCHTHINK', 'S2STCHGIVEUP', 'S2SENJOYING', 'S2SWASTE', 'S2SBORING',
               'S2STESTS', 'S2STEXTBOOK', 'S2SSKILLS',
               'S2SASSEXCL']

# Collapse the likert answers to agree (1) / disagree (0): 2 -> 1, 3 and 4 -> 0,
# missing (-9) -> 0; a value of 1 is left unchanged.
# NOTE(review): other negative codes (e.g. -7) are not remapped here -- confirm
# none remain in these columns.
likert_to_binary = {2: 1, 3: 0, 4: 0, -9: 0}

# Work on an explicit copy and assign the result back. The previous
# modeling_df[col].replace(..., inplace=True) was chained assignment on a
# filtered slice: it warns in pandas 2.x and changes nothing under Copy-on-Write.
modeling_df = modeling_df.copy()
modeling_df[likert_cols] = modeling_df[likert_cols].replace(likert_to_binary)

In [19]:
# S2SSPR12 distribution within the modelling subset.
modeling_df['S2SSPR12'].value_counts()

1    9381
0    1593
Name: S2SSPR12, dtype: int64

In [20]:
# Share of students with / without science coursework, split by target and
# under-represented status.
(
    modeling_df
    .groupby(['target', 'underrep'])['took_science_2012']
    .value_counts(normalize=True)
)

target  underrep  took_science_2012
0       0         1                    0.913410
                  0                    0.086590
        1         1                    0.916964
                  0                    0.083036
1       0         1                    0.955943
                  0                    0.044057
        1         1                    0.945307
                  0                    0.054693
Name: took_science_2012, dtype: float64

In [21]:
# Raw counts of under-represented (1) vs. other (0) students.
modeling_df['underrep'].value_counts(normalize=False)

1    7519
0    3455
Name: underrep, dtype: int64

In [22]:
# Subset of the modelling data restricted to under-represented students.
underrep_df = modeling_df.loc[modeling_df['underrep'].eq(1)]


### Initial Model - ALL

In [23]:
# Predictors are every column except the label; the label is `target`.
y = modeling_df['target']
X = modeling_df.drop(columns=['target'])

In [24]:
# 80/20 hold-out split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=20,
)

In [48]:
# Predictor columns for the first models. The order is significant: the
# coefficient listings below pair coef_[0] with this list positionally.
features = [
    'S2SCLUB', 'S2SCOMPETE', 'S2SSUMMERPRG', 'S2SGROUP', 'S2STUTORED',
    'S2SPERSON1', 'S2SPERSON2', 'S2SLEARN', 'S2SBORN',
    'S2SUSELIFE', 'S2SUSECLG', 'S2SUSEJOB',
    'S2SENJOYING', 'S2SWASTE', 'S2SBORING',
    'S2STESTS', 'S2STEXTBOOK', 'S2SSKILLS',
    'S2STCHTREAT', 'S2STCHINTRST', 'S2STCHEASY', 'S2STCHTHINK', 'S2STCHGIVEUP',
    'S2SASSEXCL',
    'S2SENJOYS', 'S2SCHALLENGE', 'S2SHSREQ', 'S2SCLGADM', 'S2SCLGSUCC', 'S2SCAREER',
    'S2SCNSLREC', 'S2STCHRREC', 'S2SPARREC', 'S2SFAMREC', 'S2SEMPREC',
    'S2SFRIEND', 'S2SDOWELL', 'S2SASSIGNED',
    'X2STU30OCC_STEM1', 'took_science_2012',
    'bio', 'chem', 'enviro', 'physics', 'engineering', 'compsci', 'misc_class',
]

# First simple model on all students: logistic regression with a very large C
# (i.e. very weak regularisation) and class weights balanced by label frequency.
lr_fsm = LogisticRegression(
    max_iter=10000,
    C=10000,
    random_state=20,
    class_weight='balanced',
)
lr_fsm.fit(X_train[features], y_train)

# Hard class predictions for both partitions, used by the metrics cell.
train_pred = lr_fsm.predict(X_train[features])
test_pred = lr_fsm.predict(X_test[features])



In [49]:
# Pair each feature with its fitted coefficient and list them from the most
# positive to the most negative.
coef = {name: weight for name, weight in zip(features, lr_fsm.coef_[0])}
sorted_dict1 = sorted(coef.items(), key=lambda pair: pair[1])[::-1]
sorted_dict1

[('engineering', 1.0837355278210599),
 ('compsci', 0.7482357029549753),
 ('S2SSUMMERPRG', 0.5251251174592942),
 ('X2STU30OCC_STEM1', 0.4516679182066185),
 ('S2SPERSON1', 0.42287596752944834),
 ('S2SUSEJOB', 0.4011801423278924),
 ('physics', 0.3826818869006795),
 ('S2SENJOYS', 0.30402764501976187),
 ('S2SCOMPETE', 0.2998893066401716),
 ('S2SCLUB', 0.27504988483746823),
 ('S2SCAREER', 0.26521438710106393),
 ('S2SPERSON2', 0.24816565950869537),
 ('chem', 0.22496699262581288),
 ('S2STCHEASY', 0.1507073505380401),
 ('S2STESTS', 0.13430600617476843),
 ('S2STEXTBOOK', 0.12674969124986335),
 ('S2SFRIEND', 0.11662833830691577),
 ('S2SPARREC', 0.11384760284895439),
 ('S2SCHALLENGE', 0.051635948429232366),
 ('S2SUSECLG', 0.041451753445497695),
 ('S2SUSELIFE', 0.04050061195171857),
 ('S2SLEARN', 0.03068194730505634),
 ('S2SDOWELL', 0.02490551419777302),
 ('S2STCHTREAT', 0.024311450951337285),
 ('S2SEMPREC', 0.02114841668705439),
 ('S2SBORING', 0.012378980511768372),
 ('S2STCHTHINK', 0.007177255638

In [30]:
# Registry of evaluation scores, keyed by model name.
metric_dict = {}

# Accuracy, precision, recall and F1 on both partitions, stored under keys of
# the form '<split>_<metric>' (train first, then test, for each metric).
metric_dict['LogisticRegression'] = {
    f'{split}_{name}': getattr(metrics, f'{name}_score')(y_true, y_hat)
    for name in ('accuracy', 'precision', 'recall', 'f1')
    for split, y_true, y_hat in (('train', y_train, train_pred),
                                 ('test', y_test, test_pred))
}

In [28]:
# Display the scores collected so far.
# NOTE(review): this cell's execution count (28) is lower than that of the
# cell which fills metric_dict (30), so the saved output may be stale --
# re-run the notebook top to bottom before relying on these numbers.
metric_dict

{'LogisticRegression': {'train_accuracy': 0.6549720924934502,
  'test_accuracy': 0.6587699316628701,
  'train_precision': 0.36882634094530004,
  'test_precision': 0.37310195227765725,
  'train_recall': 0.6805487506124449,
  'test_recall': 0.6679611650485436,
  'train_f1': 0.47838815223006714,
  'test_f1': 0.47877522616562285}}

### Initial Model - Underrepresented Students

In [50]:
# Predictors / label for the under-represented subset.
y_ur = underrep_df['target']
X_ur = underrep_df.drop(columns=['target'])

In [51]:
# Same 80/20 seeded split as the all-students model, on the subset.
Xu_train, Xu_test, yu_train, yu_test = train_test_split(
    X_ur,
    y_ur,
    test_size=0.2,
    random_state=20,
)

In [52]:
# Logistic regression with the same settings as the all-students model,
# fitted on under-represented students only.
lr_ur_fsm = LogisticRegression(
    max_iter=10000,
    C=10000,
    random_state=20,
    class_weight='balanced',
)
lr_ur_fsm.fit(Xu_train[features], yu_train)

# Hard class predictions for both partitions.
train_ur_pred = lr_ur_fsm.predict(Xu_train[features])
test_ur_pred = lr_ur_fsm.predict(Xu_test[features])


In [53]:
# Coefficients of the under-represented model, most positive first.
coef = {name: weight for name, weight in zip(features, lr_ur_fsm.coef_[0])}
sorted_dict1 = sorted(coef.items(), key=lambda pair: pair[1])[::-1]
sorted_dict1

[('engineering', 1.0081761400306952),
 ('compsci', 0.5855631920010246),
 ('S2SSUMMERPRG', 0.5400349565467177),
 ('physics', 0.46697002998696513),
 ('S2SCLUB', 0.4269801910380986),
 ('X2STU30OCC_STEM1', 0.41052662868108736),
 ('S2SCAREER', 0.3783023919453503),
 ('S2SPERSON1', 0.3621059559457294),
 ('S2SPERSON2', 0.28727973894595016),
 ('S2SCOMPETE', 0.2505091232882305),
 ('S2SENJOYS', 0.2442752766198629),
 ('S2SUSELIFE', 0.23609994404322976),
 ('S2SUSEJOB', 0.1882965347763559),
 ('chem', 0.17775478059543903),
 ('S2STEXTBOOK', 0.16624368268472486),
 ('S2SCHALLENGE', 0.16413598932982518),
 ('S2STCHGIVEUP', 0.10207687689172361),
 ('S2STESTS', 0.08145009813289525),
 ('S2STCHEASY', 0.0468534815240275),
 ('S2SCLGSUCC', 0.03890712950751177),
 ('S2SDOWELL', 0.03583388436921967),
 ('S2SBORN', 0.029701418736544114),
 ('enviro', 0.02836626643143689),
 ('S2SASSEXCL', 0.027280977517892047),
 ('S2SPARREC', 0.02525063048057134),
 ('S2STCHTREAT', 0.014490153724937766),
 ('S2SWASTE', 0.01376140762536569

In [282]:
# Accuracy, precision, recall and F1 for the under-represented model, stored
# under keys of the form '<split>_<metric>' (train first, then test).
metric_dict['URLogisticRegression'] = {
    f'{split}_{name}': getattr(metrics, f'{name}_score')(y_true, y_hat)
    for name in ('accuracy', 'precision', 'recall', 'f1')
    for split, y_true, y_hat in (('train', yu_train, train_ur_pred),
                                 ('test', yu_test, test_ur_pred))
}

In [283]:
# Display the scores collected so far (all-students and under-represented models).
metric_dict

{'LogisticRegression': {'train_accuracy': 0.6653377377833466,
  'test_accuracy': 0.675626423690205,
  'train_precision': 0.37809187279151946,
  'test_precision': 0.3892013498312711,
  'train_recall': 0.6815286624203821,
  'test_recall': 0.6718446601941748,
  'train_f1': 0.48636363636363633,
  'test_f1': 0.4928774928774928},
 'URLogisticRegression': {'train_accuracy': 0.6761429758935993,
  'test_accuracy': 0.660904255319149,
  'train_precision': 0.3169164882226981,
  'test_precision': 0.2895622895622896,
  'train_recall': 0.6770356816102471,
  'test_recall': 0.6615384615384615,
  'train_f1': 0.43173862310385064,
  'test_f1': 0.4028103044496487}}

### Initial Model - Overrepresented Students

In [286]:
# Complement of the under-represented subset: rows with underrep == 0.
overrep_df = modeling_df.loc[modeling_df['underrep'].eq(0)]

# Predictors / label for this subset.
y_or = overrep_df['target']
X_or = overrep_df.drop(columns=['target'])

# Same 80/20 seeded split as the other models.
Xo_train, Xo_test, yo_train, yo_test = train_test_split(
    X_or,
    y_or,
    test_size=0.2,
    random_state=20,
)


In [287]:
# Logistic regression with the same settings as the other two models,
# fitted on the underrep == 0 subset only.
lr_or_fsm = LogisticRegression(
    max_iter=10000,
    C=10000,
    random_state=20,
    class_weight='balanced',
)
lr_or_fsm.fit(Xo_train[features], yo_train)

# Hard class predictions for both partitions.
train_or_pred = lr_or_fsm.predict(Xo_train[features])
test_or_pred = lr_or_fsm.predict(Xo_test[features])


In [288]:
# Coefficients of the underrep == 0 model, most positive first.
coef = {name: weight for name, weight in zip(features, lr_or_fsm.coef_[0])}
sorted_dict1 = sorted(coef.items(), key=lambda pair: pair[1])[::-1]
sorted_dict1

[('engineering', 0.8991879517434294),
 ('X2STU30OCC_STEM1', 0.7884251810784615),
 ('compsci', 0.7374946980860351),
 ('misc_class', 0.488614000410563),
 ('chem', 0.46008807743137237),
 ('physics', 0.4509259654525843),
 ('S2SUSEJOB', 0.42230533827532146),
 ('S2SCLUB', 0.40804835875837797),
 ('S2SSUMMERPRG', 0.3419779430595667),
 ('S2SCOMPETE', 0.3363554749664277),
 ('S2SCAREER', 0.30012483934301565),
 ('S2SEMPREC', 0.2993959742930527),
 ('S2SPERSON1', 0.26031753136331637),
 ('S2SDOWELL', 0.24095621854593047),
 ('S2SPERSON2', 0.22639342441071664),
 ('S2SENJOYS', 0.21655077610709317),
 ('S2SPARREC', 0.14805476642010468),
 ('enviro', 0.13977292897516425),
 ('S2SCHALLENGE', 0.1169655190176675),
 ('S2SUSECLG', 0.07975729210479916),
 ('S2SENJOYING', 0.07697818480098213),
 ('S2SLEARN', 0.06283512816380846),
 ('bio', 0.035903599414143055),
 ('S2STESTS', 0.02637598237083904),
 ('S2SHSREQ', 0.02482555396136357),
 ('S2SBORN', 0.01503685961630344),
 ('S2STCHRREC', 0.005633699730942853),
 ('S2SFAMREC

In [289]:
# Accuracy, precision, recall and F1 for the underrep == 0 model, stored
# under keys of the form '<split>_<metric>' (train first, then test).
metric_dict['ORLogisticRegression'] = {
    f'{split}_{name}': getattr(metrics, f'{name}_score')(y_true, y_hat)
    for name in ('accuracy', 'precision', 'recall', 'f1')
    for split, y_true, y_hat in (('train', yo_train, train_or_pred),
                                 ('test', yo_test, test_or_pred))
}

In [290]:
# Display the scores for all three logistic regression models side by side.
metric_dict

{'LogisticRegression': {'train_accuracy': 0.6653377377833466,
  'test_accuracy': 0.675626423690205,
  'train_precision': 0.37809187279151946,
  'test_precision': 0.3892013498312711,
  'train_recall': 0.6815286624203821,
  'test_recall': 0.6718446601941748,
  'train_f1': 0.48636363636363633,
  'test_f1': 0.4928774928774928},
 'URLogisticRegression': {'train_accuracy': 0.6761429758935993,
  'test_accuracy': 0.660904255319149,
  'train_precision': 0.3169164882226981,
  'test_precision': 0.2895622895622896,
  'train_recall': 0.6770356816102471,
  'test_recall': 0.6615384615384615,
  'train_f1': 0.43173862310385064,
  'test_f1': 0.4028103044496487},
 'ORLogisticRegression': {'train_accuracy': 0.6903039073806078,
  'test_accuracy': 0.6845151953690304,
  'train_precision': 0.5498366013071896,
  'test_precision': 0.5121107266435986,
  'train_recall': 0.6881390593047034,
  'test_recall': 0.6577777777777778,
  'train_f1': 0.6112624886466849,
  'test_f1': 0.5758754863813229}}

## Decision Tree - Initial

In [54]:
# Base estimator; the seed fixes the tie-breaking / random splitter behaviour.
dtc_all = DecisionTreeClassifier(random_state=20)

# Hyper-parameter grid for the tree.
# 'auto' is no longer listed under max_features: scikit-learn deprecated it in
# 1.1 and removed it in 1.3 (where it raises an invalid-parameter error). For
# classifiers it was an alias of 'sqrt', which is still searched, so the set
# of distinct models explored is unchanged.
param_dict = {
    'max_depth': range(1, 10),
    'criterion': ['gini', 'entropy'],
    'splitter': ['random', 'best'],
    'max_features': ['sqrt', 'log2'],
}

# Exhaustive search scored by F1 with 10-fold cross-validation, using all
# available cores.
grid_tree = GridSearchCV(
    dtc_all,
    param_dict,
    cv=10,
    scoring='f1',
    verbose=1,
    n_jobs=-1,
)

In [None]:
# Run the cross-validated grid search on the all-students training partition.
grid_tree.fit(X_train[features], y_train)