In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.metrics import plot_confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df =pd.read_csv('../../data/df2015_16.csv')

In [3]:
df.columns = df.columns.str.lower()
df.columns = df.columns.str.replace(' ', "_")
df.head()

Unnamed: 0,fips,vax_percentage,total_agriculture,total_construction,total_manufacturing,total_wholesale_trade,total_retail_trade,transportation_warehousing_utilities,total_information,total_finance_and_real_estate,...,total_votes,target,state,county_[2],population(2010),land_areami²,water_areami²,total_areami²,latitude,longitude
0,6001,0.962155,3308,39861,80961,20969,75187,39042,24187,47611,...,610764.0,0,CA,Alameda,1510271,739.017,82.311,821.328,37.64808,-121.9133
1,6005,0.870878,195,820,568,165,1884,525,188,523,...,16489.0,1,CA,Amador,38091,594.583,11.373,605.956,38.44355,-120.65385
2,6007,0.902701,3249,4682,5355,1625,12125,2596,1620,4508,...,86711.0,1,CA,Butte,220000,1636.464,40.667,1677.131,39.66595,-121.60191
3,6009,0.8849,496,1662,1326,373,1819,922,201,661,...,21455.0,1,CA,Calaveras,45578,1020.012,16.915,1036.927,38.18784,-120.55511
4,6011,0.997456,2425,356,958,164,620,463,45,375,...,6212.0,1,CA,Colusa,21419,1150.731,5.63,1156.361,39.17773,-122.23756


In [4]:
df.columns

Index(['fips', 'vax_percentage', 'total_agriculture', 'total_construction',
       'total_manufacturing', 'total_wholesale_trade', 'total_retail_trade',
       'transportation_warehousing_utilities', 'total_information',
       'total_finance_and_real_estate',
       'total_scientific_administrative_and_waste_management_services',
       'total_educational_healthcare_and_social_assistance',
       'total_arts_and_entertainment', 'total_other_services',
       'total_public_administration',
       'total_management_business_science_and_arts_occupations',
       'total_service_occupations', 'total_sales_and_office_occupations',
       'total_natural_resources_and_maintenance_occupations',
       'total_production,_transportation,_and_material_moving_occupations',
       'population_of_one_race_total', 'white', 'black_or_african_american',
       'american_indian_and_alaska_native', 'asian_alone',
       'native_hawaiian_and_other_pacific_islander',
       'population_of_one_race_some_oth

In [5]:
df.iloc[0, 29:44]

ed_no_schooling_completed    25753
ed_nursery_school              148
ed_kindergarten                253
ed_1st_grade                   768
ed_2nd_grade                  1936
ed_3rd_grade                  4217
ed_4th_grade                  3130
ed_5th_grade                  4937
ed_6th_grade                 21621
ed_7th_grade                  4847
ed_8th_grade                 10795
ed_9th_grade                 14180
e_10th_grade                 12027
ed_11th_grade                14729
ed_12th_grade_no_diploma     23248
Name: 0, dtype: object

In [6]:
row_sums = []
for row in df.index:
    row_sums.append(df.iloc[row, 29:44].sum())

In [7]:
df['sums'] = row_sums

In [8]:
df['percent_no_hs'] = df['sums']/df['ed_total']

In [9]:
df.head()

Unnamed: 0,fips,vax_percentage,total_agriculture,total_construction,total_manufacturing,total_wholesale_trade,total_retail_trade,transportation_warehousing_utilities,total_information,total_finance_and_real_estate,...,state,county_[2],population(2010),land_areami²,water_areami²,total_areami²,latitude,longitude,sums,percent_no_hs
0,6001,0.962155,3308,39861,80961,20969,75187,39042,24187,47611,...,CA,Alameda,1510271,739.017,82.311,821.328,37.64808,-121.9133,142589,0.130553
1,6005,0.870878,195,820,568,165,1884,525,188,523,...,CA,Amador,38091,594.583,11.373,605.956,38.44355,-120.65385,3337,0.116369
2,6007,0.902701,3249,4682,5355,1625,12125,2596,1620,4508,...,CA,Butte,220000,1636.464,40.667,1677.131,39.66595,-121.60191,17192,0.120145
3,6009,0.8849,496,1662,1326,373,1819,922,201,661,...,CA,Calaveras,45578,1020.012,16.915,1036.927,38.18784,-120.55511,2812,0.083569
4,6011,0.997456,2425,356,958,164,620,463,45,375,...,CA,Colusa,21419,1150.731,5.63,1156.361,39.17773,-122.23756,4231,0.318216


In [10]:
df['percent_no_hs']

0      0.130553
1      0.116369
2      0.120145
3      0.083569
4      0.318216
         ...   
474    0.192152
475    0.163808
476    0.212963
477    0.141601
478    0.467203
Name: percent_no_hs, Length: 479, dtype: float64

In [11]:
df.head()


Unnamed: 0,fips,vax_percentage,total_agriculture,total_construction,total_manufacturing,total_wholesale_trade,total_retail_trade,transportation_warehousing_utilities,total_information,total_finance_and_real_estate,...,state,county_[2],population(2010),land_areami²,water_areami²,total_areami²,latitude,longitude,sums,percent_no_hs
0,6001,0.962155,3308,39861,80961,20969,75187,39042,24187,47611,...,CA,Alameda,1510271,739.017,82.311,821.328,37.64808,-121.9133,142589,0.130553
1,6005,0.870878,195,820,568,165,1884,525,188,523,...,CA,Amador,38091,594.583,11.373,605.956,38.44355,-120.65385,3337,0.116369
2,6007,0.902701,3249,4682,5355,1625,12125,2596,1620,4508,...,CA,Butte,220000,1636.464,40.667,1677.131,39.66595,-121.60191,17192,0.120145
3,6009,0.8849,496,1662,1326,373,1819,922,201,661,...,CA,Calaveras,45578,1020.012,16.915,1036.927,38.18784,-120.55511,2812,0.083569
4,6011,0.997456,2425,356,958,164,620,463,45,375,...,CA,Colusa,21419,1150.731,5.63,1156.361,39.17773,-122.23756,4231,0.318216


In [12]:
 25753+148+253+768+1936+4217+3130+4937+21621+4847+10795+14180+12027+14729+23248

142589

In [13]:
df.iloc[0,:].to_list()

[6001,
 0.9621547064881564,
 3308,
 39861,
 80961,
 20969,
 75187,
 39042,
 24187,
 47611,
 133336,
 174656,
 70809,
 40358,
 27847,
 362591,
 126123,
 167052,
 50316,
 72050,
 1487408,
 708558,
 184883,
 9813,
 439055,
 13760,
 131339,
 37.1,
 1092189,
 25753,
 148,
 253,
 768,
 1936,
 4217,
 3130,
 4937,
 21621,
 4847,
 10795,
 14180,
 12027,
 14729,
 23248,
 34052.0,
 70100,
 848232,
 13753,
 68307,
 73763,
 1573351,
 2911,
 1584983,
 2016,
 'PRESIDENT',
 'CA',
 514842.0,
 95922.0,
 610764.0,
 0,
 'CA',
 'Alameda',
 1510271,
 739.0169999999998,
 82.311,
 821.328,
 37.64808,
 -121.9133,
 142589,
 0.13055341154324024]

In [14]:
df.columns

Index(['fips', 'vax_percentage', 'total_agriculture', 'total_construction',
       'total_manufacturing', 'total_wholesale_trade', 'total_retail_trade',
       'transportation_warehousing_utilities', 'total_information',
       'total_finance_and_real_estate',
       'total_scientific_administrative_and_waste_management_services',
       'total_educational_healthcare_and_social_assistance',
       'total_arts_and_entertainment', 'total_other_services',
       'total_public_administration',
       'total_management_business_science_and_arts_occupations',
       'total_service_occupations', 'total_sales_and_office_occupations',
       'total_natural_resources_and_maintenance_occupations',
       'total_production,_transportation,_and_material_moving_occupations',
       'population_of_one_race_total', 'white', 'black_or_african_american',
       'american_indian_and_alaska_native', 'asian_alone',
       'native_hawaiian_and_other_pacific_islander',
       'population_of_one_race_some_oth

In [15]:
#df = df.drop(['ed_total', 'ed_no_schooling_completed', 'ed_nursery_school','ed_kindergarten', 'ed_1st_grade', 'ed_2nd_grade', 'ed_3rd_grade','ed_4th_grade', 'ed_5th_grade', 
#'ed_6th_grade', 'ed_7th_grade','ed_8th_grade', 'ed_9th_grade', 'e_10th_grade', 'ed_11th_grade',
#'ed_12th_grade_no_diploma', 'sums', 'fips'], axis=1)

In [16]:
#df = df.drop(['office','state','dem_votes','rep_votes', 'year','latitude','longitude'], axis=1)

In [17]:
'ed_nursery_school, 
'ed_kindergarten,
'ed_1st_grade, 
'ed_2nd_grade,
'ed_3rd_grade,
'ed_4th_grade, 
'ed_5th_grade,
'ed_6th_grade, 
'ed_7th_grade,
'ed_8th_grade, 
'ed_9th_grade,  
'e_10th_grade, 
'ed_11th_grade, 
'ed_12th_grade_no_diploma, 

SyntaxError: EOL while scanning string literal (<ipython-input-17-405ba9e03b3b>, line 1)

In [18]:
df['unemp_rate'] = df['unemployed_pop_16_years_and_over']/df['in_civilian_labor_force']

In [19]:
insurance_sums = []
for row in df.index:
    insurance_sums.append(df.iloc[row, 47:51].sum())

In [20]:
df['insurance_sums'] = insurance_sums

In [21]:
df['uninsured_rate'] = df['insurance_sums']/df['health_insurance_total']

In [22]:
population_of_one_race_total', 'white', 'black_or_african_american',
       'american_indian_and_alaska_native', 'asian_alone',
       'native_hawaiian_and_other_pacific_islander',

SyntaxError: invalid syntax (<ipython-input-22-0199ba716a2f>, line 1)

In [23]:
df['percent_white'] = df['white']/df['population_of_one_race_total']

In [24]:
df['percent_black'] = df['black_or_african_american']/df['population_of_one_race_total']

In [25]:
df['percent_american_indian_and_alaska_native'] = df['american_indian_and_alaska_native']/df['population_of_one_race_total']

In [26]:
df['percent_pacific_islander'] = df['native_hawaiian_and_other_pacific_islander']/df['population_of_one_race_total']

In [27]:
df.columns

Index(['fips', 'vax_percentage', 'total_agriculture', 'total_construction',
       'total_manufacturing', 'total_wholesale_trade', 'total_retail_trade',
       'transportation_warehousing_utilities', 'total_information',
       'total_finance_and_real_estate',
       'total_scientific_administrative_and_waste_management_services',
       'total_educational_healthcare_and_social_assistance',
       'total_arts_and_entertainment', 'total_other_services',
       'total_public_administration',
       'total_management_business_science_and_arts_occupations',
       'total_service_occupations', 'total_sales_and_office_occupations',
       'total_natural_resources_and_maintenance_occupations',
       'total_production,_transportation,_and_material_moving_occupations',
       'population_of_one_race_total', 'white', 'black_or_african_american',
       'american_indian_and_alaska_native', 'asian_alone',
       'native_hawaiian_and_other_pacific_islander',
       'population_of_one_race_some_oth

In [28]:
df = df.drop(['unemployed_pop_16_years_and_over','in_civilian_labor_force','ed_total', 'ed_no_schooling_completed', 'ed_nursery_school','ed_kindergarten', 'ed_1st_grade', 
'ed_2nd_grade', 'ed_3rd_grade','ed_4th_grade', 'ed_5th_grade'],axis=1)


In [32]:
df = df.drop(['ed_6th_grade', 'ed_7th_grade','ed_8th_grade', 'ed_9th_grade', 'e_10th_grade', 'ed_11th_grade'],axis=1)

In [34]:
df.columns

Index(['fips', 'vax_percentage', 'total_agriculture', 'total_construction',
       'total_manufacturing', 'total_wholesale_trade', 'total_retail_trade',
       'transportation_warehousing_utilities', 'total_information',
       'total_finance_and_real_estate',
       'total_scientific_administrative_and_waste_management_services',
       'total_educational_healthcare_and_social_assistance',
       'total_arts_and_entertainment', 'total_other_services',
       'total_public_administration',
       'total_management_business_science_and_arts_occupations',
       'total_service_occupations', 'total_sales_and_office_occupations',
       'total_natural_resources_and_maintenance_occupations',
       'total_production,_transportation,_and_material_moving_occupations',
       'population_of_one_race_total', 'white', 'black_or_african_american',
       'american_indian_and_alaska_native', 'asian_alone',
       'native_hawaiian_and_other_pacific_islander',
       'population_of_one_race_some_oth

In [36]:
df = df.drop(['year','latitude','longitude','population_of_one_race_total', 'white', 'black_or_african_american',
'american_indian_and_alaska_native', 'asian_alone','native_hawaiian_and_other_pacific_islander',
'population_of_one_race_some_other_race'], axis=1)

In [37]:
df.columns

Index(['fips', 'vax_percentage', 'total_agriculture', 'total_construction',
       'total_manufacturing', 'total_wholesale_trade', 'total_retail_trade',
       'transportation_warehousing_utilities', 'total_information',
       'total_finance_and_real_estate',
       'total_scientific_administrative_and_waste_management_services',
       'total_educational_healthcare_and_social_assistance',
       'total_arts_and_entertainment', 'total_other_services',
       'total_public_administration',
       'total_management_business_science_and_arts_occupations',
       'total_service_occupations', 'total_sales_and_office_occupations',
       'total_natural_resources_and_maintenance_occupations',
       'total_production,_transportation,_and_material_moving_occupations',
       'median_age:_total', 'ed_12th_grade_no_diploma', 'median_income',
       'under_19_no_health_insurance_coverage',
       '19_to_34_years_no_health_insurance_coverage',
       '35_to_64_years_no_health_insurance_coverage',

In [39]:
df = df.drop(['fips'], axis=1)

In [40]:
df = df.drop(['ed_12th_grade_no_diploma','sums'], axis=1)

In [43]:
df =df.drop(['insurance_sums','office','state'],axis=1)

In [44]:
df.columns

Index(['vax_percentage', 'total_agriculture', 'total_construction',
       'total_manufacturing', 'total_wholesale_trade', 'total_retail_trade',
       'transportation_warehousing_utilities', 'total_information',
       'total_finance_and_real_estate',
       'total_scientific_administrative_and_waste_management_services',
       'total_educational_healthcare_and_social_assistance',
       'total_arts_and_entertainment', 'total_other_services',
       'total_public_administration',
       'total_management_business_science_and_arts_occupations',
       'total_service_occupations', 'total_sales_and_office_occupations',
       'total_natural_resources_and_maintenance_occupations',
       'total_production,_transportation,_and_material_moving_occupations',
       'median_age:_total', 'median_income',
       'under_19_no_health_insurance_coverage',
       '19_to_34_years_no_health_insurance_coverage',
       '35_to_64_years_no_health_insurance_coverage', 'health_insurance_total',
       '6

In [46]:
df.to_csv('../../data/df1516features.csv')

In [45]:
df.target

0      0
1      1
2      1
3      1
4      1
      ..
474    1
475    1
476    1
477    1
478    0
Name: target, Length: 479, dtype: int64

In [None]:
dfnum.corr()

In [None]:
sns.heatmap(dfnum.corr())

In [None]:
# Increase the size of the heatmap.
plt.figure(figsize=(16, 6))
# Store heatmap object in a variable to easily access it when you want to include more features (such as title).
# Set the range of values to be displayed on the colormap from -1 to 1, and set the annotation to True to display the correlation values on the heatmap.
heatmap = sns.heatmap(dfbest.corr(), vmin=-1, vmax=1, annot=True)
# Give a title to the heatmap. Pad defines the distance of the title from the top of the heatmap.
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':12}, pad=12);

In [None]:
feature_list = df.columns

In [None]:
df.columns

In [None]:
dfbest= df[['vax_percentage', 'total_agriculture', 'total_construction',
       'total_manufacturing',
       'total_service_occupations', 'total_sales_and_office_occupations',
       'total_natural_resources_and_maintenance_occupations',
       'total_production,_transportation,_and_material_moving_occupations',
       'population_of_one_race_total','median_income',
       'unemployed_pop_16_years_and_over', 'in_civilian_labor_force',
       'under_19_no_health_insurance_coverage',
       '19_to_34_years_no_health_insurance_coverage',
       '35_to_64_years_no_health_insurance_coverage', 'health_insurance_total',
       '65_years_and_over_no_health_insurance_coverage',
       'total_population_estimate',
     'total_votes', 'target',
       'population(2010)', 'land_areami²', 'water_areami²', 'total_areami²',
       'latitude', 'longitude' ]]

In [None]:
# Increase the size of the heatmap.
plt.figure(figsize=(16, 6))
# Store heatmap object in a variable to easily access it when you want to include more features (such as title).
# Set the range of values to be displayed on the colormap from -1 to 1, and set the annotation to True to display the correlation values on the heatmap.
heatmap = sns.heatmap(dfbest.corr(), vmin=-1, vmax=1, annot=True)
# Give a title to the heatmap. Pad defines the distance of the title from the top of the heatmap.
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':12}, pad=12);

In [None]:
dfnum = df.select_dtypes(exclude=['object'])

In [None]:
#'ed_nursery_school','ed_kindergarten','ed_1st_grade','ed_2nd_grade','ed_3rd_grade','ed_4th_grade','ed_5th_grade'        

In [None]:
X = dfnum.drop('target', axis=1)
y = df['target']

In [None]:
#first train-test-split
X_t, X_test, y_t, y_test = train_test_split(X,y, random_state=42)

In [None]:
#second train-test-split for cross val
X_train, X_val, y_train, y_val = train_test_split(X_t,y_t, random_state=42)

In [None]:
#Handling missing data using 
imputer = SimpleImputer()

In [None]:
#fitting on training data
imputer.fit(X_train)
#transform training and validation data
X_train_imputed = imputer.transform(X_train)
X_val_imputed = imputer.transform(X_val)

In [None]:
#turned imputed data into dataframes
X_train_imputed_df = pd.DataFrame(X_train_imputed, columns=X_train.columns)
X_val_imputed_df = pd.DataFrame(X_val_imputed, columns=X_val.columns)

In [None]:
#scaling data
scaler = StandardScaler()

In [None]:
scaler.fit(X_train_imputed_df)

In [None]:
#transform train and validation data 
X_train_scaled = scaler.transform(X_train_imputed_df)
X_val_scaled = scaler.transform(X_val_imputed_df)

In [None]:
#create dataframes of scaled train and val data
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_val_scaled_df = pd.DataFrame(X_val_scaled, columns=X_val.columns)

In [None]:
#sanity check
X_train_scaled_df.head()

In [None]:
#sanity check again
X_val_scaled_df.head()

In [None]:
#create a function to build the various models needed in the remainder of this notebook

def modeling_function(model, X_train, y_train, X_val, y_val):
    #fit model to training data
    model.fit(X_train_scaled_df, y_train)
    
    #make predictions on train and validation
    train_preds = model.predict(X_train_scaled_df)
    val_preds = model.predict(X_val_scaled_df)
    
    #output metrics
    print(model, ': Training Recall', recall_score(y_train, train_preds))
    print(model, ': Validation Recall', recall_score(y_val, val_preds))
    print('Training Accuracy', accuracy_score(y_train, train_preds))
    print('Validation Accuracy', accuracy_score(y_val, val_preds))
    print('Training Precicion', precision_score(y_train, train_preds))
    print('Validation Precision', precision_score(y_val, val_preds))
    print('Training F1', f1_score(y_train, train_preds))
    print('Validation F1', f1_score(y_val, val_preds))
    
    #return fitted model
    return model

In [None]:
#instantiate log reg
logreg = LogisticRegression(solver ='liblinear')

In [None]:
#run logreg through function to fit and model
logreg_fitted = modeling_function(logreg, X_train_scaled_df, y_train, X_val_scaled_df, y_val)

In [None]:
logreg_coefs = pd.DataFrame(np.abs(logreg.coef_.T), columns=['coefficient'], index=X_train_scaled_df.columns)
logreg_coefs.sort_values(by='coefficient')

In [None]:
plot_confusion_matrix(logreg_fitted, X_train_scaled_df, y_train)

In [None]:
plot_confusion_matrix(logreg_fitted, X_val_scaled_df, y_val)

In [None]:
logreg2 = LogisticRegression(solver='liblinear', penalty='l1', C=.5)

In [None]:
logreg2_fitted = modeling_function(logreg2, X_train_scaled_df, y_train, X_val_scaled_df, y_val)

In [None]:
logreg2_coefs = pd.DataFrame(np.abs(logreg2.coef_.T), columns=['coefficient'], index=X_train_scaled_df.columns)
logreg2_coefs.sort_values(by='coefficient')

In [None]:
dtc = DecisionTreeClassifier(max_depth=5, max_leaf_nodes=2)

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()

In [None]:
rfc_fitted = modeling_function(rfc, X_train_scaled_df, y_train, X_val_scaled_df, y_val)

In [None]:
dtc_fitted = modeling_function(dtc, X_train_scaled_df, y_train, X_val_scaled_df, y_val)

In [None]:
plot_confusion_matrix(dtc_fitted, X_train_scaled_df, y_train)

In [None]:
plot_confusion_matrix(dtc_fitted, X_val_scaled_df, y_val)

In [None]:
#create param_grid dictionary
param_grid = {'criterion': ['gini','entropy'],
                'max_leaf_nodes': [2,3,4,5,6,7,8,9,10,11,12,13,14],
                'max_depth': [2,3,4,5,6,7,8,9,10]}

In [None]:
dtc_gs = GridSearchCV(estimator=dtc, param_grid=param_grid, scoring='recall')

In [None]:
dtc_gs.fit(X_train_scaled_df, y_train)

In [None]:
best_model = dtc_gs.best_estimator_
best_model

In [None]:
X_test_imputed = imputer.transform(X_test)

In [None]:
X_test_scaled = scaler.transform(X_test_imputed)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn = KNeighborsClassifier()

In [None]:
fitted_knn = modeling_function(knn, X_train_scaled_df, y_train, X_val_scaled_df, y_val)

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
scaled_pipeline_1 = Pipeline([('si', SimpleImputer()),
                              ('ss', StandardScaler()), 
                              ('knn', KNeighborsClassifier())])

In [None]:
scaled_pipeline_1.fit(X_train, y_train)

In [None]:
scaled_pipeline_1.score(X_val, y_val)

In [None]:
scaled_pipeline_1.score(X_test, y_test)

In [None]:
scaled_pipeline_2 = Pipeline([('si', SimpleImputer()),
                              ('ss', StandardScaler()), 
                              ('logreg', LogisticRegression())])

In [None]:
scaled_pipeline_2.fit(X_train, y_train)

In [None]:
scaled_pipeline_2.score(X_val, y_val)

In [None]:
scaled_pipeline_1.score(X_test, y_test)

In [None]:
scaled_pipeline_3 = Pipeline([('si', SimpleImputer()),
                              ('ss', StandardScaler()), 
                              ('dtc', DecisionTreeClassifier())])

In [None]:
scaled_pipeline_3.fit(X_train, y_train)

In [None]:
scaled_pipeline_3.score(X_val, y_val)

In [None]:
scaled_pipeline_3.score(X_test, y_test)

In [None]:
feature_list = X_train_scaled_df.columns

In [None]:
from sklearn.feature_selection import RFE
scores = []
for num in range(1, X_train_scaled.shape[1]):
    selector = RFE(logreg, n_features_to_select=num, step=1)
    selector = selector.fit(X_train_scaled, y_train)
    score = selector.score(X_val_scaled, y_val)
    scores.append(score)
    selector = RFE(logreg, n_features_to_select=1, step=1)
    selector = selector.fit(X_train_scaled, y_train)
    score = selector.score(X_val_scaled, y_val)
    feature_importance = dict(zip(feature_list,selector.ranking_))

In [None]:
feature_importance

In [None]:
dtc_best_model = dtc_gs.best_estimator_
dtc_best_model

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
# Build a classification task using 3 informative features
X, y = make_classification(n_samples=1000,
                           n_features=10,
                           n_informative=3,
                           n_redundant=0,
                           n_repeated=0,
                           n_classes=2,
                           random_state=0,
                           shuffle=False)


rfc = RandomForestClassifier(n_jobs=-1,max_features= 'sqrt' ,n_estimators=50, oob_score = True) 

param_grid = { 
    'n_estimators': [200, 700],
    'max_features': ['auto', 'sqrt', 'log2']
}

CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(X_train_scaled, y_train)
print (CV_rfc.best_params_)

In [None]:
rfcbest = RandomForestClassifier(max_features= 'auto', n_estimators=200)

In [None]:
 rfc_fitted = modeling_function(logreg, X_train_scaled_df, y_train, X_val_scaled_df, y_val)