<a href="https://colab.research.google.com/github/aarsanjani/meansquares/blob/master/NY_FeatureRanking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Goal:
Use a Random Forest classifier to estimate impurity-based (Gini and entropy) feature importances for the features in the NY dataset.

Using Scikit-learn to compute PCA

## Mount Google drive

In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive so the CSV on the shared drive can be read by path.
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import pandas as pd

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
import numpy as np

## Load data

In [3]:
# Read the labeled NY dataset from the mounted shared drive.
# low_memory=False makes pandas parse the file in one pass instead of in chunks.
NY_labeled_data = pd.read_csv(
    '/content/drive/Shared drives/CMPE 295- Master Project/Covid19-data/NY_labeledData_Sep13.csv',
    low_memory=False,
)

# Report (rows, columns) as a quick check that the load succeeded.
print(NY_labeled_data.shape)

(9690, 19)


In [4]:
# List each column's dtype to see which columns are non-numeric (object).
NY_labeled_data.dtypes

Date                      object
fips_x                     int64
County Name               object
retail and recreation    float64
grocery and pharmacy     float64
parks                    float64
transit stations         float64
workplaces               float64
residential              float64
driving                  float64
m50                      float64
m50_index                  int64
population_density       float64
mask_rule_active           int64
mask_wearing_percent     float64
New cases                  int64
rolling_avg_new_cases    float64
label                     object
growth_label              object
dtype: object

## Preprocessing data 

* Drop rows that contain missing values (`dropna`)
* Encode the categorical `label` and `growth_label` columns as integer codes

In [5]:
# Encode the string `label` column as integer category codes and store
# the result in a new `label_code` column.
NY_labeled_data['label_code'] = (
    NY_labeled_data['label']
    .astype('category')
    .cat.codes
)

# Preview the first two rows to confirm the new column is present.
NY_labeled_data.head(n=2)

Unnamed: 0,Date,fips_x,County Name,retail and recreation,grocery and pharmacy,parks,transit stations,workplaces,residential,driving,m50,m50_index,population_density,mask_rule_active,mask_wearing_percent,New cases,rolling_avg_new_cases,label,growth_label,label_code
0,2020-03-01,36001,Albany County,10.0,13.0,19.0,4.0,7.0,-1.0,-4.87,2.616,54,584.364958,0,0.0,0,0.0,LessSpread,flat_growth,0
1,2020-03-01,36003,Allegany County,47.0,13.0,0.0,0.0,1.0,0.0,19.81,0.707,12,44.778541,0,0.0,0,0.0,LessSpread,flat_growth,0


In [13]:
# Drop every row that has a missing value in any column
# (axis=0 / how='any' are the dropna defaults, spelled out for clarity).
NY_labeled_data = NY_labeled_data.dropna(axis=0, how='any')

In [14]:
# Sanity check: show any rows whose `label` is still missing (expected: none).
NY_labeled_data[NY_labeled_data['label'].isna()]

Unnamed: 0,Date,fips_x,County Name,retail and recreation,grocery and pharmacy,parks,transit stations,workplaces,residential,driving,m50,m50_index,population_density,mask_rule_active,mask_wearing_percent,New cases,rolling_avg_new_cases,label,growth_label,label_code


In [15]:
# Show the distinct encoded label values present in the data.
NY_labeled_data['label_code'].unique()

array([0, 2, 1], dtype=int8)

In [16]:
# Encode the string `growth_label` column as integer category codes and
# store the result in a new `growth_code` column.
NY_labeled_data['growth_code'] = (
    NY_labeled_data['growth_label']
    .astype('category')
    .cat.codes
)

# Preview the first five rows to confirm the new column is present.
NY_labeled_data.head(n=5)

Unnamed: 0,Date,fips_x,County Name,retail and recreation,grocery and pharmacy,parks,transit stations,workplaces,residential,driving,m50,m50_index,population_density,mask_rule_active,mask_wearing_percent,New cases,rolling_avg_new_cases,label,growth_label,label_code,growth_code
0,2020-03-01,36001,Albany County,10.0,13.0,19.0,4.0,7.0,-1.0,-4.87,2.616,54,584.364958,0,0.0,0,0.0,LessSpread,flat_growth,0,2
1,2020-03-01,36003,Allegany County,47.0,13.0,0.0,0.0,1.0,0.0,19.81,0.707,12,44.778541,0,0.0,0,0.0,LessSpread,flat_growth,0,2
2,2020-03-01,36123,Yates County,31.0,33.0,0.0,0.0,7.0,0.0,28.14,5.004,72,73.676584,0,0.0,0,0.0,LessSpread,flat_growth,0,2
3,2020-03-01,36119,Westchester County,8.0,-1.0,12.0,1.0,0.0,0.0,-4.34,3.903,59,2247.400697,0,0.0,0,0.0,LessSpread,flat_growth,0,2
4,2020-03-01,36031,Essex County,16.0,11.0,17.0,0.0,1.0,0.0,60.18,12.6,197,20.557565,0,0.0,0,0.0,LessSpread,flat_growth,0,2


In [17]:
# Show the distinct encoded growth-label values present in the data.
NY_labeled_data['growth_code'].unique()

array([2, 1, 3, 4, 0], dtype=int8)

In [18]:
# Re-check the dtypes now that the `label_code` and `growth_code` columns were added.
NY_labeled_data.dtypes

Date                      object
fips_x                     int64
County Name               object
retail and recreation    float64
grocery and pharmacy     float64
parks                    float64
transit stations         float64
workplaces               float64
residential              float64
driving                  float64
m50                      float64
m50_index                  int64
population_density       float64
mask_rule_active           int64
mask_wearing_percent     float64
New cases                  int64
rolling_avg_new_cases    float64
label                     object
growth_label              object
label_code                  int8
growth_code                 int8
dtype: object

In [19]:
# Predictor columns passed to the classifier. Note that mask_wearing_percent,
# 'New cases' and rolling_avg_new_cases are not included.
feature_labels = [
    'retail and recreation',
    'grocery and pharmacy',
    'parks',
    'transit stations',
    'workplaces',
    'residential',
    'driving',
    'm50',
    'm50_index',
    'population_density',
    'mask_rule_active',
]

# Kept as a one-element list so any other cell that reads `target` still works.
target = ['label_code']

X = NY_labeled_data[feature_labels]

# Select the target as a 1-D Series. Indexing with the list `target` would
# produce an (n_samples, 1) DataFrame, and scikit-learn then raises a
# DataConversionWarning ("A column-vector y was passed when a 1d array was
# expected") every time fit() is called.
y = NY_labeled_data[target[0]]

# Hold out 40% of the rows for testing; random_state pins the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)


# Train Random Forest classifier

In [21]:
# Build a 10,000-tree random forest using the classifier's default split
# criterion, a fixed seed for reproducibility, and all available CPU cores.
clf = RandomForestClassifier(
    n_estimators=10000,
    random_state=0,
    n_jobs=-1,
)

# Fit the forest on the training split.
clf.fit(X_train, y_train)

# Print one (feature name, importance) tuple per feature.
for label, importance in zip(feature_labels, clf.feature_importances_):
    print((label, importance))

  """


('retail and recreation', 0.08533237907689088)
('grocery and pharmacy', 0.07432744335920005)
('parks', 0.08480298060160986)
('transit stations', 0.08796250851161055)
('workplaces', 0.09479738178990016)
('residential', 0.06285539885421815)
('driving', 0.13091202907639612)
('m50', 0.09735863140413215)
('m50_index', 0.07388504135209598)
('population_density', 0.16029826261037747)
('mask_rule_active', 0.04746794336356852)


In [22]:

# The split criterion can be "gini" (Gini impurity) or "entropy"
# (information gain); this run uses entropy, with the other settings
# (10,000 trees, fixed seed, all CPU cores) left as in the Gini run.
clf = RandomForestClassifier(
    criterion='entropy',
    n_estimators=10000,
    random_state=0,
    n_jobs=-1,
)

# Fit the forest on the training split.
clf.fit(X_train, y_train)

# Print one (feature name, impurity-based importance) tuple per feature.
for label, importance in zip(feature_labels, clf.feature_importances_):
    print((label, importance))

  import sys


('retail and recreation', 0.06739541253738059)
('grocery and pharmacy', 0.04860335482970266)
('parks', 0.051090256823234766)
('transit stations', 0.08608358869920833)
('workplaces', 0.06382473110426025)
('residential', 0.054494080882956875)
('driving', 0.16064985960050648)
('m50', 0.06833588120137825)
('m50_index', 0.058118905331399844)
('population_density', 0.20277183002695343)
('mask_rule_active', 0.13863209896301856)
