<a href="https://colab.research.google.com/github/aarsanjani/meansquares/blob/master/version2/feature_engineering/CA_FeatureRanking_Gini_SupersetData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Goal:
Using Random Forest to estimate the gini index and feature importance for all features from CA **Superset** dataset.
Details: 

>* **Gini-Coefficient:** Used the code from our machine learning. Uses CA combined data and Maryland data for gini coefficient score
>* **Random Forest (Gini Score):** Random classifier needs a target value. Hence, *CA-CDC labeled data* is used to get feature importance score. **Target: Growth_label_Code**
>* **Extra Tree Classifier (Gini Score):** ET classifier needs a target value. Hence *CA-CDC labeled data* is used to get feature importance score. **Target: Growth_label_Code**



## Mount Google drive

In [25]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler as ss
import itertools
from scipy import linalg
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
import numpy as np
import matplotlib.pyplot as plt

# 1.Load data - Combined CA Data

## 1.1Preprocessed Data

In [51]:
CA_superset_data = pd.read_csv('/content/drive/Shared drives/CMPE 295- Master Project/projectdata-2021/CA_Superset_dataset_Feb12.csv',low_memory=False)
CA_superset_data.head()

Unnamed: 0,Date,Province_State,FIPS,New cases,County Name,mask_rule_severity,m50,m50_index,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline,PopDensity,State_Name,State,CTFIPS,CTNAME,STFIPS,Social.distancing.index,X..staying.home,Trips.person,X..out.of.county.trips,X..out.of.state.trips,Miles.person,Work.trips.person,Non.work.trips.person,New.COVID.cases,Population,X..change.in.consumption,Transit.mode.share,X..people.older.than.60,Median.income,X..African.Americans,X..Hispanic.Americans,X..Male,Employment.density,X..hot.spots.1000.people,Hospital.beds.1000.people,ICUs.1000.people,X..contact.tracing.workers.1000.people,New.cases.1000.people,Active.cases.1000.people,X.days..decreasing.COVID.cases,X..hospital.bed.utilization,Testing.capacity,Tests.done.1000.people,X..ICU.utilization,Ventilator.shortage,Imported.COVID.cases,COVID.exposure.1000.people,X.days..decreasing.ILI.cases,Unemployment.claims.1000.people,Unemployment.rate,X..working.from.home,Cumulative.inflation.rate,COVID.death.rate
0,2020-03-01,California,6001,0,Alameda County,0.0,3.684,58,6.0,9.0,15.0,1.0,4.0,0.0,860.757075,California,CA,6001,Alameda County,6,37,26,2.93,25.4,0.4,36.8,0.25,2.68,0,1666753,-5.1,15.04,19,92574,10.5,22.5,49.11,1155,131,2.05,0.22,,0.0,0.0,37,54.0,0.0,0.0,0.0,0,0,0.05,35,1.1,5.3,5.7,0.6,0.0
1,2020-03-02,California,6001,0,Alameda County,0.0,6.182,98,4.0,7.0,7.0,-1.0,3.0,0.0,860.757075,California,CA,6001,Alameda County,6,17,20,3.38,28.2,0.4,38.1,0.63,2.74,0,1666753,-2.9,15.04,19,92574,10.5,22.5,49.11,1155,131,2.05,0.22,,0.0,0.0,38,54.0,0.0,0.0,0.0,0,0,0.05,35,1.1,5.3,5.7,0.6,0.0
2,2020-03-03,California,6001,1,Alameda County,0.0,6.271,100,5.0,12.0,41.0,0.0,1.0,-1.0,860.757075,California,CA,6001,Alameda County,6,16,18,3.61,27.1,0.3,32.7,0.67,2.94,1,1666753,4.1,15.04,19,92574,10.5,22.5,49.11,1155,131,2.05,0.22,,0.0002,0.0,0,54.0,0.0,0.0,0.0,0,1,0.05,42,1.1,5.3,5.7,0.6,0.0
3,2020-03-04,California,6001,0,Alameda County,0.0,6.209,99,1.0,6.0,23.0,-3.0,1.0,0.0,860.757075,California,CA,6001,Alameda County,6,14,18,3.55,27.9,0.3,36.9,0.68,2.87,0,1666753,1.7,15.04,19,92574,10.5,22.5,49.11,1155,131,2.05,0.22,,0.0002,0.0,0,54.0,10.3,0.01,0.0,0,1,0.06,42,1.1,5.3,5.7,0.6,0.04
4,2020-03-05,California,6001,0,Alameda County,0.0,6.654,106,1.0,6.0,12.0,-5.0,-1.0,1.0,860.757075,California,CA,6001,Alameda County,6,15,19,3.54,28.1,0.3,36.8,0.66,2.88,0,1666753,2.0,15.04,19,92574,10.5,22.5,49.11,1155,131,2.05,0.22,,0.0002,0.0,0,54.0,10.3,0.01,0.0,0,1,0.06,42,1.1,5.3,5.7,0.6,0.04


In [53]:
CA_superset_data.shape

(15391, 58)

In [54]:
CA_superset_data.dtypes

Date                                                   object
Province_State                                         object
FIPS                                                    int64
New cases                                               int64
County Name                                            object
mask_rule_severity                                    float64
m50                                                   float64
m50_index                                               int64
retail_and_recreation_percent_change_from_baseline    float64
grocery_and_pharmacy_percent_change_from_baseline     float64
parks_percent_change_from_baseline                    float64
transit_stations_percent_change_from_baseline         float64
workplaces_percent_change_from_baseline               float64
residential_percent_change_from_baseline              float64
PopDensity                                            float64
State_Name                                             object
State   

## 1.2Labeled  Data

In [None]:
CA_labeled_data = pd.read_csv('/content/drive/Shared drives/CMPE 295- Master Project/projectdata-2021/CA_CombinedData_Labeled_CDCGuideline_Jan15.csv',low_memory=False)
CA_labeled_data.head()

Unnamed: 0,Date,Province_State,FIPS,County Name,population,mask_rule_active,m50,m50_index,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline,New Cases/100k population,rolling_avg_new_cases/100k,New cases,growth_label
0,2020-03-01,California,6001,Alameda County,1671329,0,3.684,58,6.0,9.0,15.0,1.0,4.0,0.0,0.0,0.0,0.0,minimal
1,2020-03-02,California,6001,Alameda County,1671329,0,6.182,98,4.0,7.0,7.0,-1.0,3.0,0.0,0.0,0.0,0.0,minimal
2,2020-03-03,California,6001,Alameda County,1671329,0,6.271,100,5.0,12.0,41.0,0.0,1.0,-1.0,0.059833,0.0,1.0,minimal
3,2020-03-04,California,6001,Alameda County,1671329,0,6.209,99,1.0,6.0,23.0,-3.0,1.0,0.0,0.0,0.0,0.0,minimal
4,2020-03-05,California,6001,Alameda County,1671329,0,6.654,106,1.0,6.0,12.0,-5.0,-1.0,1.0,0.0,0.0,0.0,minimal


In [None]:
CA_labeled_data.dtypes

Date                                                   object
Province_State                                         object
FIPS                                                    int64
County Name                                            object
population                                              int64
mask_rule_active                                        int64
m50                                                   float64
m50_index                                               int64
retail_and_recreation_percent_change_from_baseline    float64
grocery_and_pharmacy_percent_change_from_baseline     float64
parks_percent_change_from_baseline                    float64
transit_stations_percent_change_from_baseline         float64
workplaces_percent_change_from_baseline               float64
residential_percent_change_from_baseline              float64
New Cases/100k population                             float64
rolling_avg_new_cases/100k                            float64
New case

In [None]:
CA_labeled_data.shape

(15821, 18)

## Preprocessing data 

* drop na
* convert categorical code

In [29]:
#CA_superset_data = CA_superset_data.dropna()

In [55]:
CA_superset_data.dtypes

Date                                                   object
Province_State                                         object
FIPS                                                    int64
New cases                                               int64
County Name                                            object
mask_rule_severity                                    float64
m50                                                   float64
m50_index                                               int64
retail_and_recreation_percent_change_from_baseline    float64
grocery_and_pharmacy_percent_change_from_baseline     float64
parks_percent_change_from_baseline                    float64
transit_stations_percent_change_from_baseline         float64
workplaces_percent_change_from_baseline               float64
residential_percent_change_from_baseline              float64
PopDensity                                            float64
State_Name                                             object
State   

In [56]:
CA_superset_data.shape

(15391, 58)

## Group the data

In [57]:
CA_superset_data.columns

Index(['Date', 'Province_State', 'FIPS', 'New cases', 'County Name',
       'mask_rule_severity', 'm50', 'm50_index',
       'retail_and_recreation_percent_change_from_baseline',
       'grocery_and_pharmacy_percent_change_from_baseline',
       'parks_percent_change_from_baseline',
       'transit_stations_percent_change_from_baseline',
       'workplaces_percent_change_from_baseline',
       'residential_percent_change_from_baseline', 'PopDensity', 'State_Name',
       'State', 'CTFIPS', 'CTNAME', 'STFIPS', 'Social.distancing.index',
       'X..staying.home', 'Trips.person', 'X..out.of.county.trips',
       'X..out.of.state.trips', 'Miles.person', 'Work.trips.person',
       'Non.work.trips.person', 'New.COVID.cases', 'Population',
       'X..change.in.consumption', 'Transit.mode.share',
       'X..people.older.than.60', 'Median.income', 'X..African.Americans',
       'X..Hispanic.Americans', 'X..Male', 'Employment.density',
       'X..hot.spots.1000.people', 'Hospital.beds.1000.peop

In [58]:
x_col_demographics = ['PopDensity','X..people.older.than.60',
                      'Median.income', 'X..African.Americans',
                      'X..Hispanic.Americans', 'X..Male','New cases','New.cases.1000.people']

In [59]:
x_col_hospitalization_testing = ['Hospital.beds.1000.people',
       'ICUs.1000.people', 
       'X..hospital.bed.utilization',
       'Testing.capacity', 'Tests.done.1000.people', 'X..ICU.utilization',
       'Ventilator.shortage','New cases','New.cases.1000.people'] #'X..contact.tracing.workers.1000.people',

In [60]:
x_col_mobility = ['retail_and_recreation_percent_change_from_baseline',
       'grocery_and_pharmacy_percent_change_from_baseline','parks_percent_change_from_baseline',
       'transit_stations_percent_change_from_baseline','workplaces_percent_change_from_baseline',
       'residential_percent_change_from_baseline',
       'X..staying.home', 'Trips.person', 'X..out.of.county.trips',
       'X..out.of.state.trips', 'Miles.person', 'Work.trips.person',
       'Non.work.trips.person','X..working.from.home','New cases','New.cases.1000.people']

In [61]:
x_col_socialdistancing = ['m50', 'm50_index','Social.distancing.index','New cases','New.cases.1000.people']

In [62]:
x_col_maskrule = ['mask_rule_severity','New cases','New.cases.1000.people']

# PCA: Demographic Group

## Feature Importance: Gini-Coefficient

In [63]:
CA_superset_data.columns

Index(['Date', 'Province_State', 'FIPS', 'New cases', 'County Name',
       'mask_rule_severity', 'm50', 'm50_index',
       'retail_and_recreation_percent_change_from_baseline',
       'grocery_and_pharmacy_percent_change_from_baseline',
       'parks_percent_change_from_baseline',
       'transit_stations_percent_change_from_baseline',
       'workplaces_percent_change_from_baseline',
       'residential_percent_change_from_baseline', 'PopDensity', 'State_Name',
       'State', 'CTFIPS', 'CTNAME', 'STFIPS', 'Social.distancing.index',
       'X..staying.home', 'Trips.person', 'X..out.of.county.trips',
       'X..out.of.state.trips', 'Miles.person', 'Work.trips.person',
       'Non.work.trips.person', 'New.COVID.cases', 'Population',
       'X..change.in.consumption', 'Transit.mode.share',
       'X..people.older.than.60', 'Median.income', 'X..African.Americans',
       'X..Hispanic.Americans', 'X..Male', 'Employment.density',
       'X..hot.spots.1000.people', 'Hospital.beds.1000.peop

In [48]:
def gini(list_of_values):
    sorted_list = sorted(list_of_values)
    #print (sorted_list)
    height, area = 0, 0
    for value in sorted_list:
        
        height += value
        area += height - value / 2.
    
    fair_area = height * len(list_of_values) / 2.
    print(fair_area)
    return (fair_area - area) / fair_area

In [64]:
x_col_demographics = ['PopDensity','X..people.older.than.60',
                      'Median.income', 'X..African.Americans',
                      'X..Hispanic.Americans', 'X..Male','New cases','New.cases.1000.people']

In [65]:
feature_cols= x_col_demographics

gini_values = []
for col in feature_cols:
  #print(col)
  dataset = CA_superset_data[col].values.tolist()
  #print(dataset)
  value = gini(dataset)
  print('cols ',col,' - gini coeff is :',value)
  gini_values.append(value)

35575489453.94788
cols  PopDensity  - gini coeff is : 0.8156637311902494
2688977001.0
cols  X..people.older.than.60  - gini coeff is : 0.14673423679461214
7791742870376.5
cols  Median.income  - gini coeff is : 0.15433249671121319
374657726.1500031
cols  X..African.Americans  - gini coeff is : 0.46161607629775014
3797987049.249782
cols  X..Hispanic.Americans  - gini coeff is : 0.3070973510639532
5971969723.955034
cols  X..Male  - gini coeff is : 0.017576291485370876
14339402229.5
cols  New cases  - gini coeff is : 0.8685321834670556
16852079.173250053
cols  New.cases.1000.people  - gini coeff is : 0.680904683056212


## Feature Importance: Random Forest classifier(Gini Score)

In [None]:
CA_labeled_data.columns

Index(['Date', 'Province_State', 'FIPS', 'County Name', 'population',
       'mask_rule_active', 'm50', 'm50_index',
       'retail_and_recreation_percent_change_from_baseline',
       'grocery_and_pharmacy_percent_change_from_baseline',
       'parks_percent_change_from_baseline',
       'transit_stations_percent_change_from_baseline',
       'workplaces_percent_change_from_baseline',
       'residential_percent_change_from_baseline', 'New Cases/100k population',
       'rolling_avg_new_cases/100k', 'New cases', 'growth_label'],
      dtype='object')

In [None]:
CA_labeled_data['growth_code'] = CA_labeled_data['growth_label'].astype('category').cat.codes

CA_labeled_data.head(2)

Unnamed: 0,Date,Province_State,FIPS,County Name,population,mask_rule_active,m50,m50_index,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline,New Cases/100k population,rolling_avg_new_cases/100k,New cases,growth_label,growth_code
0,2020-03-01,California,6001,Alameda County,1671329,0,3.684,58,6.0,9.0,15.0,1.0,4.0,0.0,0.0,0.0,0.0,minimal,0
1,2020-03-02,California,6001,Alameda County,1671329,0,6.182,98,4.0,7.0,7.0,-1.0,3.0,0.0,0.0,0.0,0.0,minimal,0


In [None]:
feature_labels = ['FIPS','population',
       'mask_rule_active', 'm50', 'm50_index',
       'retail_and_recreation_percent_change_from_baseline',
       'grocery_and_pharmacy_percent_change_from_baseline',
       'parks_percent_change_from_baseline',
       'transit_stations_percent_change_from_baseline',
       'workplaces_percent_change_from_baseline',
       'residential_percent_change_from_baseline']

target_column = ['growth_code']

X = CA_labeled_data[feature_labels]
y = CA_labeled_data[target_column]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)


In [None]:
# Create a random forest classifier
clf = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)

# Train the classifier
clf.fit(X_train, y_train)

# Print the name and gini importance of each feature
for feature in zip(feature_labels, clf.feature_importances_):
    print(feature)

  """


('FIPS', 0.05127801096902735)
('population', 0.081372238181775)
('mask_rule_active', 0.2358482619800189)
('m50', 0.0909342137286611)
('m50_index', 0.0828869306919277)
('retail_and_recreation_percent_change_from_baseline', 0.09855786271336092)
('grocery_and_pharmacy_percent_change_from_baseline', 0.07420155484923242)
('parks_percent_change_from_baseline', 0.06930484250145207)
('transit_stations_percent_change_from_baseline', 0.06075885210040801)
('workplaces_percent_change_from_baseline', 0.08547987336877201)
('residential_percent_change_from_baseline', 0.06937735891536431)


In [None]:

#  Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. 
clf = RandomForestClassifier(n_estimators=10000, random_state=0,criterion='gini', n_jobs=-1)

# Train the classifier
clf.fit(X_train, y_train)

# Print the name and gini importance of each feature
for feature in zip(feature_labels, clf.feature_importances_):
    print(feature)

  


('FIPS', 0.05127801096902735)
('population', 0.081372238181775)
('mask_rule_active', 0.2358482619800189)
('m50', 0.0909342137286611)
('m50_index', 0.0828869306919277)
('retail_and_recreation_percent_change_from_baseline', 0.09855786271336092)
('grocery_and_pharmacy_percent_change_from_baseline', 0.07420155484923242)
('parks_percent_change_from_baseline', 0.06930484250145207)
('transit_stations_percent_change_from_baseline', 0.06075885210040801)
('workplaces_percent_change_from_baseline', 0.08547987336877201)
('residential_percent_change_from_baseline', 0.06937735891536431)


## Feature Importance: Extra tree classifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE

In [None]:
feature_labels = ['FIPS','population',
       'mask_rule_active', 'm50', 'm50_index',
       'retail_and_recreation_percent_change_from_baseline',
       'grocery_and_pharmacy_percent_change_from_baseline',
       'parks_percent_change_from_baseline',
       'transit_stations_percent_change_from_baseline',
       'workplaces_percent_change_from_baseline',
       'residential_percent_change_from_baseline']

target_column = ['growth_code']

X = CA_labeled_data[feature_labels]
y = CA_labeled_data[target_column]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

In [None]:
# Create a random forest classifier
clf = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)

# Train the classifier
clf.fit(X_train, y_train)

# Print the name and gini importance of each feature
for feature in zip(feature_labels, clf.feature_importances_):
    print(feature)


#  Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. 
clf = RandomForestClassifier(n_estimators=10000, random_state=0,criterion='gini', n_jobs=-1)# Train the classifier
clf.fit(X_train, y_train)
# Print the name and gini importance of each feature
for feature in zip(feature_labels, clf.feature_importances_):
    print(feature)


model = ExtraTreesClassifier()
model.fit(X, y.values.ravel())
# display the relative importance of each attribute
#print(model.feature_importances_)


# create the RFE model 
rfe = RFE(model, 3)
rfe = rfe.fit(X, y.values.ravel())
# summarize the selection of the attributes
#print(rfe.support_)

print("\nFeatures with ranking:\n")
for index in range(len(rfe.ranking_)):
  print(feature_labels[index], rfe.ranking_[index])

# for index in range(len(rfe.ranking_)):
#   print( rfe.ranking_[index])
#print(rfe.ranking_)
#print(rfe.n_features_)
#rfe.n_features_to_select

  """


('FIPS', 0.05127801096902735)
('population', 0.081372238181775)
('mask_rule_active', 0.2358482619800189)
('m50', 0.0909342137286611)
('m50_index', 0.0828869306919277)
('retail_and_recreation_percent_change_from_baseline', 0.09855786271336092)
('grocery_and_pharmacy_percent_change_from_baseline', 0.07420155484923242)
('parks_percent_change_from_baseline', 0.06930484250145207)
('transit_stations_percent_change_from_baseline', 0.06075885210040801)
('workplaces_percent_change_from_baseline', 0.08547987336877201)
('residential_percent_change_from_baseline', 0.06937735891536431)


  


('FIPS', 0.05127801096902735)
('population', 0.081372238181775)
('mask_rule_active', 0.2358482619800189)
('m50', 0.0909342137286611)
('m50_index', 0.0828869306919277)
('retail_and_recreation_percent_change_from_baseline', 0.09855786271336092)
('grocery_and_pharmacy_percent_change_from_baseline', 0.07420155484923242)
('parks_percent_change_from_baseline', 0.06930484250145207)
('transit_stations_percent_change_from_baseline', 0.06075885210040801)
('workplaces_percent_change_from_baseline', 0.08547987336877201)
('residential_percent_change_from_baseline', 0.06937735891536431)

Features with ranking:

FIPS 9
population 2
mask_rule_active 1
m50 1
m50_index 4
retail_and_recreation_percent_change_from_baseline 1
grocery_and_pharmacy_percent_change_from_baseline 7
parks_percent_change_from_baseline 5
transit_stations_percent_change_from_baseline 8
workplaces_percent_change_from_baseline 3
residential_percent_change_from_baseline 6


# PCA: Hospitalization and testing Group

## Feature Importance: Gini-Coefficient

In [66]:
CA_superset_data.columns

Index(['Date', 'Province_State', 'FIPS', 'New cases', 'County Name',
       'mask_rule_severity', 'm50', 'm50_index',
       'retail_and_recreation_percent_change_from_baseline',
       'grocery_and_pharmacy_percent_change_from_baseline',
       'parks_percent_change_from_baseline',
       'transit_stations_percent_change_from_baseline',
       'workplaces_percent_change_from_baseline',
       'residential_percent_change_from_baseline', 'PopDensity', 'State_Name',
       'State', 'CTFIPS', 'CTNAME', 'STFIPS', 'Social.distancing.index',
       'X..staying.home', 'Trips.person', 'X..out.of.county.trips',
       'X..out.of.state.trips', 'Miles.person', 'Work.trips.person',
       'Non.work.trips.person', 'New.COVID.cases', 'Population',
       'X..change.in.consumption', 'Transit.mode.share',
       'X..people.older.than.60', 'Median.income', 'X..African.Americans',
       'X..Hispanic.Americans', 'X..Male', 'Employment.density',
       'X..hot.spots.1000.people', 'Hospital.beds.1000.peop

In [67]:
def gini(list_of_values):
    sorted_list = sorted(list_of_values)
    #print (sorted_list)
    height, area = 0, 0
    for value in sorted_list:
        
        height += value
        area += height - value / 2.
    
    fair_area = height * len(list_of_values) / 2.
    print(fair_area)
    return (fair_area - area) / fair_area

In [68]:
x_col_hospitalization_testing = ['Hospital.beds.1000.people',
       'ICUs.1000.people', 
       'X..hospital.bed.utilization',
       'Testing.capacity', 'Tests.done.1000.people', 'X..ICU.utilization',
       'Ventilator.shortage','New cases','New.cases.1000.people'] #'X..contact.tracing.workers.1000.people',

In [69]:
feature_cols= x_col_hospitalization_testing

gini_values = []
for col in feature_cols:
  #print(col)
  dataset = CA_superset_data[col].values.tolist()
  #print(dataset)
  value = gini(dataset)
  print('cols ',col,' - gini coeff is :',value)
  gini_values.append(value)

242804953.02494016
cols  Hospital.beds.1000.people  - gini coeff is : -8.628750104779691e-14
26057116.909992337
cols  ICUs.1000.people  - gini coeff is : -2.3818190088626247e-13
6956397389.468079
cols  X..hospital.bed.utilization  - gini coeff is : 0.03308998365992187
847893268.2000415
cols  Testing.capacity  - gini coeff is : 0.22260662795532127
27711428164.375595
cols  Tests.done.1000.people  - gini coeff is : 0.525695711903369
1850321257.0899777
cols  X..ICU.utilization  - gini coeff is : 0.2995730960210415
74971523352.5
cols  Ventilator.shortage  - gini coeff is : 0.35019564195802766
14339402229.5
cols  New cases  - gini coeff is : 0.8685321834670556
16852079.173250053
cols  New.cases.1000.people  - gini coeff is : 0.680904683056212


## Feature Importance: Random Forest classifier(Gini Score)

In [None]:
CA_labeled_data.columns

Index(['Date', 'Province_State', 'FIPS', 'County Name', 'population',
       'mask_rule_active', 'm50', 'm50_index',
       'retail_and_recreation_percent_change_from_baseline',
       'grocery_and_pharmacy_percent_change_from_baseline',
       'parks_percent_change_from_baseline',
       'transit_stations_percent_change_from_baseline',
       'workplaces_percent_change_from_baseline',
       'residential_percent_change_from_baseline', 'New Cases/100k population',
       'rolling_avg_new_cases/100k', 'New cases', 'growth_label'],
      dtype='object')

In [None]:
CA_labeled_data['growth_code'] = CA_labeled_data['growth_label'].astype('category').cat.codes

CA_labeled_data.head(2)

Unnamed: 0,Date,Province_State,FIPS,County Name,population,mask_rule_active,m50,m50_index,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline,New Cases/100k population,rolling_avg_new_cases/100k,New cases,growth_label,growth_code
0,2020-03-01,California,6001,Alameda County,1671329,0,3.684,58,6.0,9.0,15.0,1.0,4.0,0.0,0.0,0.0,0.0,minimal,0
1,2020-03-02,California,6001,Alameda County,1671329,0,6.182,98,4.0,7.0,7.0,-1.0,3.0,0.0,0.0,0.0,0.0,minimal,0


In [None]:
feature_labels = ['FIPS','population',
       'mask_rule_active', 'm50', 'm50_index',
       'retail_and_recreation_percent_change_from_baseline',
       'grocery_and_pharmacy_percent_change_from_baseline',
       'parks_percent_change_from_baseline',
       'transit_stations_percent_change_from_baseline',
       'workplaces_percent_change_from_baseline',
       'residential_percent_change_from_baseline']

target_column = ['growth_code']

X = CA_labeled_data[feature_labels]
y = CA_labeled_data[target_column]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)


In [None]:
# Create a random forest classifier
clf = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)

# Train the classifier
clf.fit(X_train, y_train)

# Print the name and gini importance of each feature
for feature in zip(feature_labels, clf.feature_importances_):
    print(feature)

  """


('FIPS', 0.05127801096902735)
('population', 0.081372238181775)
('mask_rule_active', 0.2358482619800189)
('m50', 0.0909342137286611)
('m50_index', 0.0828869306919277)
('retail_and_recreation_percent_change_from_baseline', 0.09855786271336092)
('grocery_and_pharmacy_percent_change_from_baseline', 0.07420155484923242)
('parks_percent_change_from_baseline', 0.06930484250145207)
('transit_stations_percent_change_from_baseline', 0.06075885210040801)
('workplaces_percent_change_from_baseline', 0.08547987336877201)
('residential_percent_change_from_baseline', 0.06937735891536431)


In [None]:

#  Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. 
clf = RandomForestClassifier(n_estimators=10000, random_state=0,criterion='gini', n_jobs=-1)

# Train the classifier
clf.fit(X_train, y_train)

# Print the name and gini importance of each feature
for feature in zip(feature_labels, clf.feature_importances_):
    print(feature)

  


('FIPS', 0.05127801096902735)
('population', 0.081372238181775)
('mask_rule_active', 0.2358482619800189)
('m50', 0.0909342137286611)
('m50_index', 0.0828869306919277)
('retail_and_recreation_percent_change_from_baseline', 0.09855786271336092)
('grocery_and_pharmacy_percent_change_from_baseline', 0.07420155484923242)
('parks_percent_change_from_baseline', 0.06930484250145207)
('transit_stations_percent_change_from_baseline', 0.06075885210040801)
('workplaces_percent_change_from_baseline', 0.08547987336877201)
('residential_percent_change_from_baseline', 0.06937735891536431)


## Feature Importance: Extra tree classifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE

In [None]:
feature_labels = ['FIPS','population',
       'mask_rule_active', 'm50', 'm50_index',
       'retail_and_recreation_percent_change_from_baseline',
       'grocery_and_pharmacy_percent_change_from_baseline',
       'parks_percent_change_from_baseline',
       'transit_stations_percent_change_from_baseline',
       'workplaces_percent_change_from_baseline',
       'residential_percent_change_from_baseline']

target_column = ['growth_code']

X = CA_labeled_data[feature_labels]
y = CA_labeled_data[target_column]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

In [None]:
# Create a random forest classifier
clf = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)

# Train the classifier
clf.fit(X_train, y_train)

# Print the name and gini importance of each feature
for feature in zip(feature_labels, clf.feature_importances_):
    print(feature)


#  Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. 
clf = RandomForestClassifier(n_estimators=10000, random_state=0,criterion='gini', n_jobs=-1)# Train the classifier
clf.fit(X_train, y_train)
# Print the name and gini importance of each feature
for feature in zip(feature_labels, clf.feature_importances_):
    print(feature)


model = ExtraTreesClassifier()
model.fit(X, y.values.ravel())
# display the relative importance of each attribute
#print(model.feature_importances_)


# create the RFE model 
rfe = RFE(model, 3)
rfe = rfe.fit(X, y.values.ravel())
# summarize the selection of the attributes
#print(rfe.support_)

print("\nFeatures with ranking:\n")
for index in range(len(rfe.ranking_)):
  print(feature_labels[index], rfe.ranking_[index])

# for index in range(len(rfe.ranking_)):
#   print( rfe.ranking_[index])
#print(rfe.ranking_)
#print(rfe.n_features_)
#rfe.n_features_to_select

  """


('FIPS', 0.05127801096902735)
('population', 0.081372238181775)
('mask_rule_active', 0.2358482619800189)
('m50', 0.0909342137286611)
('m50_index', 0.0828869306919277)
('retail_and_recreation_percent_change_from_baseline', 0.09855786271336092)
('grocery_and_pharmacy_percent_change_from_baseline', 0.07420155484923242)
('parks_percent_change_from_baseline', 0.06930484250145207)
('transit_stations_percent_change_from_baseline', 0.06075885210040801)
('workplaces_percent_change_from_baseline', 0.08547987336877201)
('residential_percent_change_from_baseline', 0.06937735891536431)


  


('FIPS', 0.05127801096902735)
('population', 0.081372238181775)
('mask_rule_active', 0.2358482619800189)
('m50', 0.0909342137286611)
('m50_index', 0.0828869306919277)
('retail_and_recreation_percent_change_from_baseline', 0.09855786271336092)
('grocery_and_pharmacy_percent_change_from_baseline', 0.07420155484923242)
('parks_percent_change_from_baseline', 0.06930484250145207)
('transit_stations_percent_change_from_baseline', 0.06075885210040801)
('workplaces_percent_change_from_baseline', 0.08547987336877201)
('residential_percent_change_from_baseline', 0.06937735891536431)

Features with ranking:

FIPS 9
population 2
mask_rule_active 1
m50 1
m50_index 4
retail_and_recreation_percent_change_from_baseline 1
grocery_and_pharmacy_percent_change_from_baseline 7
parks_percent_change_from_baseline 5
transit_stations_percent_change_from_baseline 8
workplaces_percent_change_from_baseline 3
residential_percent_change_from_baseline 6


# PCA: Mobility Group

## Feature Importance: Gini-Coefficient

In [70]:
CA_superset_data.columns

Index(['Date', 'Province_State', 'FIPS', 'New cases', 'County Name',
       'mask_rule_severity', 'm50', 'm50_index',
       'retail_and_recreation_percent_change_from_baseline',
       'grocery_and_pharmacy_percent_change_from_baseline',
       'parks_percent_change_from_baseline',
       'transit_stations_percent_change_from_baseline',
       'workplaces_percent_change_from_baseline',
       'residential_percent_change_from_baseline', 'PopDensity', 'State_Name',
       'State', 'CTFIPS', 'CTNAME', 'STFIPS', 'Social.distancing.index',
       'X..staying.home', 'Trips.person', 'X..out.of.county.trips',
       'X..out.of.state.trips', 'Miles.person', 'Work.trips.person',
       'Non.work.trips.person', 'New.COVID.cases', 'Population',
       'X..change.in.consumption', 'Transit.mode.share',
       'X..people.older.than.60', 'Median.income', 'X..African.Americans',
       'X..Hispanic.Americans', 'X..Male', 'Employment.density',
       'X..hot.spots.1000.people', 'Hospital.beds.1000.peop

In [71]:
def gini(list_of_values):
    sorted_list = sorted(list_of_values)
    #print (sorted_list)
    height, area = 0, 0
    for value in sorted_list:
        
        height += value
        area += height - value / 2.
    
    fair_area = height * len(list_of_values) / 2.
    print(fair_area)
    return (fair_area - area) / fair_area

In [72]:
x_col_mobility = ['retail_and_recreation_percent_change_from_baseline',
       'grocery_and_pharmacy_percent_change_from_baseline','parks_percent_change_from_baseline',
       'transit_stations_percent_change_from_baseline','workplaces_percent_change_from_baseline',
       'residential_percent_change_from_baseline',
       'X..staying.home', 'Trips.person', 'X..out.of.county.trips',
       'X..out.of.state.trips', 'Miles.person', 'Work.trips.person',
       'Non.work.trips.person','X..working.from.home','New cases','New.cases.1000.people']

In [73]:
feature_cols= x_col_mobility

gini_values = []
for col in feature_cols:
  #print(col)
  dataset = CA_superset_data[col].values.tolist()
  #print(dataset)
  value = gini(dataset)
  print('cols ',col,' - gini coeff is :',value)
  gini_values.append(value)

nan
cols  retail_and_recreation_percent_change_from_baseline  - gini coeff is : nan
nan
cols  grocery_and_pharmacy_percent_change_from_baseline  - gini coeff is : nan
nan
cols  parks_percent_change_from_baseline  - gini coeff is : nan
nan
cols  transit_stations_percent_change_from_baseline  - gini coeff is : nan
nan
cols  workplaces_percent_change_from_baseline  - gini coeff is : nan
nan
cols  residential_percent_change_from_baseline  - gini coeff is : nan
3185659962.0
cols  X..staying.home  - gini coeff is : 0.1173911693215423
357092208.7150005
cols  Trips.person  - gini coeff is : 0.07230888148726836
2776322465.0999947
cols  X..out.of.county.trips  - gini coeff is : 0.24785794461926924
204607954.00000012
cols  X..out.of.state.trips  - gini coeff is : 0.7522604458475726
4698066581.149977
cols  Miles.person  - gini coeff is : 0.18458958295302125
44415963.439996496
cols  Work.trips.person  - gini coeff is : 0.1805772735479392
312679323.47500366
cols  Non.work.trips.person  - gini coeff 

## Feature Importance: Random Forest classifier(Gini Score)

In [None]:
CA_labeled_data.columns

Index(['Date', 'Province_State', 'FIPS', 'County Name', 'population',
       'mask_rule_active', 'm50', 'm50_index',
       'retail_and_recreation_percent_change_from_baseline',
       'grocery_and_pharmacy_percent_change_from_baseline',
       'parks_percent_change_from_baseline',
       'transit_stations_percent_change_from_baseline',
       'workplaces_percent_change_from_baseline',
       'residential_percent_change_from_baseline', 'New Cases/100k population',
       'rolling_avg_new_cases/100k', 'New cases', 'growth_label'],
      dtype='object')

In [None]:
CA_labeled_data['growth_code'] = CA_labeled_data['growth_label'].astype('category').cat.codes

CA_labeled_data.head(2)

Unnamed: 0,Date,Province_State,FIPS,County Name,population,mask_rule_active,m50,m50_index,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline,New Cases/100k population,rolling_avg_new_cases/100k,New cases,growth_label,growth_code
0,2020-03-01,California,6001,Alameda County,1671329,0,3.684,58,6.0,9.0,15.0,1.0,4.0,0.0,0.0,0.0,0.0,minimal,0
1,2020-03-02,California,6001,Alameda County,1671329,0,6.182,98,4.0,7.0,7.0,-1.0,3.0,0.0,0.0,0.0,0.0,minimal,0


In [None]:
feature_labels = ['FIPS','population',
       'mask_rule_active', 'm50', 'm50_index',
       'retail_and_recreation_percent_change_from_baseline',
       'grocery_and_pharmacy_percent_change_from_baseline',
       'parks_percent_change_from_baseline',
       'transit_stations_percent_change_from_baseline',
       'workplaces_percent_change_from_baseline',
       'residential_percent_change_from_baseline']

target_column = ['growth_code']

X = CA_labeled_data[feature_labels]
y = CA_labeled_data[target_column]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)


In [None]:
# Create a random forest classifier
clf = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)

# Train the classifier
clf.fit(X_train, y_train)

# Print the name and gini importance of each feature
for feature in zip(feature_labels, clf.feature_importances_):
    print(feature)

  """


('FIPS', 0.05127801096902735)
('population', 0.081372238181775)
('mask_rule_active', 0.2358482619800189)
('m50', 0.0909342137286611)
('m50_index', 0.0828869306919277)
('retail_and_recreation_percent_change_from_baseline', 0.09855786271336092)
('grocery_and_pharmacy_percent_change_from_baseline', 0.07420155484923242)
('parks_percent_change_from_baseline', 0.06930484250145207)
('transit_stations_percent_change_from_baseline', 0.06075885210040801)
('workplaces_percent_change_from_baseline', 0.08547987336877201)
('residential_percent_change_from_baseline', 0.06937735891536431)


In [None]:

#  Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. 
clf = RandomForestClassifier(n_estimators=10000, random_state=0,criterion='gini', n_jobs=-1)

# Train the classifier
clf.fit(X_train, y_train)

# Print the name and gini importance of each feature
for feature in zip(feature_labels, clf.feature_importances_):
    print(feature)

  


('FIPS', 0.05127801096902735)
('population', 0.081372238181775)
('mask_rule_active', 0.2358482619800189)
('m50', 0.0909342137286611)
('m50_index', 0.0828869306919277)
('retail_and_recreation_percent_change_from_baseline', 0.09855786271336092)
('grocery_and_pharmacy_percent_change_from_baseline', 0.07420155484923242)
('parks_percent_change_from_baseline', 0.06930484250145207)
('transit_stations_percent_change_from_baseline', 0.06075885210040801)
('workplaces_percent_change_from_baseline', 0.08547987336877201)
('residential_percent_change_from_baseline', 0.06937735891536431)


## Feature Importance: Extra tree classifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE

In [None]:
feature_labels = ['FIPS','population',
       'mask_rule_active', 'm50', 'm50_index',
       'retail_and_recreation_percent_change_from_baseline',
       'grocery_and_pharmacy_percent_change_from_baseline',
       'parks_percent_change_from_baseline',
       'transit_stations_percent_change_from_baseline',
       'workplaces_percent_change_from_baseline',
       'residential_percent_change_from_baseline']

target_column = ['growth_code']

X = CA_labeled_data[feature_labels]
y = CA_labeled_data[target_column]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

In [None]:
# Create a random forest classifier
clf = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)

# Train the classifier
clf.fit(X_train, y_train)

# Print the name and gini importance of each feature
for feature in zip(feature_labels, clf.feature_importances_):
    print(feature)


#  Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. 
clf = RandomForestClassifier(n_estimators=10000, random_state=0,criterion='gini', n_jobs=-1)# Train the classifier
clf.fit(X_train, y_train)
# Print the name and gini importance of each feature
for feature in zip(feature_labels, clf.feature_importances_):
    print(feature)


model = ExtraTreesClassifier()
model.fit(X, y.values.ravel())
# display the relative importance of each attribute
#print(model.feature_importances_)


# create the RFE model 
rfe = RFE(model, 3)
rfe = rfe.fit(X, y.values.ravel())
# summarize the selection of the attributes
#print(rfe.support_)

print("\nFeatures with ranking:\n")
for index in range(len(rfe.ranking_)):
  print(feature_labels[index], rfe.ranking_[index])

# for index in range(len(rfe.ranking_)):
#   print( rfe.ranking_[index])
#print(rfe.ranking_)
#print(rfe.n_features_)
#rfe.n_features_to_select

  """


('FIPS', 0.05127801096902735)
('population', 0.081372238181775)
('mask_rule_active', 0.2358482619800189)
('m50', 0.0909342137286611)
('m50_index', 0.0828869306919277)
('retail_and_recreation_percent_change_from_baseline', 0.09855786271336092)
('grocery_and_pharmacy_percent_change_from_baseline', 0.07420155484923242)
('parks_percent_change_from_baseline', 0.06930484250145207)
('transit_stations_percent_change_from_baseline', 0.06075885210040801)
('workplaces_percent_change_from_baseline', 0.08547987336877201)
('residential_percent_change_from_baseline', 0.06937735891536431)


  


('FIPS', 0.05127801096902735)
('population', 0.081372238181775)
('mask_rule_active', 0.2358482619800189)
('m50', 0.0909342137286611)
('m50_index', 0.0828869306919277)
('retail_and_recreation_percent_change_from_baseline', 0.09855786271336092)
('grocery_and_pharmacy_percent_change_from_baseline', 0.07420155484923242)
('parks_percent_change_from_baseline', 0.06930484250145207)
('transit_stations_percent_change_from_baseline', 0.06075885210040801)
('workplaces_percent_change_from_baseline', 0.08547987336877201)
('residential_percent_change_from_baseline', 0.06937735891536431)

Features with ranking:

FIPS 9
population 2
mask_rule_active 1
m50 1
m50_index 4
retail_and_recreation_percent_change_from_baseline 1
grocery_and_pharmacy_percent_change_from_baseline 7
parks_percent_change_from_baseline 5
transit_stations_percent_change_from_baseline 8
workplaces_percent_change_from_baseline 3
residential_percent_change_from_baseline 6
