<a href="https://colab.research.google.com/github/aarsanjani/meansquares/blob/master/TX_baselineAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Description


This Colab notebook prepares Texas baseline analysis data for all counties. Two sets of data are prepared here:

* 1. TX_combined data, built by merging the datasets listed below:
> * Social distancing data,
> * Mobility data,
> * Covid cases,
> * Population density,
> * Mask rule data

* 2. Social Distancing Inertia data (Maryland data) merged with the Covid data

In [None]:
!pip install wget




In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!ls  '/content/drive/Shared drives/CMPE 295- Master Project/Covid19-data/'

CA_combined_Sep09.csv
CA-CovidAug22.csv
CA_labeledData_Sep09
CA_sd_inertia_labeledData_Sep09
CA_socialDistancingInertiaData_Sep09.csv
Maryland_University_SocialDistancing_Data
mask_rule.csv
Newyork_combinedAug22.csv
NY_sd_inertia_labeledData_Aug26.csv
NY_socialDistancingInertiaDataAug25.csv
Social_Distancing_Inertia_County.csv
TX_combined_Sep10.csv
TX-Covid_Sep10.csv
TX_socialDistancingInertiaData_Sep10.csv


# Load library

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import os
import wget
from pandas import Series, datetime
from pandas.plotting import scatter_matrix, autocorrelation_plot
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, TimeSeriesSplit
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import roc_curve, auc
import random
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima_model import ARIMA
from xgboost import XGBClassifier
from sklearn.mixture import GaussianMixture
from datetime import datetime

In [None]:
os.environ["TF_CPP_MIN_LOG_LEVEL"]="2"
import warnings; warnings.simplefilter('ignore')

# 1 Load Data Sets

## 1.1 County Population

In [None]:
US_population_filename = wget.download('https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_county_population_usafacts.csv')
county_population_US = pd.read_csv(US_population_filename,low_memory=False)
print(county_population_US.shape)

county_population_US.head(2)

(3195, 4)


Unnamed: 0,countyFIPS,County Name,State,population
0,0,Statewide Unallocated,AL,0
1,1001,Autauga County,AL,55869


In [None]:
county_population_US.State.unique()

array(['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA',
       'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA',
       'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY',
       'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
       'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY'], dtype=object)

In [None]:
CA_population = county_population_US[county_population_US['State'] == 'TX']
print(CA_population.shape)
CA_population.head(2)

(255, 4)


Unnamed: 0,countyFIPS,County Name,State,population
2568,0,Statewide Unallocated,TX,0
2569,48001,Anderson County,TX,57735


In [None]:
CA_population['County Name'].unique()

array(['Statewide Unallocated', 'Anderson County', 'Andrews County',
       'Angelina County', 'Aransas County', 'Archer County',
       'Armstrong County', 'Atascosa County', 'Austin County',
       'Bailey County', 'Bandera County', 'Bastrop County',
       'Baylor County', 'Bee County', 'Bell County', 'Bexar County',
       'Blanco County', 'Borden County', 'Bosque County', 'Bowie County',
       'Brazoria County', 'Brazos County', 'Brewster County',
       'Briscoe County', 'Brooks County', 'Brown County',
       'Burleson County', 'Burnet County', 'Caldwell County',
       'Calhoun County', 'Callahan County', 'Cameron County',
       'Camp County', 'Carson County', 'Cass County', 'Castro County',
       'Chambers County', 'Cherokee County', 'Childress County',
       'Clay County', 'Cochran County', 'Coke County', 'Coleman County',
       'Collin County', 'Collingsworth County', 'Colorado County',
       'Comal County', 'Comanche County', 'Concho County', 'Cooke County',
       'C

In [None]:
LA_county_population = CA_population[CA_population['County Name'] == 'Harris County']
LA_county_population

Unnamed: 0,countyFIPS,County Name,State,population
2669,48201,Harris County,TX,4713325


In [None]:
CA_population[CA_population['County Name'] == 'Dallas County']

Unnamed: 0,countyFIPS,County Name,State,population
2625,48113,Dallas County,TX,2635516


## 1.2 Land Area 

In [None]:
landarea_filename = wget.download('https://raw.githubusercontent.com/ykzeng/covid-19/master/data/census-landarea-all.csv')
landarea_data = pd.read_csv(landarea_filename,low_memory=False)
print(landarea_data.shape)


(3195, 52)


In [None]:
landarea_data.head(2)

Unnamed: 0,fips,PST045212,PST040210,PST120212,POP010210,AGE135212,AGE295212,AGE775212,SEX255212,RHI125212,RHI225212,RHI325212,RHI425212,RHI525212,RHI625212,RHI725212,RHI825212,POP715211,POP645211,POP815211,EDU635211,EDU685211,VET605211,LFE305211,HSG010211,HSG445211,HSG096211,HSG495211,HSD410211,HSD310211,INC910211,INC110211,PVY020211,BZA010211,BZA110211,BZA115211,NES010211,SBO001207,SBO315207,SBO115207,SBO215207,SBO515207,SBO415207,SBO015207,MAN450207,WTN220207,RTN130207,RTN131207,AFN120207,BPS030212,LND110210,POP060210
0,0,313914040,308747508,1.7,308745538,6.4,23.5,13.7,50.8,77.9,13.1,1.2,5.1,0.2,2.4,16.9,63.0,84.6,12.8,20.3,85.4,28.2,22215303,25.4,132312404,66.1,25.9,186200,114761359,2.6,27915,52762,14.3,7354043,113425965,1.3,22491080,27092908,7.1,0.9,5.7,0.1,8.3,28.8,5319456312,4174286516,3917663456,12990,613795732,829658,3531905.43,87.4
1,1000,4822023,4779745,0.9,4779736,6.3,23.3,14.5,51.5,70.0,26.5,0.7,1.2,0.1,1.5,4.1,66.6,84.5,3.4,5.0,81.9,22.0,403982,24.0,2182088,70.7,15.5,120800,1831269,2.53,23483,42934,17.6,97743,1573138,0.3,321641,382350,14.8,0.8,1.8,0.1,1.2,28.1,112858843,52252752,57344851,12364,6426342,13506,50645.33,94.4


In [None]:
landarea_data['fips'].unique()

array([    0,  1000,  1001, ..., 56041, 56043, 56045])

In [None]:
#LND110210
# Harris county fips code = 48201

landarea_data[landarea_data['fips'] == 48201]['LND110210']

2668    1703.48
Name: LND110210, dtype: float64

#### Population density

In [None]:
# Harris County (FIPS 48201) population density = population / land area.
# Both operands are read from the frames loaded above instead of being typed
# in by hand, so the figure always matches the data this notebook actually uses.
# NOTE(review): LND110210 is presumably land area in square miles - confirm
# against the census data dictionary.
harris_population = LA_county_population['population'].iloc[0]
harris_land_area = landarea_data.loc[landarea_data['fips'] == 48201, 'LND110210'].iloc[0]
density = harris_population / harris_land_area

density

20767.142725513684

Harris County population density

In [None]:
LA_population_density=pd.merge(LA_county_population,landarea_data,how='inner', left_on=['countyFIPS'], right_on = ['fips'])

In [None]:
LA_population_density['population_density']= LA_population_density['population']/LA_population_density['LND110210']
LA_population_density = LA_population_density[['fips','County Name', 'State', 'population','LND110210','population_density']].copy()
LA_population_density.tail(3)

Unnamed: 0,fips,County Name,State,population,LND110210,population_density
0,48201,Harris County,TX,4713325,1703.48,2766.880151


## 1.3 Mask data

In [None]:
mask_rule_data = pd.read_csv('/content/drive/Shared drives/CMPE 295- Master Project/Covid19-data/mask_rule.csv',low_memory=False)
 
mask_rule_data.head()

Unnamed: 0,state_name,Law Date,Masks Required?,Type of Requirement,mask_required_rule,requirement_code
0,Louisiana,2020-07-13,Entire State,Everywhere in public where social distancing i...,0,3
1,Kentucky,2020-07-10,Entire State,Everywhere in public where social distancing i...,0,3
2,West Virginia,2020-07-07,Entire State,All Public Indoor Spaces (where social distanc...,0,1
3,Texas,2020-07-03,Entire State,Everywhere in public where social distancing i...,0,4
4,Kansas,2020-07-03,Entire State,Everywhere in public where social distancing i...,0,3


In [None]:
CA_mask_rule = mask_rule_data[mask_rule_data['state_name'] == 'Texas']
CA_mask_rule

Unnamed: 0,state_name,Law Date,Masks Required?,Type of Requirement,mask_required_rule,requirement_code
3,Texas,2020-07-03,Entire State,Everywhere in public where social distancing i...,0,4


In [None]:
mask_filename = wget.download('https://raw.githubusercontent.com/nytimes/covid-19-data/master/mask-use/mask-use-by-county.csv')
mask_filename

'mask-use-by-county (4).csv'

In [None]:

mask_data = pd.read_csv(mask_filename,low_memory=False)
print(mask_data.shape)

(3142, 6)


In [None]:
mask_data.head()

Unnamed: 0,COUNTYFP,NEVER,RARELY,SOMETIMES,FREQUENTLY,ALWAYS
0,1001,0.053,0.074,0.134,0.295,0.444
1,1003,0.083,0.059,0.098,0.323,0.436
2,1005,0.067,0.121,0.12,0.201,0.491
3,1007,0.02,0.034,0.096,0.278,0.572
4,1009,0.053,0.114,0.18,0.194,0.459


In [None]:
# Mask-use survey shares for Harris County (FIPS 48201).
# The filter must use the same FIPS code as the comment; 8021 selects a
# different county.
mask_data[mask_data['COUNTYFP'] == 48201]

Unnamed: 0,COUNTYFP,NEVER,RARELY,SOMETIMES,FREQUENTLY,ALWAYS
255,8021,0.06,0.106,0.093,0.195,0.545


## 1.4 Mobility Data 

### Google mobility data

In [None]:
google_data_url = 'https://raw.githubusercontent.com/ActiveConclusion/COVID19_mobility/master/google_reports/mobility_report_US.csv'

google_filename = wget.download(google_data_url)
google_mobility_data = pd.read_csv(google_filename,low_memory=False)
google_mobility_data.head(2)

Unnamed: 0,state,county,date,retail and recreation,grocery and pharmacy,parks,transit stations,workplaces,residential
0,Total,Total,2020-02-15,6.0,2.0,15.0,3.0,2.0,-1.0
1,Total,Total,2020-02-16,7.0,1.0,16.0,2.0,0.0,-1.0


In [None]:
CA_GoogleMobility_data = google_mobility_data[google_mobility_data['state'] == 'Texas']
print(CA_GoogleMobility_data.shape)
CA_GoogleMobility_data.tail(2)

(40385, 9)


Unnamed: 0,state,county,date,retail and recreation,grocery and pharmacy,parks,transit stations,workplaces,residential
475074,Texas,Zavala County,2020-09-03,,,,,-33.0,
475075,Texas,Zavala County,2020-09-04,,,,,-37.0,


### Apple mobility data

In [None]:
apple_report_url = 'https://raw.githubusercontent.com/ActiveConclusion/COVID19_mobility/master/apple_reports/apple_mobility_report_US.csv'

apple_filename = wget.download(apple_report_url)
apple_mobility_data = pd.read_csv(apple_filename,low_memory=False)
apple_mobility_data.tail(2)

Unnamed: 0,state,county_and_city,geo_type,date,driving,transit,walking
537820,Wyoming,Uinta County,county,2020-09-07,674.59,,
537821,Wyoming,Uinta County,county,2020-09-08,238.01,,


In [None]:
CA_AppleMobility_data = apple_mobility_data[apple_mobility_data['state'] == 'Texas']
print(CA_AppleMobility_data.shape)
CA_AppleMobility_data.tail(2)

(45756, 7)


Unnamed: 0,state,county_and_city,geo_type,date,driving,transit,walking
475846,Texas,Young County,county,2020-09-07,74.12,,
475847,Texas,Young County,county,2020-09-08,63.41,,


In [None]:
# Expose Apple's 'county_and_city' under the name 'county' so it can be merged
# with the Google mobility frame on ['state', 'county', 'date'].
# assign() returns a new frame, which avoids writing a column into the filtered
# slice of apple_mobility_data (SettingWithCopyWarning / ambiguous write).
CA_AppleMobility_data = CA_AppleMobility_data.assign(county=CA_AppleMobility_data['county_and_city'])

In [None]:
CA_AppleMobility_data = CA_AppleMobility_data[['state','county','date','driving','transit','walking']]

CA_AppleMobility_data.tail(2)

Unnamed: 0,state,county,date,driving,transit,walking
475846,Texas,Young County,2020-09-07,74.12,,
475847,Texas,Young County,2020-09-08,63.41,,


In [None]:
CA_mobility_data = pd.merge(CA_GoogleMobility_data,CA_AppleMobility_data,how='outer' ,on=['state','county','date'])
CA_mobility_data.shape

(50724, 12)

In [None]:
CA_mobility_data = CA_mobility_data.fillna(0)
CA_mobility_data.tail()

Unnamed: 0,state,county,date,retail and recreation,grocery and pharmacy,parks,transit stations,workplaces,residential,driving,transit,walking
50719,Texas,Young County,2020-02-12,0.0,0.0,0.0,0.0,0.0,0.0,-0.14,0.0,0.0
50720,Texas,Young County,2020-02-13,0.0,0.0,0.0,0.0,0.0,0.0,24.32,0.0,0.0
50721,Texas,Young County,2020-02-14,0.0,0.0,0.0,0.0,0.0,0.0,46.14,0.0,0.0
50722,Texas,Young County,2020-09-07,0.0,0.0,0.0,0.0,0.0,0.0,74.12,0.0,0.0
50723,Texas,Young County,2020-09-08,0.0,0.0,0.0,0.0,0.0,0.0,63.41,0.0,0.0


### Harris county mobility data

In [None]:
LA_mobility_data = CA_mobility_data[CA_mobility_data['county'] == 'Harris County']
print(LA_mobility_data.shape)
LA_mobility_data.tail()

(240, 12)


Unnamed: 0,state,county,date,retail and recreation,grocery and pharmacy,parks,transit stations,workplaces,residential,driving,transit,walking
44692,Texas,Harris County,2020-02-12,0.0,0.0,0.0,0.0,0.0,0.0,7.58,-1.35,-5.57
44693,Texas,Harris County,2020-02-13,0.0,0.0,0.0,0.0,0.0,0.0,18.27,0.61,10.87
44694,Texas,Harris County,2020-02-14,0.0,0.0,0.0,0.0,0.0,0.0,45.93,8.8,50.23
44695,Texas,Harris County,2020-09-07,0.0,0.0,0.0,0.0,0.0,0.0,-7.33,-54.87,4.49
44696,Texas,Harris County,2020-09-08,0.0,0.0,0.0,0.0,0.0,0.0,7.13,-46.79,17.99


In [None]:
LA_mobility_data.head()

Unnamed: 0,state,county,date,retail and recreation,grocery and pharmacy,parks,transit stations,workplaces,residential,driving,transit,walking
15870,Texas,Harris County,2020-02-15,3.0,2.0,13.0,0.0,2.0,-1.0,24.75,-7.85,48.69
15871,Texas,Harris County,2020-02-16,6.0,1.0,16.0,-2.0,0.0,-1.0,-9.89,-23.36,-2.67
15872,Texas,Harris County,2020-02-17,6.0,2.0,27.0,-3.0,-13.0,3.0,2.31,-3.65,1.37
15873,Texas,Harris County,2020-02-18,2.0,1.0,12.0,3.0,2.0,0.0,5.75,-0.77,-1.42
15874,Texas,Harris County,2020-02-19,1.0,1.0,7.0,1.0,3.0,0.0,5.89,-1.28,-2.01


## 1.5 Social Distancing metric 

In [None]:
socialdistancing_url = 'https://raw.githubusercontent.com/descarteslabs/DL-COVID-19/master/DL-us-mobility-daterow.csv'

socialdistancing_file = wget.download(socialdistancing_url)
socialdistancing_data = pd.read_csv(socialdistancing_file,low_memory=False)
socialdistancing_data.tail(2)



Unnamed: 0,date,country_code,admin_level,admin1,admin2,fips,samples,m50,m50_index
511366,2020-09-06,US,2,Wyoming,Uinta County,56041.0,791,5.016,160
511367,2020-09-07,US,2,Wyoming,Uinta County,56041.0,809,42.501,1357


In [None]:
newyork_socialdistancing_data= socialdistancing_data[(socialdistancing_data['admin1']=='Texas') &
                                           (socialdistancing_data['admin_level']== 2)]

newyork_socialdistancing_data.head()

Unnamed: 0,date,country_code,admin_level,admin1,admin2,fips,samples,m50,m50_index
409980,2020-03-01,US,2,Texas,Anderson County,48001.0,1210,8.91,79
409981,2020-03-02,US,2,Texas,Anderson County,48001.0,1222,10.708,95
409982,2020-03-03,US,2,Texas,Anderson County,48001.0,1228,12.118,107
409983,2020-03-04,US,2,Texas,Anderson County,48001.0,1264,9.156,81
409984,2020-03-05,US,2,Texas,Anderson County,48001.0,1198,11.23,100


In [None]:
LA_social_distancing = newyork_socialdistancing_data[newyork_socialdistancing_data['admin2'] == 'Harris County']
print(LA_social_distancing.shape)
LA_social_distancing.head()

(189, 9)


Unnamed: 0,date,country_code,admin_level,admin1,admin2,fips,samples,m50,m50_index
425456,2020-03-01,US,2,Texas,Harris County,48201.0,84352,6.316,70
425457,2020-03-02,US,2,Texas,Harris County,48201.0,88804,8.717,97
425458,2020-03-03,US,2,Texas,Harris County,48201.0,91067,8.948,100
425459,2020-03-04,US,2,Texas,Harris County,48201.0,93637,8.837,98
425460,2020-03-05,US,2,Texas,Harris County,48201.0,89698,9.644,107


# 2 Load Social Distancing Inertia Data [University of Maryland]

In [None]:
maryland_sd_inertia = pd.read_csv('/content/drive/Shared drives/CMPE 295- Master Project/Covid19-data/Social_Distancing_Inertia_County.csv',low_memory=False)
maryland_sd_inertia.head()

Unnamed: 0,CTFIPS,CTNAME,STFIPS,Social distancing index,% staying home,Trips/person,% out-of-county trips,% out-of-state trips,Miles/person,Work trips/person,Non-work trips/person,New COVID cases,Population,% change in consumption,date,Transit mode share,% people older than 60,Median income,% African Americans,% Hispanic Americans,% Male,Population density,Employment density,# hot spots/1000 people,Hospital beds/1000 people,ICUs/1000 people,# contact tracing workers/1000 people,# contact tracing workers/1000 people rank,% people older than 60 rank,COVID exposure/1000 people,#days: decreasing ILI cases,Unemployment claims/1000 people,Unemployment rate,% working from home,Cumulative inflation rate,COVID death rate,New cases/1000 people,Active cases/1000 people,#days: decreasing COVID cases,% hospital bed utilization,Testing capacity,Tests done/1000 people,% ICU utilization,Ventilator shortage,Imported COVID cases
0,1001,Autauga County,1,54,27,2.25,38.8,1.2,24.7,0.25,2.0,0,55601,-22.5,1/1/2020,0.11,19,58786,19.0,2.8,48.68,91,19,101,3.79,0.33,0.025,39.0,12,0.0,0,0.9,2.7,3.2,0.3,0.0,0.0,0.0,0,47.1,0.0,0.0,0.0,0,0
1,1001,Autauga County,1,25,18,2.83,43.5,0.9,37.6,0.62,2.21,0,55601,-14.3,1/2/2020,0.11,19,58786,19.0,2.8,48.68,91,19,101,3.79,0.33,0.025,39.0,12,0.0,0,0.9,2.7,3.2,0.3,0.0,0.0,0.0,0,47.1,0.0,0.0,0.0,0,0
2,1001,Autauga County,1,15,15,3.18,42.2,1.0,40.3,0.62,2.57,0,55601,-0.4,1/3/2020,0.11,19,58786,19.0,2.8,48.68,91,19,101,3.79,0.33,0.025,39.0,12,0.0,0,0.9,2.7,3.2,0.3,0.0,0.0,0.0,0,47.1,0.0,0.0,0.0,0,0
3,1001,Autauga County,1,33,20,2.85,43.9,1.2,29.5,0.33,2.52,0,55601,-2.3,1/4/2020,0.11,19,58786,19.0,2.8,48.68,91,19,101,3.79,0.33,0.025,39.0,12,0.0,0,0.9,2.7,3.2,0.3,0.0,0.0,0.0,0,47.1,0.0,0.0,0.0,0,0
4,1001,Autauga County,1,40,22,2.56,42.8,1.2,30.9,0.23,2.33,0,55601,-9.7,1/5/2020,0.11,19,58786,19.0,2.8,48.68,91,19,101,3.79,0.33,0.025,39.0,12,0.0,0,0.9,2.7,3.2,0.3,0.0,0.0,0.0,0,47.1,0.0,0.0,0.0,0,0


## Fix the date column of the Social Distancing Inertia data

In [None]:
def fixDate(x, appendTwenty):
  """Convert a slash-separated ``M/D/Y`` date string to ``Y-MM-DD``.

  Parameters
  ----------
  x : str
      Date with month, day and year separated by '/', e.g. ``'1/5/2020'``.
      Month and day may or may not already be zero-padded.
  appendTwenty : bool
      When truthy, the literal ``'20'`` is prepended to the year part
      (for two-digit years such as ``'8/20/20'``).

  Returns
  -------
  str
      The date as ``year-month-day`` with month and day padded to two digits.

  Raises
  ------
  IndexError
      If ``x`` has fewer than three '/'-separated parts.
  ValueError
      If the month or day part is not an integer.
  """
  parts = x.split('/')
  m, d, y = parts[0], parts[1], parts[2]

  # Format through int() so that an input which is already zero-padded
  # ('01') stays two digits instead of being padded again to '001'.
  m = '%02d' % int(m)
  d = '%02d' % int(d)

  if appendTwenty:
    return '20' + y + '-' + m + '-' + d
  return y + '-' + m + '-' + d

In [None]:
# Normalise the 'date' column from M/D/YYYY to YYYY-MM-DD so it can be joined
# against the ISO-formatted 'Date' column of the Covid case data.
maryland_sd_inertia['date'] = maryland_sd_inertia['date'].apply(fixDate, args=(False,))
# reset_index returns a new frame; keep the result instead of discarding it.
maryland_sd_inertia = maryland_sd_inertia.reset_index(drop=True)
# Show only a preview rather than the whole frame.
maryland_sd_inertia.head()

Unnamed: 0,CTFIPS,CTNAME,STFIPS,Social distancing index,% staying home,Trips/person,% out-of-county trips,% out-of-state trips,Miles/person,Work trips/person,Non-work trips/person,New COVID cases,Population,% change in consumption,date,Transit mode share,% people older than 60,Median income,% African Americans,% Hispanic Americans,% Male,Population density,Employment density,# hot spots/1000 people,Hospital beds/1000 people,ICUs/1000 people,# contact tracing workers/1000 people,# contact tracing workers/1000 people rank,% people older than 60 rank,COVID exposure/1000 people,#days: decreasing ILI cases,Unemployment claims/1000 people,Unemployment rate,% working from home,Cumulative inflation rate,COVID death rate,New cases/1000 people,Active cases/1000 people,#days: decreasing COVID cases,% hospital bed utilization,Testing capacity,Tests done/1000 people,% ICU utilization,Ventilator shortage,Imported COVID cases
0,1001,Autauga County,1,54,27,2.25,38.8,1.2,24.7,0.25,2.00,0,55601,-22.5,2020-01-01,0.11,19,58786,19.0,2.8,48.68,91,19,101,3.79,0.33,0.025,39.0,12,0.0,0,0.9,2.7,3.2,0.30,0.0,0.0000,0.00,0,47.100000,0.0,0.00,0.00,0,0
1,1001,Autauga County,1,25,18,2.83,43.5,0.9,37.6,0.62,2.21,0,55601,-14.3,2020-01-02,0.11,19,58786,19.0,2.8,48.68,91,19,101,3.79,0.33,0.025,39.0,12,0.0,0,0.9,2.7,3.2,0.30,0.0,0.0000,0.00,0,47.100000,0.0,0.00,0.00,0,0
2,1001,Autauga County,1,15,15,3.18,42.2,1.0,40.3,0.62,2.57,0,55601,-0.4,2020-01-03,0.11,19,58786,19.0,2.8,48.68,91,19,101,3.79,0.33,0.025,39.0,12,0.0,0,0.9,2.7,3.2,0.30,0.0,0.0000,0.00,0,47.100000,0.0,0.00,0.00,0,0
3,1001,Autauga County,1,33,20,2.85,43.9,1.2,29.5,0.33,2.52,0,55601,-2.3,2020-01-04,0.11,19,58786,19.0,2.8,48.68,91,19,101,3.79,0.33,0.025,39.0,12,0.0,0,0.9,2.7,3.2,0.30,0.0,0.0000,0.00,0,47.100000,0.0,0.00,0.00,0,0
4,1001,Autauga County,1,40,22,2.56,42.8,1.2,30.9,0.23,2.33,0,55601,-9.7,2020-01-05,0.11,19,58786,19.0,2.8,48.68,91,19,101,3.79,0.33,0.025,39.0,12,0.0,0,0.9,2.7,3.2,0.30,0.0,0.0000,0.00,0,47.100000,0.0,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
732081,56037,Sweetwater County,56,19,22,4.62,25.1,10.5,63.0,0.58,4.04,0,43051,37.8,2020-08-20,2.52,17,73008,0.8,16.0,51.87,4,3,134,3.76,0.50,0.017,47.0,6,17.1,105,3.8,12.6,26.2,1.51,0.3,0.0542,2.19,0,37.716682,5.4,111.08,1.31,3,90
732082,56039,Teton County,56,33,21,4.12,33.0,20.7,30.2,0.39,3.73,4,23081,0.8,2020-08-20,4.05,20,83831,1.1,14.9,51.65,5,7,297,3.76,0.50,0.017,47.0,16,17.1,105,3.8,12.6,26.2,1.51,0.3,0.1155,2.19,0,37.716682,5.4,111.08,1.31,3,100
732083,56041,Uinta County,56,21,20,4.44,36.8,24.5,79.0,0.65,3.79,0,20299,52.1,2020-08-20,3.60,18,58235,0.1,9.1,50.97,10,5,147,3.76,0.50,0.017,47.0,9,17.1,105,3.8,12.6,26.2,1.51,0.3,0.0985,2.19,14,37.716682,5.4,111.08,1.31,3,80
732084,56043,Washakie County,56,35,41,3.34,19.0,2.1,59.4,0.85,2.50,2,7885,20.9,2020-08-20,0.03,28,53426,0.0,14.2,50.89,4,3,205,3.76,0.50,0.017,47.0,73,17.1,105,3.8,12.6,26.2,1.51,0.3,0.2536,2.19,5,37.716682,5.4,111.08,1.31,3,7


# 3. Combining TX all-county data

In [None]:
CA_covidcases = pd.read_csv('/content/drive/Shared drives/CMPE 295- Master Project/Covid19-data/TX-Covid_Sep10.csv', low_memory =False)
CA_covidcases.tail()

Unnamed: 0,Date,Province_State,FIPS,Confirmed,Deaths,New cases,New deaths
58207,2020-09-04,Texas,90048,0,0,0,0
58208,2020-09-05,Texas,90048,0,0,0,0
58209,2020-09-06,Texas,90048,0,0,0,0
58210,2020-09-07,Texas,90048,0,0,0,0
58211,2020-09-08,Texas,90048,0,0,0,0


In [None]:
# 'Date' is already in YYYY-MM-DD form in this file, so no fixDate pass is needed.
# reset_index returns a new frame; keep the result instead of discarding it.
CA_covidcases = CA_covidcases.reset_index(drop=True)
CA_covidcases.head(2)

Unnamed: 0,Date,Province_State,FIPS,Confirmed,Deaths,New cases,New deaths
0,2020-01-22,Texas,48001,0,0,0,0
1,2020-01-23,Texas,48001,0,0,0,0


In [None]:
# Make the FIPS key an integer so it matches the integer county codes in the
# mask, land-area and population frames it is merged with below.
CA_covidcases['FIPS'] = CA_covidcases['FIPS'].astype('int64')

In [None]:
CA_covidcases.head(2)

Unnamed: 0,Date,Province_State,FIPS,Confirmed,Deaths,New cases,New deaths
0,2020-01-22,Texas,48001,0,0,0,0
1,2020-01-23,Texas,48001,0,0,0,0


In [None]:
CA_covidcases.shape

(58212, 7)

## Combining mask data - TX counties

In [None]:
# Flag every row dated on or after the state's mask-law date with 1, else 0.
# Both sides are ISO 'YYYY-MM-DD' strings, so string comparison orders them
# chronologically. The law date is looked up once instead of once per row.
tx_mask_law_date = CA_mask_rule['Law Date'].values[0]
CA_covidcases['mask_rule_active'] = (CA_covidcases['Date'] >= tx_mask_law_date).astype(int)


In [None]:
#Mask
CA_mask = pd.merge(CA_covidcases,mask_data,how='inner',left_on=['FIPS'],right_on=['COUNTYFP'])
CA_mask.head(2)


Unnamed: 0,Date,Province_State,FIPS,Confirmed,Deaths,New cases,New deaths,mask_rule_active,COUNTYFP,NEVER,RARELY,SOMETIMES,FREQUENTLY,ALWAYS
0,2020-01-22,Texas,48001,0,0,0,0,0,48001,0.172,0.104,0.095,0.088,0.541
1,2020-01-23,Texas,48001,0,0,0,0,0,48001,0.172,0.104,0.095,0.088,0.541


In [None]:
CA_mask.shape

(57981, 14)

In [None]:
def updateMaskWearingPercent(x):
  """Return the row's ``ALWAYS`` value while the mask rule is active, else 0.0.

  ``x`` is any object exposing ``mask_rule_active`` and ``ALWAYS`` attributes
  (in this notebook, a DataFrame row passed by ``apply(..., axis=1)``).
  """
  return x.ALWAYS if x.mask_rule_active == 1 else 0.0

In [None]:
# mask_wearing_percent = the county's 'ALWAYS' share on days when the mask rule
# is active, and 0.0 otherwise. Series.where keeps values where the condition
# holds and substitutes 0.0 elsewhere, replacing a Python-level row-by-row apply.
CA_mask['mask_wearing_percent'] = CA_mask['ALWAYS'].where(CA_mask['mask_rule_active'] == 1, 0.0)

CA_mask.tail()

Unnamed: 0,Date,Province_State,FIPS,Confirmed,Deaths,New cases,New deaths,mask_rule_active,COUNTYFP,NEVER,RARELY,SOMETIMES,FREQUENTLY,ALWAYS,mask_wearing_percent
57976,2020-09-04,Texas,48507,266,12,0,1,1,48507,0.036,0.075,0.115,0.138,0.635,0.635
57977,2020-09-05,Texas,48507,266,13,0,1,1,48507,0.036,0.075,0.115,0.138,0.635,0.635
57978,2020-09-06,Texas,48507,266,13,0,0,1,48507,0.036,0.075,0.115,0.138,0.635,0.635
57979,2020-09-07,Texas,48507,266,13,0,0,1,48507,0.036,0.075,0.115,0.138,0.635,0.635
57980,2020-09-08,Texas,48507,264,13,-2,0,1,48507,0.036,0.075,0.115,0.138,0.635,0.635


In [None]:
CA_mask.head()

Unnamed: 0,Date,Province_State,FIPS,Confirmed,Deaths,New cases,New deaths,mask_rule_active,COUNTYFP,NEVER,RARELY,SOMETIMES,FREQUENTLY,ALWAYS,mask_wearing_percent
0,2020-01-22,Texas,48001,0,0,0,0,0,48001,0.172,0.104,0.095,0.088,0.541,0.0
1,2020-01-23,Texas,48001,0,0,0,0,0,48001,0.172,0.104,0.095,0.088,0.541,0.0
2,2020-01-24,Texas,48001,0,0,0,0,0,48001,0.172,0.104,0.095,0.088,0.541,0.0
3,2020-01-25,Texas,48001,0,0,0,0,0,48001,0.172,0.104,0.095,0.088,0.541,0.0
4,2020-01-26,Texas,48001,0,0,0,0,0,48001,0.172,0.104,0.095,0.088,0.541,0.0


## Combining population density data

In [None]:
# Pop density
#landarea_data[landarea_data['fips'] == 36081]['LND110210']

CA_landarea_df = pd.merge(CA_mask,landarea_data,how='inner',left_on=['FIPS'],right_on=['fips'])
CA_landarea_df.head(2)

Unnamed: 0,Date,Province_State,FIPS,Confirmed,Deaths,New cases,New deaths,mask_rule_active,COUNTYFP,NEVER,RARELY,SOMETIMES,FREQUENTLY,ALWAYS,mask_wearing_percent,fips,PST045212,PST040210,PST120212,POP010210,AGE135212,AGE295212,AGE775212,SEX255212,RHI125212,RHI225212,RHI325212,RHI425212,RHI525212,RHI625212,RHI725212,RHI825212,POP715211,POP645211,POP815211,EDU635211,EDU685211,VET605211,LFE305211,HSG010211,HSG445211,HSG096211,HSG495211,HSD410211,HSD310211,INC910211,INC110211,PVY020211,BZA010211,BZA110211,BZA115211,NES010211,SBO001207,SBO315207,SBO115207,SBO215207,SBO515207,SBO415207,SBO015207,MAN450207,WTN220207,RTN130207,RTN131207,AFN120207,BPS030212,LND110210,POP060210
0,2020-01-22,Texas,48001,0,0,0,0,0,48001,0.172,0.104,0.095,0.088,0.541,0.0,48001,58190,58458,-0.5,58458,5.2,19.3,13.1,39.0,75.3,21.7,0.6,0.8,0.1,1.4,16.5,60.5,82.7,6.2,11.3,78.7,11.5,5014,23.6,20201,71.6,8.8,82900,15992,3.03,18487,40577,18.8,938,11415,1.1,2864,3358,0.0,0.0,2.6,0.0,0.0,0.0,0,365343,495496,8731,48576,3,1062.6,55.0
1,2020-01-23,Texas,48001,0,0,0,0,0,48001,0.172,0.104,0.095,0.088,0.541,0.0,48001,58190,58458,-0.5,58458,5.2,19.3,13.1,39.0,75.3,21.7,0.6,0.8,0.1,1.4,16.5,60.5,82.7,6.2,11.3,78.7,11.5,5014,23.6,20201,71.6,8.8,82900,15992,3.03,18487,40577,18.8,938,11415,1.1,2864,3358,0.0,0.0,2.6,0.0,0.0,0.0,0,365343,495496,8731,48576,3,1062.6,55.0


In [None]:
CA_landarea_df.shape

(57981, 67)

In [None]:
CA_landarea = pd.merge(CA_landarea_df,CA_population,how='inner',left_on=['FIPS'],right_on=['countyFIPS'])

CA_landarea.head(2)

Unnamed: 0,Date,Province_State,FIPS,Confirmed,Deaths,New cases,New deaths,mask_rule_active,COUNTYFP,NEVER,RARELY,SOMETIMES,FREQUENTLY,ALWAYS,mask_wearing_percent,fips,PST045212,PST040210,PST120212,POP010210,AGE135212,AGE295212,AGE775212,SEX255212,RHI125212,RHI225212,RHI325212,RHI425212,RHI525212,RHI625212,RHI725212,RHI825212,POP715211,POP645211,POP815211,EDU635211,EDU685211,VET605211,LFE305211,HSG010211,HSG445211,HSG096211,HSG495211,HSD410211,HSD310211,INC910211,INC110211,PVY020211,BZA010211,BZA110211,BZA115211,NES010211,SBO001207,SBO315207,SBO115207,SBO215207,SBO515207,SBO415207,SBO015207,MAN450207,WTN220207,RTN130207,RTN131207,AFN120207,BPS030212,LND110210,POP060210,countyFIPS,County Name,State,population
0,2020-01-22,Texas,48001,0,0,0,0,0,48001,0.172,0.104,0.095,0.088,0.541,0.0,48001,58190,58458,-0.5,58458,5.2,19.3,13.1,39.0,75.3,21.7,0.6,0.8,0.1,1.4,16.5,60.5,82.7,6.2,11.3,78.7,11.5,5014,23.6,20201,71.6,8.8,82900,15992,3.03,18487,40577,18.8,938,11415,1.1,2864,3358,0.0,0.0,2.6,0.0,0.0,0.0,0,365343,495496,8731,48576,3,1062.6,55.0,48001,Anderson County,TX,57735
1,2020-01-23,Texas,48001,0,0,0,0,0,48001,0.172,0.104,0.095,0.088,0.541,0.0,48001,58190,58458,-0.5,58458,5.2,19.3,13.1,39.0,75.3,21.7,0.6,0.8,0.1,1.4,16.5,60.5,82.7,6.2,11.3,78.7,11.5,5014,23.6,20201,71.6,8.8,82900,15992,3.03,18487,40577,18.8,938,11415,1.1,2864,3358,0.0,0.0,2.6,0.0,0.0,0.0,0,365343,495496,8731,48576,3,1062.6,55.0,48001,Anderson County,TX,57735


In [None]:
CA_landarea.shape

(57981, 71)

In [None]:
CA_landarea['population_density']= CA_landarea['population']/CA_landarea['LND110210']
CA_landarea = CA_landarea[['Date','fips','New cases','County Name','State','mask_rule_active','mask_wearing_percent',  'population','LND110210','population_density']].copy()
CA_landarea.tail(3)

Unnamed: 0,Date,fips,New cases,County Name,State,mask_rule_active,mask_wearing_percent,population,LND110210,population_density
57978,2020-09-06,48507,0,Zavala County,TX,1,0.635,11840,1297.41,9.125874
57979,2020-09-07,48507,0,Zavala County,TX,1,0.635,11840,1297.41,9.125874
57980,2020-09-08,48507,-2,Zavala County,TX,1,0.635,11840,1297.41,9.125874


## Combining mobility data 

In [None]:
# Mobility 
CA_mobility_data.head(2)

Unnamed: 0,state,county,date,retail and recreation,grocery and pharmacy,parks,transit stations,workplaces,residential,driving,transit,walking
0,Texas,Total,2020-02-15,3.0,3.0,12.0,3.0,1.0,-1.0,32.12,6.25,69.27
1,Texas,Total,2020-02-16,8.0,3.0,20.0,4.0,1.0,-1.0,-0.3,-8.77,17.11


In [None]:
CA_mobility_combined = pd.merge(CA_landarea,CA_mobility_data,how='inner',left_on=['Date','County Name'],right_on=['date','county'])
CA_mobility_combined.head(2)

Unnamed: 0,Date,fips,New cases,County Name,State,mask_rule_active,mask_wearing_percent,population,LND110210,population_density,state,county,date,retail and recreation,grocery and pharmacy,parks,transit stations,workplaces,residential,driving,transit,walking
0,2020-01-22,48001,0,Anderson County,TX,0,0.0,57735,1062.6,54.33371,Texas,Anderson County,2020-01-22,0.0,0.0,0.0,0.0,0.0,0.0,1.8,0.0,0.0
1,2020-01-23,48001,0,Anderson County,TX,0,0.0,57735,1062.6,54.33371,Texas,Anderson County,2020-01-23,0.0,0.0,0.0,0.0,0.0,0.0,19.6,0.0,0.0


In [None]:
CA_mobility_combined.shape

(47162, 22)

## Combining social distancing 

In [None]:
newyork_socialdistancing_data.head(2)

Unnamed: 0,date,country_code,admin_level,admin1,admin2,fips,samples,m50,m50_index
409980,2020-03-01,US,2,Texas,Anderson County,48001.0,1210,8.91,79
409981,2020-03-02,US,2,Texas,Anderson County,48001.0,1222,10.708,95


In [None]:
# Cast the float FIPS codes (e.g. 48001.0) to integers.
# assign() returns a new frame, which avoids writing a column into the filtered
# slice of socialdistancing_data (SettingWithCopyWarning / ambiguous write).
newyork_socialdistancing_data = newyork_socialdistancing_data.assign(
    fips=newyork_socialdistancing_data['fips'].astype(int)
)
newyork_socialdistancing_data.tail(2)

Unnamed: 0,date,country_code,admin_level,admin1,admin2,fips,samples,m50,m50_index
449269,2020-09-06,US,2,Texas,Zavala County,48507,286,2.128,22
449270,2020-09-07,US,2,Texas,Zavala County,48507,264,3.128,32


In [None]:
CA_mobility_combined.shape

(47162, 22)

In [None]:
CA_socialDistancing_combined = pd.merge(CA_mobility_combined,newyork_socialdistancing_data,how='inner',left_on=['Date','County Name'],right_on=['date','admin2'])
CA_socialDistancing_combined.head(2)

Unnamed: 0,Date,fips_x,New cases,County Name,State,mask_rule_active,mask_wearing_percent,population,LND110210,population_density,state,county,date_x,retail and recreation,grocery and pharmacy,parks,transit stations,workplaces,residential,driving,transit,walking,date_y,country_code,admin_level,admin1,admin2,fips_y,samples,m50,m50_index
0,2020-03-01,48001,0,Anderson County,TX,0,0.0,57735,1062.6,54.33371,Texas,Anderson County,2020-03-01,16.0,15.0,0.0,0.0,3.0,0.0,27.49,0.0,0.0,2020-03-01,US,2,Texas,Anderson County,48001,1210,8.91,79
1,2020-03-02,48001,0,Anderson County,TX,0,0.0,57735,1062.6,54.33371,Texas,Anderson County,2020-03-02,14.0,7.0,0.0,0.0,2.0,-1.0,27.76,0.0,0.0,2020-03-02,US,2,Texas,Anderson County,48001,1222,10.708,95


In [None]:
CA_socialDistancing_combined.shape

(36779, 31)

## Filtering columns for feature selection

In [None]:
columns = ['Date','fips_x','County Name','retail and recreation','grocery and pharmacy','parks','transit stations','workplaces','residential','driving','m50','m50_index','population_density','mask_rule_active','mask_wearing_percent','New cases']
CA_All_combined = CA_socialDistancing_combined[columns]

CA_All_combined.head(2)


Unnamed: 0,Date,fips_x,County Name,retail and recreation,grocery and pharmacy,parks,transit stations,workplaces,residential,driving,m50,m50_index,population_density,mask_rule_active,mask_wearing_percent,New cases
0,2020-03-01,48001,Anderson County,16.0,15.0,0.0,0.0,3.0,0.0,27.49,8.91,79,54.33371,0,0.0,0
1,2020-03-02,48001,Anderson County,14.0,7.0,0.0,0.0,2.0,-1.0,27.76,10.708,95,54.33371,0,0.0,0


In [None]:
CA_All_combined.shape

(36779, 16)

### Save the combined file - TX

In [None]:
# File name carries the run date as e.g. 'Sep10'.
filename = 'TX_combined_'+datetime.now().strftime("%b%d")+'.csv' #%Y%m%d
print(filename)
# Write to the shared Drive folder, the same destination used for the
# social-distancing-inertia export, instead of the notebook's working directory.
CA_All_combined.to_csv('/content/drive/Shared drives/CMPE 295- Master Project/Covid19-data/'+filename,index=False)


TX_combined_Sep10.csv


# 4.Merging Social Distancing Inertia (Maryland data) and Covid data integration

In [None]:
CA_covidcases.dtypes

Date                object
Province_State      object
FIPS                 int64
Confirmed            int64
Deaths               int64
New cases            int64
New deaths           int64
mask_rule_active     int64
dtype: object

In [None]:
print(maryland_sd_inertia.shape)
CA_covidcases.shape

(732086, 45)


(58212, 8)

In [None]:
CA_socialDistancingInertiaData = pd.merge(CA_covidcases,maryland_sd_inertia,how='inner',left_on=['FIPS','Date'],right_on=['CTFIPS','date'])
print(CA_socialDistancingInertiaData.shape)

CA_socialDistancingInertiaData.head(2)

(53212, 53)


Unnamed: 0,Date,Province_State,FIPS,Confirmed,Deaths,New cases,New deaths,mask_rule_active,CTFIPS,CTNAME,STFIPS,Social distancing index,% staying home,Trips/person,% out-of-county trips,% out-of-state trips,Miles/person,Work trips/person,Non-work trips/person,New COVID cases,Population,% change in consumption,date,Transit mode share,% people older than 60,Median income,% African Americans,% Hispanic Americans,% Male,Population density,Employment density,# hot spots/1000 people,Hospital beds/1000 people,ICUs/1000 people,# contact tracing workers/1000 people,# contact tracing workers/1000 people rank,% people older than 60 rank,COVID exposure/1000 people,#days: decreasing ILI cases,Unemployment claims/1000 people,Unemployment rate,% working from home,Cumulative inflation rate,COVID death rate,New cases/1000 people,Active cases/1000 people,#days: decreasing COVID cases,% hospital bed utilization,Testing capacity,Tests done/1000 people,% ICU utilization,Ventilator shortage,Imported COVID cases
0,2020-01-22,Texas,48001,0,0,0,0,0,48001,Anderson County,48,15,15,3.31,20.6,0.3,44.4,0.54,2.77,0,58057,-6.2,2020-01-22,0.22,20,43355,21.0,17.5,61.2,54,12,95,2.82,0.3,0.04,26.0,16,0.0,21,0.5,3.5,4.6,0.2,0.0,0.0,0.0,0,47.7,0.0,0.0,0.0,0,0
1,2020-01-23,Texas,48001,0,0,0,0,0,48001,Anderson County,48,14,15,3.41,20.1,0.3,47.7,0.54,2.87,0,58057,-2.9,2020-01-23,0.22,20,43355,21.0,17.5,61.2,54,12,95,2.82,0.3,0.04,26.0,16,0.0,21,0.5,3.5,4.6,0.2,0.0,0.0,0.0,0,47.7,0.0,0.0,0.0,0,0


### Save the Maryland data file - TX

In [None]:
filename = 'TX_socialDistancingInertiaData_'+datetime.now().strftime("%b%d")+'.csv' #%Y%m%d
print(filename)
CA_socialDistancingInertiaData.to_csv('/content/drive/Shared drives/CMPE 295- Master Project/Covid19-data/'+filename,index=False)

TX_socialDistancingInertiaData_Sep10.csv


# Feature Selection - Weightage

### Gini index

In [None]:
CA_All_combined = CA_All_combined.fillna(0)

In [None]:
def gini(list_of_values):
    """Return the Gini coefficient of a collection of numbers.

    The values are sorted in ascending order and the area under their
    cumulative-sum curve is compared with the area under the straight line
    that a perfectly even distribution would produce.

    Parameters
    ----------
    list_of_values : iterable of numbers
        Any iterable is accepted (list, tuple, generator, ...).

    Returns
    -------
    float
        0.0 when all values are equal, growing towards 1.0 as the total
        concentrates in fewer elements.  Empty input and all-zero input
        return 0.0.  If values of mixed sign sum to exactly zero the
        coefficient is undefined and ``nan`` is returned.

    Notes
    -----
    The coefficient is only meaningful for non-negative data.  Negative
    inputs are still processed (callers in this notebook pass such columns
    and expect a number back), but a ``RuntimeWarning`` is emitted because
    the result can fall outside the [0, 1] range.
    """
    import warnings

    # Materialise once so generators work and the length is always available.
    sorted_list = sorted(list_of_values)
    if not sorted_list:
        return 0.0

    if sorted_list[0] < 0:
        warnings.warn(
            "gini() received negative values; the result is not a valid "
            "Gini coefficient and may fall outside [0, 1].",
            RuntimeWarning,
            stacklevel=2,
        )

    height, area = 0, 0
    for value in sorted_list:
        height += value
        # Trapezoid under the cumulative curve for this element.
        area += height - value / 2.
    fair_area = height * len(sorted_list) / 2.

    if fair_area == 0:
        # Zero total: all-zero data is perfectly equal; anything else
        # (mixed signs cancelling out) has no defined coefficient.
        return 0.0 if sorted_list[0] == sorted_list[-1] else float("nan")
    return (fair_area - area) / fair_area

In [None]:
# Compute the Gini coefficient of every candidate feature column and keep
# the results in a {column name: coefficient} dictionary.

feature_cols = [
    'fips_x', 'retail and recreation', 'grocery and pharmacy', 'parks',
    'transit stations', 'workplaces', 'residential', 'driving',
    'm50', 'm50_index', 'population_density', 'mask_rule_active',
    'mask_wearing_percent',
]
gini_values = {}
for col in feature_cols:
    # gini() works on a plain Python list of the column's values.
    value = gini(CA_All_combined[col].values.tolist())
    print('cols ',col,' - gini coeff is :',value)
    gini_values[col] = value

cols  fips_x  - gini coeff is : 0.001808066230164317
cols  retail and recreation  - gini coeff is : -1.0876275682186483
cols  grocery and pharmacy  - gini coeff is : -8.630562796031356
cols  parks  - gini coeff is : 3.2541891156403038
cols  transit stations  - gini coeff is : -1.431246493668176
cols  workplaces  - gini coeff is : -0.33302310669037083
cols  residential  - gini coeff is : 0.747503022158672
cols  driving  - gini coeff is : 0.950573329872049
cols  m50  - gini coeff is : 0.4584704638409554
cols  m50_index  - gini coeff is : 0.2775535886737779
cols  population_density  - gini coeff is : 0.7661633870623695
cols  mask_rule_active  - gini coeff is : 0.6462655319611735
cols  mask_wearing_percent  - gini coeff is : 0.6836151401216206


In [None]:
# Rank the features from lowest to highest Gini coefficient.
sorted_x = sorted(
    gini_values.items(),
    key=lambda name_and_coeff: name_and_coeff[1],
)
sorted_x

[('grocery and pharmacy', -8.630562796031356),
 ('transit stations', -1.431246493668176),
 ('retail and recreation', -1.0876275682186483),
 ('workplaces', -0.33302310669037083),
 ('fips_x', 0.001808066230164317),
 ('m50_index', 0.2775535886737779),
 ('m50', 0.4584704638409554),
 ('mask_rule_active', 0.6462655319611735),
 ('mask_wearing_percent', 0.6836151401216206),
 ('residential', 0.747503022158672),
 ('population_density', 0.7661633870623695),
 ('driving', 0.950573329872049),
 ('parks', 3.2541891156403038)]

## Random Forest - Feature selection

In [None]:
# Summary statistics, transposed so that each column of the frame is one row.
CA_All_combined.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fips_x,36779.0,48257.636069,151.167774,48001.0,48127.0,48257.0,48389.0,48507.0
retail and recreation,36779.0,-7.047636,15.069244,-93.0,-14.0,0.0,0.0,187.0
grocery and pharmacy,36779.0,-0.593029,10.689865,-90.0,-3.0,0.0,1.0,83.0
parks,36779.0,2.254466,20.976831,-90.0,0.0,0.0,0.0,329.0
transit stations,36779.0,-5.036543,15.849895,-76.0,-8.0,0.0,0.0,141.0
workplaces,36779.0,-22.994481,13.722043,-89.0,-32.0,-25.0,-15.0,40.0
residential,36779.0,3.839827,5.975973,-29.0,0.0,0.0,8.0,33.0
driving,36779.0,26.919971,47.054811,-78.66,-2.22,21.64,53.83,787.84
m50,36779.0,10.899393,25.749275,0.025,4.465,7.812,12.9455,642.057
m50_index,36779.0,68.254004,48.922672,0.0,46.0,66.0,84.0,1577.0


In [None]:
# Predictor columns used by the tree-based feature-importance cells.
X_cols = [
    'fips_x',
    'retail and recreation',
    'grocery and pharmacy',
    'parks',
    'transit stations',
    'workplaces',
    'residential',
    'driving',
    'm50',
    'm50_index',
    'population_density',
    'mask_rule_active',
    'mask_wearing_percent',
]
# Target column, kept as a list so indexing with it yields a DataFrame.
Y_cols = ['New cases']

In [None]:
# Split the combined frame into the predictor frame and the target frame.
x_train_c, y_train_c = CA_All_combined[X_cols], CA_All_combined[Y_cols]

In [None]:
# Display the predictor frame to confirm the selected columns.
x_train_c

Unnamed: 0,fips_x,retail and recreation,grocery and pharmacy,parks,transit stations,workplaces,residential,driving,m50,m50_index,population_density,mask_rule_active,mask_wearing_percent
0,48001,16.0,15.0,0.0,0.0,3.0,0.0,27.49,8.910,79,54.333710,0,0.000
1,48001,14.0,7.0,0.0,0.0,2.0,-1.0,27.76,10.708,95,54.333710,0,0.000
2,48001,21.0,8.0,0.0,0.0,2.0,-2.0,28.24,12.118,107,54.333710,0,0.000
3,48001,12.0,5.0,0.0,0.0,1.0,2.0,20.72,9.156,81,54.333710,0,0.000
4,48001,18.0,13.0,0.0,0.0,3.0,-3.0,47.52,11.230,100,54.333710,0,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
36774,48507,0.0,0.0,0.0,0.0,-30.0,0.0,0.00,2.292,23,9.125874,1,0.635
36775,48507,0.0,0.0,0.0,0.0,-30.0,0.0,0.00,3.324,34,9.125874,1,0.635
36776,48507,0.0,0.0,0.0,0.0,-31.0,0.0,0.00,7.657,79,9.125874,1,0.635
36777,48507,0.0,0.0,0.0,0.0,-33.0,0.0,0.00,3.479,36,9.125874,1,0.635


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Create the random forest. A fixed random_state makes the fitted trees, and
# therefore the reported importances, identical on every run.
# NOTE(review): 'New cases' is a count, so the classifier treats every
# distinct count as its own class — consider RandomForestRegressor instead.
rfc = RandomForestClassifier(n_estimators=150, random_state=7)

# Fit the model. y_train_c is a single-column DataFrame; scikit-learn expects
# a 1-D target, so flatten it to avoid a DataConversionWarning.
rfc.fit(x_train_c, y_train_c.values.ravel())

# Importance of each feature in the fitted forest.
f_importances = rfc.feature_importances_

# Pair every feature name with its importance for display.
final_df = pd.DataFrame({"Features": x_train_c.columns, "Importances": f_importances})

# Most important features first.
final_df.sort_values('Importances', ascending=False)

## Decision tree- feature selection

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel

# This cell needs x_train_c / y_train_c from the Random Forest section; run
# that section first on a fresh kernel.
# A fixed random_state makes the tree (and its importances) reproducible,
# because candidate features are otherwise permuted randomly at each split.
clf = DecisionTreeClassifier(random_state=7)

# Flatten the single-column target frame to the 1-D shape scikit-learn expects.
clf = clf.fit(x_train_c, y_train_c.values.ravel())
f=clf.feature_importances_

# Pair every feature name with its importance for display.
final_dfs = pd.DataFrame({"Features": x_train_c.columns, "Importances":f})

# Most important features first.
final_dfs.sort_values('Importances', ascending=False)

NameError: ignored

## AutoML Implementation

In [None]:
# Predictor columns for the model-comparison section.
X_cols = [
    'fips_x',
    'retail and recreation',
    'grocery and pharmacy',
    'parks',
    'transit stations',
    'workplaces',
    'residential',
    'driving',
    'm50',
    'm50_index',
    'population_density',
    'mask_rule_active',
    'mask_wearing_percent',
]


In [None]:
# Convert the 'Date' column to pandas datetime values and store it back.
parsed_dates = pd.to_datetime(CA_All_combined['Date'])
CA_All_combined['Date'] = parsed_dates

In [None]:
# Show the dtype of every column (follows the 'Date' conversion in the previous cell).
CA_All_combined.dtypes

In [None]:
# Feature matrix and single-column target frame for the model comparison.
X, Y = CA_All_combined[X_cols], CA_All_combined[['New cases']]

# Show the target's (rows, columns) shape.
Y.shape

In [None]:
# Hold out 20% of the rows for testing; random_state=1 fixes the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Settings for the model comparison.
num_folds = 12
seed = 7  # shared seed for the models whose fitting is randomised
scoring = 'accuracy'

# (label, estimator) pairs to compare. The tree-based models receive `seed`
# so that repeated runs of the notebook report the same scores.
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(random_state=seed)),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
    ('RF', RandomForestClassifier(n_estimators=50, random_state=seed)),
    ('XGBoost', XGBClassifier()),
]

In [None]:
# Show the (rows, columns) shape of the training target.
Y_train.shape

In [None]:
# Accuracy of each model on the held-out split, in the same order as `names`.
results = []
names = []

# Y_train / Y_test are single-column DataFrames; scikit-learn estimators
# expect 1-D targets, so flatten them once up front.
y_train_1d = Y_train.values.ravel()
y_test_1d = Y_test.values.ravel()

for name, model in models:
    # Fit on the training split and score on the test split.
    model.fit(X_train, y_train_1d)
    Y_pred = model.predict(X_test)
    accu_score = accuracy_score(y_test_1d, Y_pred)

    # Record the score so the models can be compared after the loop.
    results.append(accu_score)
    names.append(name)
    print(name + ": " + str(accu_score))

# Reference 

* https://medium.com/next-gen-machine-learning/feature-selection-best-methods-for-feature-selection-python-f3536aad5b4a