In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


# EDA
1. Read each daily dataset into a dictionary, then concatenate them into a single DataFrame so the data can be managed as a whole.
<br><br>
2. Extract only the columns needed for prediction:
      'Province_State', 
      'Last_Update', 
      'Confirmed', 
      'Deaths', 
      'Recovered', 
      'Active', 
      'Incident_Rate',
      'Mortality_Rate',
      'People_Tested',
      'Testing_Rate',
      'Hospitalization_Rate'
      
      To reduce space and time, I will use columns which I feel are MOST important: 
      'Province_State', 'Last_Update', 'Confirmed', 'Deaths', 'Recovered', 'Active',
      'People_Tested', 'Testing_Rate', 'Mortality_Rate', 'Incident_Rate'


In [2]:
import os

# Gather the names of all daily report files, sorted by file name
csv_list = [
    entry
    for entry in sorted(os.listdir("csse_covid19_daily_us/."))
    if entry.endswith(".csv")
]

# Sanity check: how many files were found, plus a peek at the first five
print(len(csv_list))
csv_list[:5]

100


['04-12-2020.csv',
 '04-13-2020.csv',
 '04-14-2020.csv',
 '04-15-2020.csv',
 '04-16-2020.csv']

In [3]:
# Read each daily CSV named in csv_list, keep only the columns in feature_list,
# and store the filtered frame in a dict keyed by the file's position in csv_list.
feature_list = ['Province_State', 'Last_Update', 'Confirmed', 'Deaths',
                'Recovered', 'Active', 'People_Tested', 'Testing_Rate',
                'Mortality_Rate', 'Incident_Rate']

daily_states_dict = {}

for i, csv_name in enumerate(csv_list):
    csv_str = 'csse_covid19_daily_us/' + csv_name
    today_df = pd.read_csv(csv_str, sep=',')
    filter_df = today_df[feature_list]
    daily_states_dict[i] = filter_df

# Compare against the number of files actually found instead of a hard-coded
# 100, so the check stays meaningful when the data folder grows or shrinks.
# An empty file list is treated as a failure.
expected_days = len(csv_list)
if expected_days > 0 and len(daily_states_dict) == expected_days:
    print(f"Appended all {expected_days} days and corresponding dataframes.")
else:
    print("Loop didn't work correctly.")

Appended all 100 days and corresponding dataframes.


In [4]:
# Stack the per-day frames into one long frame; the dict keys become the
# outer level of the resulting index
df = pd.concat(daily_states_dict)
df

Unnamed: 0,Unnamed: 1,Province_State,Last_Update,Confirmed,Deaths,Recovered,Active,People_Tested,Testing_Rate,Mortality_Rate,Incident_Rate
0,0,Alabama,2020-04-12 23:18:15,3563,93,,3470.0,21583.0,460.300152,2.610160,75.988020
0,1,Alaska,2020-04-12 23:18:15,272,8,66.0,264.0,8038.0,1344.711576,2.941176,45.504049
0,2,Arizona,2020-04-12 23:18:15,3542,115,,3427.0,42109.0,578.522286,3.246753,48.662422
0,3,Arkansas,2020-04-12 23:18:15,1280,27,367.0,1253.0,19722.0,761.753354,2.109375,49.439423
0,4,California,2020-04-12 23:18:15,22795,640,,22155.0,190328.0,485.423868,2.812020,58.137726
...,...,...,...,...,...,...,...,...,...,...,...
99,53,Virginia,2020-07-21 04:38:59,78375,2031,10107.0,66237.0,920461.0,10783.890236,2.591388,918.221845
99,54,Washington,2020-07-21 04:38:59,47743,1453,,46290.0,809339.0,10628.369959,3.043378,626.968757
99,55,West Virginia,2020-07-21 04:38:59,5084,100,3466.0,1518.0,234980.0,13111.647649,1.966955,283.682086
99,56,Wisconsin,2020-07-21 04:38:59,43018,846,33130.0,9042.0,783866.0,13462.857630,1.966619,738.831904


In [5]:
# Remove the multi-level index left by the concat (rows get a fresh 0..n-1 index);
# the rows will be grouped by date later
df_reidx = df.reset_index(drop=True)
df_reidx

Unnamed: 0,Province_State,Last_Update,Confirmed,Deaths,Recovered,Active,People_Tested,Testing_Rate,Mortality_Rate,Incident_Rate
0,Alabama,2020-04-12 23:18:15,3563,93,,3470.0,21583.0,460.300152,2.610160,75.988020
1,Alaska,2020-04-12 23:18:15,272,8,66.0,264.0,8038.0,1344.711576,2.941176,45.504049
2,Arizona,2020-04-12 23:18:15,3542,115,,3427.0,42109.0,578.522286,3.246753,48.662422
3,Arkansas,2020-04-12 23:18:15,1280,27,367.0,1253.0,19722.0,761.753354,2.109375,49.439423
4,California,2020-04-12 23:18:15,22795,640,,22155.0,190328.0,485.423868,2.812020,58.137726
...,...,...,...,...,...,...,...,...,...,...
5813,Virginia,2020-07-21 04:38:59,78375,2031,10107.0,66237.0,920461.0,10783.890236,2.591388,918.221845
5814,Washington,2020-07-21 04:38:59,47743,1453,,46290.0,809339.0,10628.369959,3.043378,626.968757
5815,West Virginia,2020-07-21 04:38:59,5084,100,3466.0,1518.0,234980.0,13111.647649,1.966955,283.682086
5816,Wisconsin,2020-07-21 04:38:59,43018,846,33130.0,9042.0,783866.0,13462.857630,1.966619,738.831904


### Clean up the concatenated dataframe


In [6]:
# Build a 'YYYY-MM-DD' string column from the Last_Update timestamps, then
# discard the original timestamp column
last_update = pd.to_datetime(df_reidx['Last_Update'])
df_reidx['date'] = last_update.dt.strftime('%Y-%m-%d')
del df_reidx['Last_Update']
df_reidx
# note: df.set_index('date', inplace=True) would only make date the index; it does not group rows by date

Unnamed: 0,Province_State,Confirmed,Deaths,Recovered,Active,People_Tested,Testing_Rate,Mortality_Rate,Incident_Rate,date
0,Alabama,3563,93,,3470.0,21583.0,460.300152,2.610160,75.988020,2020-04-12
1,Alaska,272,8,66.0,264.0,8038.0,1344.711576,2.941176,45.504049,2020-04-12
2,Arizona,3542,115,,3427.0,42109.0,578.522286,3.246753,48.662422,2020-04-12
3,Arkansas,1280,27,367.0,1253.0,19722.0,761.753354,2.109375,49.439423,2020-04-12
4,California,22795,640,,22155.0,190328.0,485.423868,2.812020,58.137726,2020-04-12
...,...,...,...,...,...,...,...,...,...,...
5813,Virginia,78375,2031,10107.0,66237.0,920461.0,10783.890236,2.591388,918.221845,2020-07-21
5814,Washington,47743,1453,,46290.0,809339.0,10628.369959,3.043378,626.968757,2020-07-21
5815,West Virginia,5084,100,3466.0,1518.0,234980.0,13111.647649,1.966955,283.682086,2020-07-21
5816,Wisconsin,43018,846,33130.0,9042.0,783866.0,13462.857630,1.966619,738.831904,2020-07-21


In [21]:
# Inspect a single day (2020-05-13) to judge how informative each feature is

rand_df = df_reidx[df_reidx['date'] == '2020-05-13']
rand_df_bystate = (
    rand_df
    .set_index('Province_State')
    .drop(columns='date')
    # the two cruise ships are not states
    .drop(index=['Grand Princess', 'Diamond Princess'])
    # treat unreported recoveries / mortality rates as 0
    .fillna({'Recovered': 0, 'Mortality_Rate': 0})
)
rand_df_bystate.head()
# Confirmed, Recovered, Active and Deaths look like the most robust features
# for determining risk ranges of Incident_Rate

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Active,People_Tested,Testing_Rate,Mortality_Rate,Incident_Rate
Province_State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alabama,10464,435,0.0,10029.0,133218.0,2841.137265,4.15711,223.165491
Alaska,383,10,334.0,39.0,29961.0,5012.304496,2.610966,64.073717
American Samoa,0,0,0.0,0.0,105.0,188.709764,0.0,0.0
Arizona,11736,562,2909.0,8265.0,122842.0,1687.687542,4.788684,161.237207
Arkansas,4164,95,3220.0,849.0,70444.0,2720.867724,2.28146,160.832622


In [22]:
# Peek at the following day (2020-05-14) for comparison with the day above
df_reidx.loc[df_reidx['date'].eq('2020-05-14')].head()

Unnamed: 0,Province_State,Confirmed,Deaths,Recovered,Active,People_Tested,Testing_Rate,Mortality_Rate,Incident_Rate,date
1816,Alabama,10700,450,,10250.0,136372.0,2908.402552,4.205607,228.198657,2020-05-14
1817,Alaska,383,10,338.0,35.0,30649.0,5127.402973,2.610966,64.073717,2020-05-14
1818,American Samoa,0,0,,0.0,105.0,188.709764,,0.0,2020-05-14
1819,Arizona,12216,595,2979.0,8642.0,127750.0,1755.117008,4.870661,167.831776,2020-05-14
1820,Arkansas,4236,97,3220.0,919.0,73215.0,2827.896349,2.289896,163.613589,2020-05-14


    Looking over two consecutive days' value changes, it makes more sense to work with
    the day-over-day difference of each value, and then use the average of those changes (for y as well).

##### Manage missing values
    - for dates, use forward fill (fillna(method='ffill')); whether this is valid depends on where the state's row sits within the df

    - core feature values (confirmed, deaths, active): fill with 0
    
    - if there are nulls in the remaining feature values, review them and fill with 0

In [23]:
# Summarise dtypes and non-null counts per column, then count missing cells overall.
# DataFrame.info() prints its report itself and returns None, so it is called
# on its own rather than wrapped in print() (which would also print "None").
df_reidx.info()
print()
print("number of NaNs: ", df_reidx.isnull().sum().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5818 entries, 0 to 5817
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Province_State  5818 non-null   object 
 1   Confirmed       5818 non-null   int64  
 2   Deaths          5818 non-null   int64  
 3   Recovered       4391 non-null   float64
 4   Active          5801 non-null   float64
 5   People_Tested   5600 non-null   float64
 6   Testing_Rate    5600 non-null   float64
 7   Mortality_Rate  5700 non-null   float64
 8   Incident_Rate   5600 non-null   float64
 9   date            5799 non-null   object 
dtypes: float64(6), int64(2), object(2)
memory usage: 454.7+ KB
None 

number of NaNs:  2235


In [31]:
# Normalise the column labels: lower-case all of them, then shorten
# 'province_state' to 'state'
df_reidx.rename(columns=str.lower, inplace=True)
df_reidx.rename(columns={'province_state': 'state'}, inplace=True)


In [40]:
# Confirm that no 'Diamond Princess' rows remain in filter_df.
# NOTE(review): this relies on the filter_df built in the "drop cruise ships" cell
# further down; on a fresh top-to-bottom run filter_df is still the last per-day
# frame from the loading loop, which has 'Province_State' rather than 'state'.
# NOTE(review): the in-place reset_index is not idempotent — each re-run moves the
# current index into a new column.
filter_df.reset_index(inplace=True)
filter_df[filter_df.state == 'Diamond Princess']

Unnamed: 0,state,confirmed,deaths,recovered,active,people_tested,testing_rate,mortality_rate,incident_rate,date


In [45]:
# Remove the entries that are not states: the two cruise ships and the
# 'Recovered' pseudo-entry. Dropping by index label raises KeyError if any
# of the labels is absent.
filter_df = (
    df_reidx
    .set_index('state')
    .drop(index=['Grand Princess', 'Diamond Princess', 'Recovered'])
    .reset_index()
)
filter_df.head()

Unnamed: 0,state,confirmed,deaths,recovered,active,people_tested,testing_rate,mortality_rate,incident_rate,date
0,Alabama,3563,93,,3470.0,21583.0,460.300152,2.61016,75.98802,2020-04-12
1,Alaska,272,8,66.0,264.0,8038.0,1344.711576,2.941176,45.504049,2020-04-12
2,Arizona,3542,115,,3427.0,42109.0,578.522286,3.246753,48.662422,2020-04-12
3,Arkansas,1280,27,367.0,1253.0,19722.0,761.753354,2.109375,49.439423,2020-04-12
4,California,22795,640,,22155.0,190328.0,485.423868,2.81202,58.137726,2020-04-12
5,Colorado,7307,289,,7018.0,34873.0,615.389991,3.955112,128.943729,2020-04-12
6,Connecticut,12035,554,,11481.0,41220.0,1156.148159,4.603241,337.560483,2020-04-12
7,Delaware,1625,35,191.0,1590.0,11103.0,1140.214672,2.153846,166.878217,2020-04-12
8,District of Columbia,1875,50,493.0,1825.0,10640.0,1507.618148,2.666667,265.67519,2020-04-12
9,Florida,19895,461,,19434.0,182753.0,860.718651,2.317165,93.700227,2020-04-12


In [48]:
# Check whether filling missing 'recovered' values with 0 is viable by recomputing
# active cases as confirmed - (deaths + recovered) and comparing against 'active'.
filled_recv = filter_df['recovered'].fillna(0)
# Plain assignment creates the column on a first run and overwrites it on a re-run,
# so no prior drop is needed (dropping a column that does not exist yet raises KeyError).
filter_df['active_off'] = filter_df['confirmed'] - (filter_df['deaths'] + filled_recv)
# rows where the recomputed value disagrees with the reported one:
# filter_df[filter_df['active_off'] != filter_df['active']]
filter_df

Unnamed: 0,state,confirmed,deaths,recovered,active,people_tested,testing_rate,mortality_rate,incident_rate,date,active_off
0,Alabama,3563,93,,3470.0,21583.0,460.300152,2.610160,75.988020,2020-04-12,3470.0
1,Alaska,272,8,66.0,264.0,8038.0,1344.711576,2.941176,45.504049,2020-04-12,198.0
2,Arizona,3542,115,,3427.0,42109.0,578.522286,3.246753,48.662422,2020-04-12,3427.0
3,Arkansas,1280,27,367.0,1253.0,19722.0,761.753354,2.109375,49.439423,2020-04-12,886.0
4,California,22795,640,,22155.0,190328.0,485.423868,2.812020,58.137726,2020-04-12,22155.0
...,...,...,...,...,...,...,...,...,...,...,...
5595,Virginia,78375,2031,10107.0,66237.0,920461.0,10783.890236,2.591388,918.221845,2020-07-21,66237.0
5596,Washington,47743,1453,,46290.0,809339.0,10628.369959,3.043378,626.968757,2020-07-21,46290.0
5597,West Virginia,5084,100,3466.0,1518.0,234980.0,13111.647649,1.966955,283.682086,2020-07-21,1518.0
5598,Wisconsin,43018,846,33130.0,9042.0,783866.0,13462.857630,1.966619,738.831904,2020-07-21,9042.0


    Glancing over the difference between the given 'active' and the result of
    'confirmed'-('deaths'+'recovered') ('active_off'), there are some states 
    that don't match up, indicating that the residual active numbers are not
    reported in either 'deaths' or 'recovered'. 
    
    Although that may be something to consider, it could also be a minuscule factor in 
    predicting y.
    
    Therefore, 'recovered' nulls can be filled with 0, since many of them match up with 
    the confirmed and active differences.

In [49]:
# Re-check dtypes and non-null counts per column now that 'active_off' has been added
filter_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5600 entries, 0 to 5599
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   state           5600 non-null   object 
 1   confirmed       5600 non-null   int64  
 2   deaths          5600 non-null   int64  
 3   recovered       4327 non-null   float64
 4   active          5583 non-null   float64
 5   people_tested   5600 non-null   float64
 6   testing_rate    5600 non-null   float64
 7   mortality_rate  5500 non-null   float64
 8   incident_rate   5600 non-null   float64
 9   date            5590 non-null   object 
 10  active_off      5600 non-null   float64
dtypes: float64(7), int64(2), object(2)
memory usage: 481.4+ KB


In [57]:
# For each of the three features, list which Province_State entries have missing values.
# NOTE(review): this uses the original capitalised column names, while the rename
# cell earlier lower-cases df_reidx in place — confirm the intended execution order.
print("State names with all nulls in three features: ")
for feature in ('People_Tested', 'Testing_Rate', 'Incident_Rate'):
    is_missing = df_reidx[feature].isnull()
    print(df_reidx.loc[is_missing, 'Province_State'].unique())

State names with all nulls in three features: 
['Diamond Princess' 'Grand Princess' 'Recovered']
['Diamond Princess' 'Grand Princess' 'Recovered']
['Diamond Princess' 'Grand Princess' 'Recovered']


In [60]:
# All rows reported for the 'Diamond Princess' entry
df_reidx.loc[df_reidx['Province_State'].eq('Diamond Princess')]

Unnamed: 0_level_0,Province_State,Confirmed,Deaths,Active,People_Tested,Testing_Rate,Incident_Rate
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-04-12,Diamond Princess,49,0,49.0,,,
2020-04-13,Diamond Princess,49,0,49.0,,,
2020-04-14,Diamond Princess,49,0,49.0,,,
2020-04-15,Diamond Princess,49,0,49.0,,,
2020-04-16,Diamond Princess,49,0,49.0,,,
...,...,...,...,...,...,...,...
2020-07-17,Diamond Princess,49,0,49.0,,,
2020-07-18,Diamond Princess,49,0,49.0,,,
2020-07-19,Diamond Princess,49,0,49.0,,,
2020-07-20,Diamond Princess,49,0,49.0,,,


In [61]:
# All rows reported for the 'Grand Princess' entry.
# The mask is built from df_reidx itself; the previous mask referenced df_idx,
# a name that is never defined in this notebook (NameError on a fresh kernel).
df_reidx[df_reidx['Province_State'] == 'Grand Princess']

Unnamed: 0_level_0,Province_State,Confirmed,Deaths,Active,People_Tested,Testing_Rate,Incident_Rate
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-04-12,Grand Princess,103,0,103.0,,,
2020-04-13,Grand Princess,103,0,103.0,,,
2020-04-14,Grand Princess,103,0,103.0,,,
2020-04-15,Grand Princess,103,0,103.0,,,
2020-04-16,Grand Princess,103,0,103.0,,,
...,...,...,...,...,...,...,...
2020-07-17,Grand Princess,103,3,100.0,,,
2020-07-18,Grand Princess,103,3,100.0,,,
2020-07-19,Grand Princess,103,3,100.0,,,
2020-07-20,Grand Princess,103,3,100.0,,,


In [63]:
# Summary statistics of 'Confirmed' over the whole dataset
df_reidx['Confirmed'].describe()

count      5818.000000
mean      32692.493469
std       60430.019162
min           0.000000
25%        2311.500000
50%       10975.000000
75%       35629.500000
max      407326.000000
Name: Confirmed, dtype: float64

    Comparing 'Confirmed' to the entire dataset, the Diamond & Grand Princess cruise ship
    figures may not be representative of average numbers in the US, so it may be reasonable
    to exclude them, as well as 'Recovered', since that is not a state.

In [88]:
# Rows that are not real states: the two cruise ships and the 'Recovered' pseudo-entry
non_state_labels = ['Diamond Princess', 'Grand Princess', 'Recovered']
is_non_state = df_reidx['Province_State'].isin(non_state_labels)
df_bad = df_reidx[is_non_state]
# total number of missing cells across those rows
df_bad.isna().sum().sum()

663

In [8]:
# Keep every row that is not one of the two cruise ships
cruise_ships = ['Diamond Princess', 'Grand Princess']
clean_df = df_reidx[~df_reidx['Province_State'].isin(cruise_ships)]
clean_df

Unnamed: 0,Province_State,Confirmed,Deaths,Active,People_Tested,Testing_Rate,Incident_Rate,date
0,Alabama,3563,93,3470.0,21583.0,460.300152,75.988020,2020-04-12
1,Alaska,272,8,264.0,8038.0,1344.711576,45.504049,2020-04-12
2,Arizona,3542,115,3427.0,42109.0,578.522286,48.662422,2020-04-12
3,Arkansas,1280,27,1253.0,19722.0,761.753354,49.439423,2020-04-12
4,California,22795,640,22155.0,190328.0,485.423868,58.137726,2020-04-12
...,...,...,...,...,...,...,...,...
5813,Virginia,78375,2031,66237.0,920461.0,10783.890236,918.221845,2020-07-21
5814,Washington,47743,1453,46290.0,809339.0,10628.369959,626.968757,2020-07-21
5815,West Virginia,5084,100,1518.0,234980.0,13111.647649,283.682086,2020-07-21
5816,Wisconsin,43018,846,9042.0,783866.0,13462.857630,738.831904,2020-07-21


In [9]:
# Drop the 'Recovered' pseudo-entry as well. clean_df is modified by later cells,
# so take an explicit copy here; assigning into a filtered slice is what triggers
# pandas' SettingWithCopyWarning.
clean_df = clean_df[clean_df['Province_State'] != 'Recovered'].copy()
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5600 entries, 0 to 5817
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Province_State  5600 non-null   object 
 1   Confirmed       5600 non-null   int64  
 2   Deaths          5600 non-null   int64  
 3   Active          5583 non-null   float64
 4   People_Tested   5600 non-null   float64
 5   Testing_Rate    5600 non-null   float64
 6   Incident_Rate   5600 non-null   float64
 7   date            5590 non-null   object 
dtypes: float64(4), int64(2), object(2)
memory usage: 393.8+ KB


In [12]:
# Which entries have rows with a missing date, and how many rows each
clean_df.loc[clean_df['date'].isna(), 'Province_State'].value_counts()

American Samoa    9
Virgin Islands    1
Name: Province_State, dtype: int64

In [11]:
# Total number of rows with a missing date
clean_df['date'].isna().sum()

10

In [107]:
# All rows for American Samoa
clean_df.loc[clean_df['Province_State'].eq('American Samoa')]

Unnamed: 0,Province_State,Confirmed,Deaths,Active,People_Tested,Testing_Rate,Incident_Rate,date
55,American Samoa,0,0,0.0,3.0,5.391708,0.0,
61,American Samoa,0,0,,3.0,5.391708,0.0,
120,American Samoa,0,0,,3.0,5.391708,0.0,
179,American Samoa,0,0,,3.0,5.391708,0.0,
238,American Samoa,0,0,,3.0,5.391708,0.0,
...,...,...,...,...,...,...,...,...
5530,American Samoa,0,0,0.0,1037.0,1863.733578,0.0,2020-07-17
5588,American Samoa,0,0,0.0,1037.0,1863.733578,0.0,2020-07-18
5646,American Samoa,0,0,0.0,1037.0,1863.733578,0.0,2020-07-19
5704,American Samoa,0,0,0.0,1037.0,1863.733578,0.0,2020-07-20


In [108]:
# All rows for the Virgin Islands
clean_df.loc[clean_df['Province_State'].eq('Virgin Islands')]

Unnamed: 0,Province_State,Confirmed,Deaths,Active,People_Tested,Testing_Rate,Incident_Rate,date
58,Virgin Islands,51,1,50.0,350.0,326.285565,47.544468,
111,Virgin Islands,51,1,50.0,373.0,347.727188,47.544468,2020-04-13
170,Virgin Islands,51,1,50.0,377.0,351.456166,47.544468,2020-04-14
229,Virgin Islands,51,1,50.0,382.0,356.117388,47.544468,2020-04-15
288,Virgin Islands,51,1,50.0,408.0,380.355744,47.544468,2020-04-16
...,...,...,...,...,...,...,...,...
5580,Virgin Islands,249,6,123.0,5285.0,4926.912033,232.128873,2020-07-17
5638,Virgin Islands,263,6,131.0,6053.0,5642.875788,245.180296,2020-07-18
5696,Virgin Islands,283,6,144.0,6522.0,6080.098445,263.825186,2020-07-19
5754,Virgin Islands,283,6,142.0,6884.0,6417.570944,263.825186,2020-07-20


In [14]:
# Fill missing dates from the nearest preceding row that has one.
# NOTE(review): this relies on the rows still being in file order, so that the row
# above a gap belongs to the same daily report — confirm before reordering clean_df.
# Series.ffill() replaces the deprecated fillna(method='ffill'), and assign() builds
# a new frame instead of writing into a slice (avoids SettingWithCopyWarning).
clean_df = clean_df.assign(date=clean_df['date'].ffill())
print(clean_df['date'].isnull().sum())

0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [31]:
# Check that the dates were filled appropriately: print the number of
# distinct dates for each of the two affected territories
for territory in ('Virgin Islands', 'American Samoa'):
    territory_dates = clean_df.loc[clean_df['Province_State'] == territory, 'date']
    print(len(set(territory_dates)))

100
100


In [38]:
# Fill only the missing 'Active' values for American Samoa with 0.
# Selecting the column explicitly matters: assigning 0 to the boolean-masked frame
# overwrites every column of the matching rows (state name and date included).
missing_active = (clean_df['Province_State'] == 'American Samoa') & clean_df['Active'].isnull()
clean_df.loc[missing_active, 'Active'] = 0
print(clean_df['Active'].isnull().sum())

0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [41]:
# Display the cleaned frame after the date and 'Active' fixes above
clean_df

Unnamed: 0,Province_State,Confirmed,Deaths,Active,People_Tested,Testing_Rate,Incident_Rate,date
0,Alabama,3563,93,3470.0,21583.0,460.300152,75.988020,2020-04-12
1,Alaska,272,8,264.0,8038.0,1344.711576,45.504049,2020-04-12
2,Arizona,3542,115,3427.0,42109.0,578.522286,48.662422,2020-04-12
3,Arkansas,1280,27,1253.0,19722.0,761.753354,49.439423,2020-04-12
4,California,22795,640,22155.0,190328.0,485.423868,58.137726,2020-04-12
...,...,...,...,...,...,...,...,...
5813,Virginia,78375,2031,66237.0,920461.0,10783.890236,918.221845,2020-07-21
5814,Washington,47743,1453,46290.0,809339.0,10628.369959,626.968757,2020-07-21
5815,West Virginia,5084,100,1518.0,234980.0,13111.647649,283.682086,2020-07-21
5816,Wisconsin,43018,846,9042.0,783866.0,13462.857630,738.831904,2020-07-21


# Assess data to determine which model to use

    - What may have to be done is to get the mean of each X feature for each day (which means outliers need to
      be assessed first?)
    
    - We have to categorize the dataset into two groups based on a safe range (considering mean/IQR/z-scores):
      0 for normal, 1 for risky. It is not yet clear how to determine those categories. Perhaps assess the range
      of y values as well as, using linear regression, how the feature values interact in predicting the
      corresponding y for that date?
      
    - After categorizing into two groups, proceed with logistic regression for a more accurate prediction coefficient?
    
    - Then ML optimization.

In [None]:
# try z-scores on each day df for each 