In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


# EDA
1. Read the daily datasets into a dictionary, then concatenate them into a single DataFrame so the data can be handled as a whole.
<br><br>
2. extract only the necessary columns to be used in prediction
      'Province_State', 
      'Last_Update', 
      'Confirmed', 
      'Deaths', 
      'Recovered', 
      'Active', 
      'Incident_Rate',
      'Mortality_Rate',
      'People_Tested',
      'Testing_Rate',
      'Hospitalization_Rate'
      
      To reduce space and time, I will use columns which I feel are MOST important: 
      'Province_State', 'Last_Update', 'Confirmed', 'Deaths', 'Recovered', 'Active',
      'People_Tested', 'Testing_Rate', 'Mortality_Rate', 'Incident_Rate'


In [2]:
import os

# gather the names of the daily report files, in sorted order, for later reading
data_dir_entries = sorted(os.listdir("csse_covid19_daily_us/."))
csv_list = [entry for entry in data_dir_entries if entry.endswith(".csv")]

# sanity check: how many files were found, and what the first few look like
print(len(csv_list))
csv_list[:5]

100


['04-12-2020.csv',
 '04-13-2020.csv',
 '04-14-2020.csv',
 '04-15-2020.csv',
 '04-16-2020.csv']

In [4]:
# Read every daily report, keep only the columns used for modelling, and store
# each day's frame in a dict keyed by its position in csv_list (0 = first file).
feature_list = ['Province_State', 'Last_Update', 'Confirmed', 'Deaths',
                'Recovered', 'Active', 'People_Tested', 'Testing_Rate',
                'Mortality_Rate', 'Incident_Rate']

daily_states_dict = {}

for i, csv_name in enumerate(csv_list):
    csv_str = 'csse_covid19_daily_us/' + csv_name
    # usecols avoids parsing columns that are discarded anyway; the
    # [feature_list] selection afterwards puts the columns in feature_list order
    today_df = pd.read_csv(csv_str, sep=',', usecols=feature_list)
    daily_states_dict[i] = today_df[feature_list]

# Compare against the number of files actually found instead of a hard-coded
# 100, and treat "no files at all" as a failure.
expected_days = len(csv_list)
if daily_states_dict and len(daily_states_dict) == expected_days:
    print("Appended all {} days and corresponding dataframes.".format(expected_days))
else:
    print("Loop didn't work correctly.")

Appended all 100 days and corresponding dataframes.


# Data pre-processing

In [5]:
# stack the per-day frames into one frame (outer index level = position of the file)
df = pd.concat(daily_states_dict, axis=0)

# derive a day-level 'date' string (YYYY-MM-DD) from the 'Last_Update' timestamp,
# then discard the raw timestamp column
update_day = pd.to_datetime(df['Last_Update']).dt.strftime('%Y-%m-%d')
df = df.assign(date=update_day).drop(columns='Last_Update')
df

Unnamed: 0,Unnamed: 1,Province_State,Confirmed,Deaths,Recovered,Active,People_Tested,Testing_Rate,Mortality_Rate,Incident_Rate,date
0,0,Alabama,3563,93,,3470.0,21583.0,460.300152,2.610160,75.988020,2020-04-12
0,1,Alaska,272,8,66.0,264.0,8038.0,1344.711576,2.941176,45.504049,2020-04-12
0,2,Arizona,3542,115,,3427.0,42109.0,578.522286,3.246753,48.662422,2020-04-12
0,3,Arkansas,1280,27,367.0,1253.0,19722.0,761.753354,2.109375,49.439423,2020-04-12
0,4,California,22795,640,,22155.0,190328.0,485.423868,2.812020,58.137726,2020-04-12
...,...,...,...,...,...,...,...,...,...,...,...
99,53,Virginia,78375,2031,10107.0,66237.0,920461.0,10783.890236,2.591388,918.221845,2020-07-21
99,54,Washington,47743,1453,,46290.0,809339.0,10628.369959,3.043378,626.968757,2020-07-21
99,55,West Virginia,5084,100,3466.0,1518.0,234980.0,13111.647649,1.966955,283.682086,2020-07-21
99,56,Wisconsin,43018,846,33130.0,9042.0,783866.0,13462.857630,1.966619,738.831904,2020-07-21


In [6]:
# count how often each date occurs per state; every (state, date) pair should occur once
valct = df.groupby('Province_State')['date'].value_counts()

# keep only the pairs that occur more than once
valct.loc[valct.gt(1)]

Province_State  date      
Florida         2020-04-14    2
Name: date, dtype: int64

    Florida has a duplicated date on the 14th of April. This will affect our analysis.

In [7]:
# inspect Florida's first rows in the combined frame
is_florida = df['Province_State'].eq('Florida')
df.loc[is_florida].head()

Unnamed: 0,Unnamed: 1,Province_State,Confirmed,Deaths,Recovered,Active,People_Tested,Testing_Rate,Mortality_Rate,Incident_Rate,date
0,10,Florida,19895,461,,19434.0,182753.0,860.718651,2.317165,93.700227,2020-04-12
1,11,Florida,21019,499,,122520.0,196207.0,924.083459,0.405628,579.387193,2020-04-14
2,11,Florida,21628,571,,21057.0,203180.0,956.924459,2.640096,101.862202,2020-04-14
3,11,Florida,22511,596,,21915.0,213509.0,1005.571337,2.647595,106.0209,2020-04-15
4,11,Florida,23343,668,,22675.0,219248.0,1032.60052,2.861672,109.939402,2020-04-16


* It can be assumed that the update for April 13th was posted slightly past midnight (based on the exact 'ns' timestamp in the original data), which led to it being counted as the next day. However, the column values for the 13th, except for 'Confirmed' and one or two others, seem significantly different from those of the previous day and the following days. 
    
* I decided to move the midnight data to the 13th and to replace the significantly different values with Florida's average for that week. 

In [60]:

# Repair Florida's duplicated 2020-04-14 entry: the earlier of the two rows is
# re-dated to 04-13 and its outlying columns are replaced by a first-week average.
florida_cols = ['Active', 'Mortality_Rate', 'Incident_Rate']

# 1. locate the row by its content instead of a hard-coded position
df_ = df.reset_index(drop=True)
fl_dupes = df_.index[(df_.Province_State == 'Florida') & (df_.date == '2020-04-14')]
assert len(fl_dupes) == 2, "expected exactly two Florida rows dated 2020-04-14"
fix_idx = fl_dupes[0]  # rows are in file order, so the first one comes from the earlier file

# 2. change date value to 13th
df_.loc[fix_idx, 'date'] = '2020-04-13'

# 3. get first week (12-19th, excl. 13th) avg for active, mortality and incident rates
wk1_fl = df_[(df_.Province_State == 'Florida') & (df_.index != fix_idx)].reset_index(drop=True)
wk1_fl_sub = wk1_fl.loc[:6, florida_cols].mean()

# 4. fill in the incorrect column vals, looked up by column name (not by position)
for col in florida_cols:
    df_.loc[fix_idx, col] = wk1_fl_sub[col]

# check result
df_[df_.Province_State == 'Florida'].head()

Unnamed: 0,Province_State,Confirmed,Deaths,Recovered,Active,People_Tested,Testing_Rate,Mortality_Rate,Incident_Rate,date
10,Florida,19895,461,,19434.0,182753.0,860.718651,2.317165,93.700227,2020-04-12
70,Florida,21019,499,,22771.285714,196207.0,924.083459,2.752916,110.303397,2020-04-13
129,Florida,21628,571,,21057.0,203180.0,956.924459,2.640096,101.862202,2020-04-14
188,Florida,22511,596,,21915.0,213509.0,1005.571337,2.647595,106.0209,2020-04-15
247,Florida,23343,668,,22675.0,219248.0,1032.60052,2.861672,109.939402,2020-04-16


##### Manage missing values
    - for dates, fillna(method='ffill') - depending on row placement of the state within the df

    - other feature values: confirmed, deaths, active fill with 0
    
    - if there are nulls in the remaining feature values, review them and fill with 0

In [61]:
# overview of dtypes / non-null counts, followed by the total number of missing cells
# (info() prints its report itself and returns None, so it is not wrapped in print)
df_.info()
print()
print("number of NaNs: ", df_.isnull().sum().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5818 entries, 0 to 5817
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Province_State  5818 non-null   object 
 1   Confirmed       5818 non-null   int64  
 2   Deaths          5818 non-null   int64  
 3   Recovered       4391 non-null   float64
 4   Active          5801 non-null   float64
 5   People_Tested   5600 non-null   float64
 6   Testing_Rate    5600 non-null   float64
 7   Mortality_Rate  5700 non-null   float64
 8   Incident_Rate   5600 non-null   float64
 9   date            5799 non-null   object 
dtypes: float64(6), int64(2), object(2)
memory usage: 454.7+ KB
None 

number of NaNs:  2235


In [62]:
# normalise column names: all lower-case, with 'province_state' shortened to 'state'
df_.columns = [col.lower() for col in df_.columns]
df_ = df_.rename(columns={'province_state': 'state'})


In [63]:
# list every distinct value found in the 'state' column
print(df_['state'].unique())

['Alabama' 'Alaska' 'Arizona' 'Arkansas' 'California' 'Colorado'
 'Connecticut' 'Delaware' 'Diamond Princess' 'District of Columbia'
 'Florida' 'Georgia' 'Grand Princess' 'Guam' 'Hawaii' 'Idaho' 'Illinois'
 'Indiana' 'Iowa' 'Kansas' 'Kentucky' 'Louisiana' 'Maine' 'Maryland'
 'Massachusetts' 'Michigan' 'Minnesota' 'Mississippi' 'Missouri' 'Montana'
 'Nebraska' 'Nevada' 'New Hampshire' 'New Jersey' 'New Mexico' 'New York'
 'North Carolina' 'North Dakota' 'Ohio' 'Oklahoma' 'Oregon' 'Pennsylvania'
 'Puerto Rico' 'Rhode Island' 'South Carolina' 'South Dakota' 'Tennessee'
 'Texas' 'Utah' 'Vermont' 'Virginia' 'Washington' 'West Virginia'
 'Wisconsin' 'Wyoming' 'American Samoa' 'Northern Mariana Islands'
 'Recovered' 'Virgin Islands']


In [64]:
# Mean of every numeric column for each cruise ship, to compare with the whole data set.
# numeric_only=True skips the text columns explicitly instead of relying on
# agg(np.mean) to drop them.
for ship, trailer in (('Grand Princess', '\n\n'), ('Diamond Princess', '\n')):
    ship_means = df_.loc[df_['state'] == ship].mean(numeric_only=True)
    print("{}: \n\n{}".format(ship.upper(), ship_means), trailer)


GRAND PRINCESS: 

confirmed         103.000000
deaths              2.670000
recovered           0.000000
active            100.330000
people_tested            NaN
testing_rate             NaN
mortality_rate      2.592233
incident_rate            NaN
dtype: float64 


DIAMOND PRINCESS: 

confirmed         49.0
deaths             0.0
recovered          0.0
active            49.0
people_tested      NaN
testing_rate       NaN
mortality_rate     0.0
incident_rate      NaN
dtype: float64 



In [65]:
# summary statistics of the numeric columns over all rows
df_.describe()

Unnamed: 0,confirmed,deaths,recovered,active,people_tested,testing_rate,mortality_rate,incident_rate
count,5818.0,5818.0,4391.0,5801.0,5600.0,5600.0,5700.0,5600.0
mean,32692.493469,1668.792712,11057.379868,22955.531854,348947.1,5864.633381,4.047353,481.6149
std,60430.019162,4109.291674,18960.665192,49303.290427,620874.5,4634.567964,2.858028,460.975763
min,0.0,0.0,0.0,-120720.0,3.0,5.391708,0.0,0.0
25%,2311.5,67.0,724.5,842.0,48896.0,2196.716493,2.390435,139.496918
50%,10975.0,368.5,3157.0,6258.0,143347.0,4735.449294,3.780917,317.451194
75%,35629.5,1422.75,11837.0,20340.0,389752.0,8440.463156,5.151574,673.463513
max,407326.0,32506.0,177871.0,390100.0,6414321.0,26549.442542,70.37037,2093.83773


    Comparing feature means to the entire dataset, the Diamond & Grand Princess cruise ship 
    circumstances may not be relevant to average numbers in the US, so it seems 
    reasonable to exclude them, as well as 'Recovered', since that is not a state.

In [66]:
# remove the rows that are not states/territories: the two cruise ships and the
# 'Recovered' entry
non_state_labels = ['Grand Princess', 'Diamond Princess', 'Recovered']
filterdf = df_.set_index('state').drop(index=non_state_labels).reset_index()
filterdf.head()

Unnamed: 0,state,confirmed,deaths,recovered,active,people_tested,testing_rate,mortality_rate,incident_rate,date
0,Alabama,3563,93,,3470.0,21583.0,460.300152,2.61016,75.98802,2020-04-12
1,Alaska,272,8,66.0,264.0,8038.0,1344.711576,2.941176,45.504049,2020-04-12
2,Arizona,3542,115,,3427.0,42109.0,578.522286,3.246753,48.662422,2020-04-12
3,Arkansas,1280,27,367.0,1253.0,19722.0,761.753354,2.109375,49.439423,2020-04-12
4,California,22795,640,,22155.0,190328.0,485.423868,2.81202,58.137726,2020-04-12


    Glancing over the difference between the given 'active' values and the 
    'confirmed'-('deaths'+'recovered') results ('active_off'), there are some states 
    that don't match up, indicating that the residual active numbers are reported
    in neither 'deaths' nor 'recovered'. 
    
    Although that may be something to consider, it could also be a minuscule aspect in 
    predicting y.
    
    Therefore, 'recovered' nulls can be filled with 0, since many of them match up with 
    the confirmed and active differences.

In [67]:
# build a separate frame in which missing 'recovered' counts are replaced by 0
clean_df = filterdf.assign(recovered=filterdf['recovered'].fillna(0))
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5600 entries, 0 to 5599
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   state           5600 non-null   object 
 1   confirmed       5600 non-null   int64  
 2   deaths          5600 non-null   int64  
 3   recovered       5600 non-null   float64
 4   active          5583 non-null   float64
 5   people_tested   5600 non-null   float64
 6   testing_rate    5600 non-null   float64
 7   mortality_rate  5500 non-null   float64
 8   incident_rate   5600 non-null   float64
 9   date            5590 non-null   object 
dtypes: float64(6), int64(2), object(2)
memory usage: 437.6+ KB


In [68]:
# which states have rows with a missing 'active' value?
active_missing = clean_df['active'].isna()
clean_df.loc[active_missing, 'state'].unique()

array(['American Samoa'], dtype=object)

    Only American Samoa is missing values in active column.

In [69]:
# peek at the first rows reported for American Samoa
clean_df.loc[clean_df['state'].eq('American Samoa')].head()

Unnamed: 0,state,confirmed,deaths,recovered,active,people_tested,testing_rate,mortality_rate,incident_rate,date
53,American Samoa,0,0,0.0,0.0,3.0,5.391708,,0.0,
58,American Samoa,0,0,0.0,,3.0,5.391708,,0.0,
114,American Samoa,0,0,0.0,,3.0,5.391708,,0.0,
170,American Samoa,0,0,0.0,,3.0,5.391708,,0.0,
226,American Samoa,0,0,0.0,,3.0,5.391708,,0.0,


    The state has barely any reports (which is a good thing).
    Fill all of its nulls across all missing cols with 0 except for date.

In [70]:
# Fill the remaining missing 'active' and 'mortality_rate' values with 0.
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# column selection is chained assignment and is not guaranteed to update clean_df.
zero_fill_cols = ['active', 'mortality_rate']
clean_df[zero_fill_cols] = clean_df[zero_fill_cols].fillna(0)
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5600 entries, 0 to 5599
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   state           5600 non-null   object 
 1   confirmed       5600 non-null   int64  
 2   deaths          5600 non-null   int64  
 3   recovered       5600 non-null   float64
 4   active          5600 non-null   float64
 5   people_tested   5600 non-null   float64
 6   testing_rate    5600 non-null   float64
 7   mortality_rate  5600 non-null   float64
 8   incident_rate   5600 non-null   float64
 9   date            5590 non-null   object 
dtypes: float64(6), int64(2), object(2)
memory usage: 437.6+ KB


In [71]:
# which states have rows without a date, and how many such rows exist?
date_missing = clean_df['date'].isna()
print(clean_df.loc[date_missing, 'state'].unique())

print("missing date count: ", date_missing.sum())


['American Samoa' 'Virgin Islands']
missing date count:  10


In [74]:
# forward-fill each missing date from the preceding row, then check that none remain
# (Series.ffill() replaces the deprecated fillna(method='ffill') spelling)
clean_df['date'] = clean_df['date'].ffill()

print("missing date count: {}\n".format(clean_df['date'].isnull().sum()))
clean_df.info()

missing date count: 0

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5600 entries, 0 to 5599
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   state           5600 non-null   object 
 1   confirmed       5600 non-null   int64  
 2   deaths          5600 non-null   int64  
 3   recovered       5600 non-null   float64
 4   active          5600 non-null   float64
 5   people_tested   5600 non-null   float64
 6   testing_rate    5600 non-null   float64
 7   mortality_rate  5600 non-null   float64
 8   incident_rate   5600 non-null   float64
 9   date            5600 non-null   object 
dtypes: float64(6), int64(2), object(2)
memory usage: 437.6+ KB


In [75]:
# 'date' holds strings (it was created with strftime), so parse it back into datetime64
clean_df = clean_df.assign(date=pd.to_datetime(clean_df['date']))
clean_df.head()

Unnamed: 0,state,confirmed,deaths,recovered,active,people_tested,testing_rate,mortality_rate,incident_rate,date
0,Alabama,3563,93,0.0,3470.0,21583.0,460.300152,2.61016,75.98802,2020-04-12
1,Alaska,272,8,66.0,264.0,8038.0,1344.711576,2.941176,45.504049,2020-04-12
2,Arizona,3542,115,0.0,3427.0,42109.0,578.522286,3.246753,48.662422,2020-04-12
3,Arkansas,1280,27,367.0,1253.0,19722.0,761.753354,2.109375,49.439423,2020-04-12
4,California,22795,640,0.0,22155.0,190328.0,485.423868,2.81202,58.137726,2020-04-12


In [76]:
# re-check dtypes and non-null counts after the date conversion
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5600 entries, 0 to 5599
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   state           5600 non-null   object        
 1   confirmed       5600 non-null   int64         
 2   deaths          5600 non-null   int64         
 3   recovered       5600 non-null   float64       
 4   active          5600 non-null   float64       
 5   people_tested   5600 non-null   float64       
 6   testing_rate    5600 non-null   float64       
 7   mortality_rate  5600 non-null   float64       
 8   incident_rate   5600 non-null   float64       
 9   date            5600 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(6), int64(2), object(1)
memory usage: 437.6+ KB


# Transform feature and y values (daily value diff):

1. Get daily diff for feature and target values<br><br>
    
    - aggregate each day's columns across all states. 
    - get a "safe" range and assess outliers to determine safe/risky categories
    
    * IGNORE HERE: first thought was to consider state by state, since each state has differences in volume. However, it seems to be a bit more complicated, so I am thinking to proceed with comprehensive measure across dates
    
        - get diff first for each state by agg each features across entire timeline
        try out getting daily diff for each state's mortality/incident proportion in order to compare across states
        - compare the aggregated mean of diff of each state across entire timeline state by state
        - columns as dates
        - pandas df.diff(axis=1)
            * params: 
                  periods=n (n as integer; getting diff with n prior row/col)
    

2. Assess outliers (using iqr) and group into risky/normal


* ultimately, I want to end up with a ts df where 
    columns as: dates
    groupby states with rows of daily diff of each features 
    
    - 

In [16]:
# quick look at the first rows of the cleaned frame
clean_df.head()

Unnamed: 0,state,confirmed,deaths,recovered,active,people_tested,testing_rate,mortality_rate,incident_rate,date
0,Alabama,3563,93,0.0,3470.0,21583.0,460.300152,2.61016,75.98802,2020-04-12
1,Alaska,272,8,66.0,264.0,8038.0,1344.711576,2.941176,45.504049,2020-04-12
2,Arizona,3542,115,0.0,3427.0,42109.0,578.522286,3.246753,48.662422,2020-04-12
3,Arkansas,1280,27,367.0,1253.0,19722.0,761.753354,2.109375,49.439423,2020-04-12
4,California,22795,640,0.0,22155.0,190328.0,485.423868,2.81202,58.137726,2020-04-12


In [77]:
# For every state, compute the row-to-row (day-over-day) difference of each
# feature; the first row has no predecessor, so its all-NaN row is dropped.
diff_features = ['confirmed', 'deaths', 'recovered', 'active',
                 'incident_rate', 'people_tested', 'testing_rate']

dailydff = {
    state_name: (
        state_rows.set_index('date')[diff_features]
        .diff()
        .add_suffix('_diff')
        .dropna(how='all', axis=0)
        .reset_index()
    )
    for state_name, state_rows in clean_df.groupby('state')
}
dailydff

{'Alabama':          date  confirmed_diff  deaths_diff  recovered_diff  active_diff  \
 0  2020-04-13           171.0          6.0             0.0        165.0   
 1  2020-04-14           219.0         15.0             0.0        204.0   
 2  2020-04-15           122.0          4.0             0.0        118.0   
 3  2020-04-16           270.0         15.0             0.0        255.0   
 4  2020-04-17           226.0         15.0             0.0        211.0   
 ..        ...             ...          ...             ...          ...   
 94 2020-07-17          2021.0         19.0             0.0       2002.0   
 95 2020-07-18          2003.0         35.0             0.0       1968.0   
 96 2020-07-19          2143.0         21.0             0.0       2122.0   
 97 2020-07-20          1777.0          1.0             0.0       1776.0   
 98 2020-07-21          1880.0          4.0             0.0       1876.0   
 
     incident_rate_diff  people_tested_diff  testing_rate_diff  
 0        

In [78]:
# Concatenate the per-state frames; the dict keys form an index level named
# 'state', which is then moved into a regular column.
bystate_daily = pd.concat(dailydff, axis=0, names=['state']).reset_index(level='state')
# info() prints its own report and returns None, so it is not wrapped in print
bystate_daily.info()
bystate_daily

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5544 entries, 0 to 98
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   state               5544 non-null   object        
 1   date                5544 non-null   datetime64[ns]
 2   confirmed_diff      5544 non-null   float64       
 3   deaths_diff         5544 non-null   float64       
 4   recovered_diff      5544 non-null   float64       
 5   active_diff         5544 non-null   float64       
 6   incident_rate_diff  5544 non-null   float64       
 7   people_tested_diff  5544 non-null   float64       
 8   testing_rate_diff   5544 non-null   float64       
dtypes: datetime64[ns](1), float64(7), object(1)
memory usage: 433.1+ KB
None


Unnamed: 0,state,date,confirmed_diff,deaths_diff,recovered_diff,active_diff,incident_rate_diff,people_tested_diff,testing_rate_diff
0,Alabama,2020-04-13,171.0,6.0,0.0,165.0,3.646913,7599.0,162.063701
1,Alabama,2020-04-14,219.0,15.0,0.0,204.0,4.670608,3935.0,83.921656
2,Alabama,2020-04-15,122.0,4.0,0.0,118.0,2.601891,960.0,20.473898
3,Alabama,2020-04-16,270.0,15.0,0.0,255.0,5.758284,2314.0,49.350626
4,Alabama,2020-04-17,226.0,15.0,0.0,211.0,4.819897,1457.0,31.073406
...,...,...,...,...,...,...,...,...,...
94,Wyoming,2020-07-17,41.0,2.0,34.0,5.0,7.084123,538.0,92.957518
95,Wyoming,2020-07-18,43.0,0.0,50.0,-7.0,7.429690,544.0,93.994219
96,Wyoming,2020-07-19,39.0,0.0,21.0,18.0,6.738556,39.0,6.738556
97,Wyoming,2020-07-20,18.0,1.0,4.0,13.0,3.110103,18.0,3.110103


In [79]:
# group the daily differences by date and summarise the distribution of
# 'confirmed_diff' across states for each day
bydate_diff_avg = bystate_daily.groupby('date')
bydate_diff_avg['confirmed_diff'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-04-13,56.0,451.892857,1004.239806,0.0,20.50,100.5,423.50,6716.0
2020-04-14,56.0,483.053571,1134.820089,-63.0,35.25,135.5,451.75,7271.0
2020-04-15,56.0,512.142857,1561.988772,0.0,21.25,118.0,415.00,11434.0
2020-04-16,56.0,561.625000,1366.212529,0.0,26.00,149.5,581.25,9237.0
2020-04-17,56.0,569.732143,1078.936286,-112.0,43.00,173.5,621.00,6906.0
...,...,...,...,...,...,...,...,...
2020-07-17,56.0,1398.392857,3020.512874,0.0,92.75,615.5,1236.75,16017.0
2020-07-18,56.0,1277.821429,2432.556083,0.0,116.75,618.5,1011.00,11914.0
2020-07-19,56.0,1137.464286,2104.949241,0.0,77.50,450.5,969.00,10328.0
2020-07-20,56.0,1104.410714,2119.456155,-31.0,65.75,516.0,968.00,12478.0


# Assess outliers; Categorization

##### Steps:

1. Get low, high IQR bounds for each date for appropriate variables <br><br>

2. Filter out and create two new df: out-of-bounds states, normal range states <br><br>

3. using the filtered "normal" range dataset, aggregate mean for the variables (another df)<br><br>

4. with aggregated data, compute multi-linreg to select the model<br><br>

5. ts train-test split<br><br>

# updated 07-26-2020 12:53am
- proceed with steps above