In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


# EDA
1. Load each daily dataset into a dictionary, then concatenate them into a single DataFrame so the data can be handled as a whole.
<br><br>
2. extract only the necessary columns to be used in prediction
      'Province_State', 
      'Last_Update', 
      'Confirmed', 
      'Deaths', 
      'Recovered', 
      'Active', 
      'Incident_Rate',
      'Mortality_Rate',
      'People_Tested',
      'Testing_Rate',
      'Hospitalization_Rate'
      
      To reduce space and time, I will use columns which I feel are MOST important: 
      'Province_State', 'Last_Update', 'Confirmed', 'Deaths', 'Recovered', 'Active',
      'People_Tested', 'Testing_Rate', 'Incident_Rate'


In [2]:
import os

# Collect the names of the daily report CSVs, sorted by filename
csv_list = [
    entry
    for entry in sorted(os.listdir("csse_covid19_daily_us/."))
    if entry.endswith(".csv")
]

# Sanity check: how many files were found, and what the first few look like
print(len(csv_list))
csv_list[:5]

100


['04-12-2020.csv',
 '04-13-2020.csv',
 '04-14-2020.csv',
 '04-15-2020.csv',
 '04-16-2020.csv']

In [70]:
# Read every daily report named in csv_list, keep only the columns in
# feature_list, and store each day's frame in a dict keyed by the file's
# position in csv_list (csv_list is sorted by filename).
feature_list = ['Last_Update', 'Province_State', 'Confirmed', 'Deaths', 
                'Active', 'People_Tested', 'Testing_Rate', 'Incident_Rate']

daily_states_dict = {}

for i, csv_name in enumerate(csv_list):
    csv_path = os.path.join('csse_covid19_daily_us', csv_name)
    today_df = pd.read_csv(csv_path, sep=',')
    daily_states_dict[i] = today_df[feature_list]

# Compare against the number of files actually found rather than a hard-coded
# count, so the check stays meaningful when more daily reports are added.
# An empty directory is still reported as a failure.
n_loaded = len(daily_states_dict)
if n_loaded > 0 and n_loaded == len(csv_list):
    print(f"Appended all {n_loaded} days and corresponding dataframes.")
else:
    print("Loop didn't work correctly.")

Appended all 100 days and corresponding dataframes.


In [71]:
# Stack the per-day frames row-wise; the dict keys become the outer index level
df = pd.concat(daily_states_dict)
df

Unnamed: 0,Unnamed: 1,Last_Update,Province_State,Confirmed,Deaths,Active,People_Tested,Testing_Rate,Incident_Rate
0,0,2020-04-12 23:18:15,Alabama,3563,93,3470.0,21583.0,460.300152,75.988020
0,1,2020-04-12 23:18:15,Alaska,272,8,264.0,8038.0,1344.711576,45.504049
0,2,2020-04-12 23:18:15,Arizona,3542,115,3427.0,42109.0,578.522286,48.662422
0,3,2020-04-12 23:18:15,Arkansas,1280,27,1253.0,19722.0,761.753354,49.439423
0,4,2020-04-12 23:18:15,California,22795,640,22155.0,190328.0,485.423868,58.137726
...,...,...,...,...,...,...,...,...,...
99,53,2020-07-21 04:38:59,Virginia,78375,2031,66237.0,920461.0,10783.890236,918.221845
99,54,2020-07-21 04:38:59,Washington,47743,1453,46290.0,809339.0,10628.369959,626.968757
99,55,2020-07-21 04:38:59,West Virginia,5084,100,1518.0,234980.0,13111.647649,283.682086
99,56,2020-07-21 04:38:59,Wisconsin,43018,846,9042.0,783866.0,13462.857630,738.831904


In [74]:
# Remove the multi-index produced by the concat and replace it with a plain
# 0..n-1 row index; grouping by date is left for a later step.
df_reidx = df.reset_index(drop=True)
df_reidx

Unnamed: 0,Last_Update,Province_State,Confirmed,Deaths,Active,People_Tested,Testing_Rate,Incident_Rate
0,2020-04-12 23:18:15,Alabama,3563,93,3470.0,21583.0,460.300152,75.988020
1,2020-04-12 23:18:15,Alaska,272,8,264.0,8038.0,1344.711576,45.504049
2,2020-04-12 23:18:15,Arizona,3542,115,3427.0,42109.0,578.522286,48.662422
3,2020-04-12 23:18:15,Arkansas,1280,27,1253.0,19722.0,761.753354,49.439423
4,2020-04-12 23:18:15,California,22795,640,22155.0,190328.0,485.423868,58.137726
...,...,...,...,...,...,...,...,...
5813,2020-07-21 04:38:59,Virginia,78375,2031,66237.0,920461.0,10783.890236,918.221845
5814,2020-07-21 04:38:59,Washington,47743,1453,46290.0,809339.0,10628.369959,626.968757
5815,2020-07-21 04:38:59,West Virginia,5084,100,1518.0,234980.0,13111.647649,283.682086
5816,2020-07-21 04:38:59,Wisconsin,43018,846,9042.0,783866.0,13462.857630,738.831904


### Clean up the concatenated dataframe


In [75]:
# Replace the Last_Update timestamp with a 'date' column formatted YYYY-MM-DD.
#
# The frame is rebuilt from `df` in a single chain instead of mutating
# df_reidx in place: the in-place version dropped 'Last_Update' on the first
# run and then raised KeyError on any re-run of this cell. This form gives the
# same columns in the same order and can be re-executed safely.
#
# NOTE: rows with a missing Last_Update get a null date (the notebook's later
# clean_df.info() output shows some null dates).
df_reidx = (
    df.reset_index(drop=True)
      .assign(date=lambda frame: pd.to_datetime(frame['Last_Update']).dt.strftime('%Y-%m-%d'))
      .drop(columns='Last_Update')
)
df_reidx
# df_reidx.set_index('date') would only make date the index; it does not group rows by date

Unnamed: 0,Province_State,Confirmed,Deaths,Active,People_Tested,Testing_Rate,Incident_Rate,date
0,Alabama,3563,93,3470.0,21583.0,460.300152,75.988020,2020-04-12
1,Alaska,272,8,264.0,8038.0,1344.711576,45.504049,2020-04-12
2,Arizona,3542,115,3427.0,42109.0,578.522286,48.662422,2020-04-12
3,Arkansas,1280,27,1253.0,19722.0,761.753354,49.439423,2020-04-12
4,California,22795,640,22155.0,190328.0,485.423868,58.137726,2020-04-12
...,...,...,...,...,...,...,...,...
5813,Virginia,78375,2031,66237.0,920461.0,10783.890236,918.221845,2020-07-21
5814,Washington,47743,1453,46290.0,809339.0,10628.369959,626.968757,2020-07-21
5815,West Virginia,5084,100,1518.0,234980.0,13111.647649,283.682086,2020-07-21
5816,Wisconsin,43018,846,9042.0,783866.0,13462.857630,738.831904,2020-07-21


self-notes: detect outliers and cleaning up: plot methods <br>
https://towardsdatascience.com/ways-to-detect-and-remove-the-outliers-404d16608dba
    
    sns.boxplot(x=df['x'])
    
    fig, ax = plt.subplots(figsize=(16,8))
    ax.scatter(df['var1'], df['var2'])
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    plt.show()
    
    z = np.abs(stats.zscore(df))
    zthreshold = 3
    print(np.where(z > 3))
    print(z[int_arr1][int_arr2]) returns z-score
        to remove: df_z_out = df[(z < 3).all(axis=1)]
    
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1
    print(IQR) to see the iqr for each column
    print((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))) returns
    True/False for every cell
        To remove: df_qt_out = df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]

##### Manage missing values

In [53]:
# Check non-null counts per column to decide where nulls should be dropped.
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called on its own; wrapping it in print() emitted a stray "None".
df_reidx.info()
print()
print("number of NaNs: ", df_reidx.isnull().sum().sum())


<class 'pandas.core.frame.DataFrame'>
Index: 5818 entries, 2020-04-12 to 2020-07-21
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Province_State  5818 non-null   object 
 1   Confirmed       5818 non-null   int64  
 2   Deaths          5818 non-null   int64  
 3   Active          5801 non-null   float64
 4   People_Tested   5600 non-null   float64
 5   Testing_Rate    5600 non-null   float64
 6   Incident_Rate   5600 non-null   float64
dtypes: float64(4), int64(2), object(1)
memory usage: 363.6+ KB
None 

number of NaNs:  671


    Some states are missing People_Tested, Testing_Rate, and Incident_Rate.
    There are 17 missing values in Active, which can be filled with 0.

In [57]:
# Find which Province_State entries account for the nulls in each of the
# three testing/incident columns
print("State names with all nulls in three features: ")
for feature in ('People_Tested', 'Testing_Rate', 'Incident_Rate'):
    missing_mask = df_reidx[feature].isnull()
    print(df_reidx.loc[missing_mask, 'Province_State'].unique())

State names with all nulls in three features: 
['Diamond Princess' 'Grand Princess' 'Recovered']
['Diamond Princess' 'Grand Princess' 'Recovered']
['Diamond Princess' 'Grand Princess' 'Recovered']


In [60]:
# Inspect every daily row reported under 'Diamond Princess'
is_diamond_princess = df_reidx['Province_State'].eq('Diamond Princess')
df_reidx.loc[is_diamond_princess]

Unnamed: 0_level_0,Province_State,Confirmed,Deaths,Active,People_Tested,Testing_Rate,Incident_Rate
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-04-12,Diamond Princess,49,0,49.0,,,
2020-04-13,Diamond Princess,49,0,49.0,,,
2020-04-14,Diamond Princess,49,0,49.0,,,
2020-04-15,Diamond Princess,49,0,49.0,,,
2020-04-16,Diamond Princess,49,0,49.0,,,
...,...,...,...,...,...,...,...
2020-07-17,Diamond Princess,49,0,49.0,,,
2020-07-18,Diamond Princess,49,0,49.0,,,
2020-07-19,Diamond Princess,49,0,49.0,,,
2020-07-20,Diamond Princess,49,0,49.0,,,


In [61]:
# Inspect every daily row reported under 'Grand Princess'.
# The mask must come from df_reidx itself: the previous `df_idx` is not
# defined anywhere in this notebook and fails on a fresh kernel.
df_reidx[df_reidx['Province_State']=='Grand Princess']

Unnamed: 0_level_0,Province_State,Confirmed,Deaths,Active,People_Tested,Testing_Rate,Incident_Rate
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-04-12,Grand Princess,103,0,103.0,,,
2020-04-13,Grand Princess,103,0,103.0,,,
2020-04-14,Grand Princess,103,0,103.0,,,
2020-04-15,Grand Princess,103,0,103.0,,,
2020-04-16,Grand Princess,103,0,103.0,,,
...,...,...,...,...,...,...,...
2020-07-17,Grand Princess,103,3,100.0,,,
2020-07-18,Grand Princess,103,3,100.0,,,
2020-07-19,Grand Princess,103,3,100.0,,,
2020-07-20,Grand Princess,103,3,100.0,,,


In [63]:
# Summary statistics of confirmed cases across all rows, for comparison
df_reidx['Confirmed'].describe()

count      5818.000000
mean      32692.493469
std       60430.019162
min           0.000000
25%        2311.500000
50%       10975.000000
75%       35629.500000
max      407326.000000
Name: Confirmed, dtype: float64

    Compared with 'Confirmed' across the entire dataset, the Diamond Princess and Grand Princess
    cruise ship figures may not be representative of typical US numbers, so it seems reasonable
    to remove them, as well as 'Recovered', since that is not a state.

In [88]:
# Isolate the three non-state entries and count how many nulls they account for
non_state_names = ['Diamond Princess', 'Grand Princess', 'Recovered']
is_non_state = df_reidx['Province_State'].isin(non_state_names)
df_bad = df_reidx.loc[is_non_state]
df_bad.isna().sum().sum()

663

In [101]:
# Drop the two cruise-ship entries; everything else is kept
cruise_ships = ['Diamond Princess', 'Grand Princess']
clean_df = df_reidx.loc[~df_reidx['Province_State'].isin(cruise_ships)]
clean_df

Unnamed: 0,Province_State,Confirmed,Deaths,Active,People_Tested,Testing_Rate,Incident_Rate,date
0,Alabama,3563,93,3470.0,21583.0,460.300152,75.988020,2020-04-12
1,Alaska,272,8,264.0,8038.0,1344.711576,45.504049,2020-04-12
2,Arizona,3542,115,3427.0,42109.0,578.522286,48.662422,2020-04-12
3,Arkansas,1280,27,1253.0,19722.0,761.753354,49.439423,2020-04-12
4,California,22795,640,22155.0,190328.0,485.423868,58.137726,2020-04-12
...,...,...,...,...,...,...,...,...
5813,Virginia,78375,2031,66237.0,920461.0,10783.890236,918.221845,2020-07-21
5814,Washington,47743,1453,46290.0,809339.0,10628.369959,626.968757,2020-07-21
5815,West Virginia,5084,100,1518.0,234980.0,13111.647649,283.682086,2020-07-21
5816,Wisconsin,43018,846,9042.0,783866.0,13462.857630,738.831904,2020-07-21


In [103]:
# 'Recovered' is not a state, so remove those rows as well, then re-check nulls
not_recovered = clean_df['Province_State'].ne('Recovered')
clean_df = clean_df.loc[not_recovered]
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5600 entries, 0 to 5817
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Province_State  5600 non-null   object 
 1   Confirmed       5600 non-null   int64  
 2   Deaths          5600 non-null   int64  
 3   Active          5583 non-null   float64
 4   People_Tested   5600 non-null   float64
 5   Testing_Rate    5600 non-null   float64
 6   Incident_Rate   5600 non-null   float64
 7   date            5590 non-null   object 
dtypes: float64(4), int64(2), object(2)
memory usage: 393.8+ KB


In [105]:
# Show the remaining rows that have no date
clean_df[clean_df['date'].isna()]

Unnamed: 0,Province_State,Confirmed,Deaths,Active,People_Tested,Testing_Rate,Incident_Rate,date
55,American Samoa,0,0,0.0,3.0,5.391708,0.0,
58,Virgin Islands,51,1,50.0,350.0,326.285565,47.544468,
61,American Samoa,0,0,,3.0,5.391708,0.0,
120,American Samoa,0,0,,3.0,5.391708,0.0,
179,American Samoa,0,0,,3.0,5.391708,0.0,
238,American Samoa,0,0,,3.0,5.391708,0.0,
297,American Samoa,0,0,,3.0,5.391708,0.0,
356,American Samoa,0,0,,3.0,5.391708,0.0,
415,American Samoa,0,0,,3.0,5.391708,0.0,
474,American Samoa,0,0,,3.0,5.391708,0.0,


In [107]:
# All rows reported for American Samoa
is_american_samoa = clean_df['Province_State'].eq('American Samoa')
clean_df.loc[is_american_samoa]

Unnamed: 0,Province_State,Confirmed,Deaths,Active,People_Tested,Testing_Rate,Incident_Rate,date
55,American Samoa,0,0,0.0,3.0,5.391708,0.0,
61,American Samoa,0,0,,3.0,5.391708,0.0,
120,American Samoa,0,0,,3.0,5.391708,0.0,
179,American Samoa,0,0,,3.0,5.391708,0.0,
238,American Samoa,0,0,,3.0,5.391708,0.0,
...,...,...,...,...,...,...,...,...
5530,American Samoa,0,0,0.0,1037.0,1863.733578,0.0,2020-07-17
5588,American Samoa,0,0,0.0,1037.0,1863.733578,0.0,2020-07-18
5646,American Samoa,0,0,0.0,1037.0,1863.733578,0.0,2020-07-19
5704,American Samoa,0,0,0.0,1037.0,1863.733578,0.0,2020-07-20


In [108]:
# All rows reported for the Virgin Islands
is_virgin_islands = clean_df['Province_State'].eq('Virgin Islands')
clean_df.loc[is_virgin_islands]

Unnamed: 0,Province_State,Confirmed,Deaths,Active,People_Tested,Testing_Rate,Incident_Rate,date
58,Virgin Islands,51,1,50.0,350.0,326.285565,47.544468,
111,Virgin Islands,51,1,50.0,373.0,347.727188,47.544468,2020-04-13
170,Virgin Islands,51,1,50.0,377.0,351.456166,47.544468,2020-04-14
229,Virgin Islands,51,1,50.0,382.0,356.117388,47.544468,2020-04-15
288,Virgin Islands,51,1,50.0,408.0,380.355744,47.544468,2020-04-16
...,...,...,...,...,...,...,...,...
5580,Virgin Islands,249,6,123.0,5285.0,4926.912033,232.128873,2020-07-17
5638,Virgin Islands,263,6,131.0,6053.0,5642.875788,245.180296,2020-07-18
5696,Virgin Islands,283,6,144.0,6522.0,6080.098445,263.825186,2020-07-19
5754,Virgin Islands,283,6,142.0,6884.0,6417.570944,263.825186,2020-07-20


# STOPPED HERE: NEXT, FILL IN NANS FOR VIRGIN ISL, AM SAMOA(ZEROS)
Then, detect and separate outliers.