In [1]:
import pandas as pd
import datetime as dt


In [2]:
# Import the CSV and use the parse_dates parameter to parse the "Date" column as datetime,
# so the dates can be used in more meaningful analysis later.

# A forward slash is used as the separator: the previous "Data\sars..." literal relied on
# "\s", which is an invalid escape sequence (deprecated, and slated to become a syntax
# error), and a backslash is only treated as a path separator on Windows.
dataset_2 = "Data/sars_2003_complete_dataset_clean.csv"
sars_df = pd.read_csv(dataset_2, parse_dates=["Date"])
sars_df.head()

Unnamed: 0,Date,Country,Cumulative number of case(s),Number of deaths,Number recovered
0,2003-03-17,Germany,1,0,0
1,2003-03-17,Canada,8,2,0
2,2003-03-17,Singapore,20,0,0
3,2003-03-17,"Hong Kong SAR, China",95,1,0
4,2003-03-17,Switzerland,2,0,0


In [3]:
# Rename the columns so they are consistent when the data is loaded into a database.
# We found these metrics to be the most relevant for this dataset.

# Rename by label rather than by position: assigning a list to `.columns` would silently
# mislabel the data if the CSV column order ever changed. errors="raise" makes a missing
# source column fail loudly instead of being skipped.
column_renames = {
    'Cumulative number of case(s)': 'Confirmed_Cases',
    'Number of deaths': 'Deaths',
    'Number recovered': 'Recoveries',
}
sars_df = sars_df.rename(columns=column_renames, errors="raise")
sars_df.head()

Unnamed: 0,Date,Country,Confirmed_Cases,Deaths,Recoveries
0,2003-03-17,Germany,1,0,0
1,2003-03-17,Canada,8,2,0
2,2003-03-17,Singapore,20,0,0
3,2003-03-17,"Hong Kong SAR, China",95,1,0
4,2003-03-17,Switzerland,2,0,0


In [4]:
# Several different virus types will be loaded into the database, so tag every row
# of this dataset with its virus name in a new column.

sars_df = sars_df.assign(Virus_Name="Sars")
sars_df.head()

Unnamed: 0,Date,Country,Confirmed_Cases,Deaths,Recoveries,Virus_Name
0,2003-03-17,Germany,1,0,0,Sars
1,2003-03-17,Canada,8,2,0,Sars
2,2003-03-17,Singapore,20,0,0,Sars
3,2003-03-17,"Hong Kong SAR, China",95,1,0,Sars
4,2003-03-17,Switzerland,2,0,0,Sars


In [5]:
# Because the datasets use multiple date formats, we decided to separate the date into
# year/month/day columns to keep it consistent across the board.

# Build the DatetimeIndex once and pull each calendar component from it.
date_index = pd.DatetimeIndex(sars_df['Date'])
sars_df = sars_df.assign(
    Year=date_index.year,
    Month=date_index.month,
    Day=date_index.day,
)
sars_df.head()


Unnamed: 0,Date,Country,Confirmed_Cases,Deaths,Recoveries,Virus_Name,Year,Month,Day
0,2003-03-17,Germany,1,0,0,Sars,2003,3,17
1,2003-03-17,Canada,8,2,0,Sars,2003,3,17
2,2003-03-17,Singapore,20,0,0,Sars,2003,3,17
3,2003-03-17,"Hong Kong SAR, China",95,1,0,Sars,2003,3,17
4,2003-03-17,Switzerland,2,0,0,Sars,2003,3,17


Because each virus outbreak occurred over a different timeframe, we wanted a common metric for comparing the data. The next cell numbers each date by the days elapsed since the earliest date in the dataset (which becomes day 1), so records can be ordered from the first to the last confirmed case. We initially wanted to bin the data into groups such as 'first 30 days' or 'first 7 days', but this would limit the ability to query in the database.

In [6]:
# Number each row by days elapsed since the earliest date in the dataset (earliest = day 1).
# Series.min() is used instead of the builtin min(): it is vectorised and skips missing
# dates (NaT), whereas builtin min() iterates in Python and gives an order-dependent
# result when NaT is present.
first_report_date = sars_df['Date'].min()
sars_df['Day #'] = (sars_df['Date'] - first_report_date).dt.days + 1
sars_df.head()

Unnamed: 0,Date,Country,Confirmed_Cases,Deaths,Recoveries,Virus_Name,Year,Month,Day,Day #
0,2003-03-17,Germany,1,0,0,Sars,2003,3,17,1
1,2003-03-17,Canada,8,2,0,Sars,2003,3,17,1
2,2003-03-17,Singapore,20,0,0,Sars,2003,3,17,1
3,2003-03-17,"Hong Kong SAR, China",95,1,0,Sars,2003,3,17,1
4,2003-03-17,Switzerland,2,0,0,Sars,2003,3,17,1


In [7]:
# Check the naming conventions used for countries and make sure they are consistent
# before uploading to the database. Here, for instance, China is listed multiple times
# under different names.
country_names = sars_df["Country"].unique()
country_names

array(['Germany', 'Canada', 'Singapore', 'Hong Kong SAR, China',
       'Switzerland', 'Thailand', 'Viet Nam', 'China', 'Taiwan, China',
       'Slovenia', 'United Kingdom', 'Spain', 'United States', 'Italy',
       'Republic of Ireland', 'France', 'Romania', 'Australia', 'Belgium',
       'Brazil', 'Malaysia', 'Kuwait', 'Japan', 'South Africa',
       'Indonesia', 'Philippines', 'Sweden', 'India', 'Mongolia',
       'Bulgaria', 'Republic of Korea', 'Macao SAR, China', 'Poland',
       'New Zealand', 'Colombia', 'Finland', 'Russian Federation'],
      dtype=object)

In [10]:
# Normalise the long-form country labels to short, common names.
# A single mapping applied to the "Country" column replaces seven separate
# DataFrame-wide replace() calls, which scanned every column for each label and
# could have rewritten a matching value in an unrelated column.
country_renames = {
    "Taiwan, China": "Taiwan",
    "Hong Kong SAR, China": "Hong Kong",
    "Macao SAR, China": "Macao",
    "Russian Federation": "Russia",
    "Viet Nam": "Vietnam",
    "Republic of Ireland": "Ireland",
    "Republic of Korea": "South Korea",
}

sars_df = sars_df.assign(Country=sars_df["Country"].replace(country_renames))

sars_df

Unnamed: 0,Date,Country,Confirmed_Cases,Deaths,Recoveries,Virus_Name,Year,Month,Day,Day #
0,2003-03-17,Germany,1,0,0,Sars,2003,3,17,1
1,2003-03-17,Canada,8,2,0,Sars,2003,3,17,1
2,2003-03-17,Singapore,20,0,0,Sars,2003,3,17,1
3,2003-03-17,Hong Kong,95,1,0,Sars,2003,3,17,1
4,2003-03-17,Switzerland,2,0,0,Sars,2003,3,17,1
...,...,...,...,...,...,...,...,...,...,...
2533,2003-07-11,Switzerland,1,0,1,Sars,2003,7,11,117
2534,2003-07-11,Thailand,9,2,7,Sars,2003,7,11,117
2535,2003-07-11,United Kingdom,4,0,4,Sars,2003,7,11,117
2536,2003-07-11,United States,75,0,67,Sars,2003,7,11,117
