# Libraries

In [1]:
# libraries
# ----------

from io import StringIO
from pathlib import Path

import pandas as pd
import requests
import wget

# Downloading data

In [2]:
# Remove stale copies of the JHU time-series downloads before fetching again.
# Only files matching the download naming pattern are deleted, so unrelated
# CSV files in the working directory are left alone, and nothing fails when
# there is no previous download to remove.
# NOTE(review): presumably the clean-up exists because wget.download saves a
# renamed copy instead of overwriting an existing file - confirm.
for stale_csv in Path('.').glob('time_series_covid19_*.csv'):
    stale_csv.unlink()

# urls of the files
urls = ['https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv',
        'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv',
        'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv']

# download files
for url in urls:
    filename = wget.download(url)

100% [..............................................................................] 88689 / 88689

# Dataframes

In [3]:
# Load the three downloaded global time series (confirmed, deaths, recovered),
# one dataframe per file.
conf_df, deaths_df, recv_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        'time_series_covid19_confirmed_global.csv',
        'time_series_covid19_deaths_global.csv',
        'time_series_covid19_recovered_global.csv',
    )
)

In [4]:
# conf_df.head()
# deaths_df.head()
# recv_df.head()

In [5]:
conf_df.columns
# deaths_df.columns
# recv_df.columns

Index(['Province/State', 'Country/Region', 'Lat', 'Long', '1/22/20', '1/23/20',
       '1/24/20', '1/25/20', '1/26/20', '1/27/20',
       ...
       '5/6/20', '5/7/20', '5/8/20', '5/9/20', '5/10/20', '5/11/20', '5/12/20',
       '5/13/20', '5/14/20', '5/15/20'],
      dtype='object', length=119)

In [6]:
conf_df.columns[4:]

Index(['1/22/20', '1/23/20', '1/24/20', '1/25/20', '1/26/20', '1/27/20',
       '1/28/20', '1/29/20', '1/30/20', '1/31/20',
       ...
       '5/6/20', '5/7/20', '5/8/20', '5/9/20', '5/10/20', '5/11/20', '5/12/20',
       '5/13/20', '5/14/20', '5/15/20'],
      dtype='object', length=115)

# Merging dataframes

In [7]:
# Reshape each wide table (one column per date) into long format
# (one row per location per date).
dates = conf_df.columns[4:]  # every column after the four location columns


def _to_long(wide_df, value_name):
    """Melt a wide table into long form, storing the counts under value_name."""
    return wide_df.melt(
        id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'],
        value_vars=dates,
        var_name='Date',
        value_name=value_name,
    )


conf_df_long = _to_long(conf_df, 'Confirmed')
deaths_df_long = _to_long(deaths_df, 'Deaths')
recv_df_long = _to_long(recv_df, 'Recovered')

# drop every Canada row from the recovered table
is_canada = recv_df_long['Country/Region'] == 'Canada'
recv_df_long = recv_df_long[~is_canada]

# one shape per line, in the order confirmed / deaths / recovered
print(conf_df_long.shape, deaths_df_long.shape, recv_df_long.shape, sep='\n')

(30590, 6)
(30590, 6)
(28980, 6)


In [8]:
# Combine confirmed, deaths and recovered counts into one table keyed by
# location and date.
#
# Lat/Long are deliberately NOT join keys: they are floats carried by each
# source file separately, so any coordinate discrepancy between files makes
# the left join miss and leaves NaN counts for that location. The coordinates
# in the result come from the confirmed table only.
# NOTE(review): locations that previously ended up with NaN 'Recovered' for
# every date may have been such misses - confirm against the raw files.
join_keys = ['Province/State', 'Country/Region', 'Date']

full_table = pd.merge(left=conf_df_long,
                      right=deaths_df_long.drop(columns=['Lat', 'Long']),
                      how='left', on=join_keys)
full_table = pd.merge(left=full_table,
                      right=recv_df_long.drop(columns=['Lat', 'Long']),
                      how='left', on=join_keys)

full_table.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered
0,,Afghanistan,33.0,65.0,1/22/20,0,0,0.0
1,,Albania,41.1533,20.1683,1/22/20,0,0,0.0
2,,Algeria,28.0339,1.6596,1/22/20,0,0,0.0
3,,Andorra,42.5063,1.5218,1/22/20,0,0,0.0
4,,Angola,-11.2027,17.8739,1/22/20,0,0,0.0


In [9]:
full_table.shape

(30590, 8)

In [10]:
full_table.isna().sum()

Province/State    21275
Country/Region        0
Lat                   0
Long                  0
Date                  0
Confirmed             0
Deaths                0
Recovered          1955
dtype: int64

In [11]:
full_table[full_table['Recovered'].isna()]['Country/Region'].value_counts()

Canada         1610
Mozambique      115
Timor-Leste     115
Syria           115
Name: Country/Region, dtype: int64

In [12]:
full_table[full_table['Recovered'].isna()]['Date'].value_counts()

2/16/20    17
3/20/20    17
3/14/20    17
3/10/20    17
3/31/20    17
           ..
4/15/20    17
4/19/20    17
5/15/20    17
2/12/20    17
3/30/20    17
Name: Date, Length: 115, dtype: int64

In [13]:
# Treat a missing recovered count as zero and store the column as integers,
# then show the remaining NaN count per column.
full_table['Recovered'] = full_table['Recovered'].fillna(0).astype('int')
full_table.isna().sum()

Province/State    21275
Country/Region        0
Lat                   0
Long                  0
Date                  0
Confirmed             0
Deaths                0
Recovered             0
dtype: int64

# Preprocessing

In [14]:
# renaming
# ========

# country labels to rewrite, as {label in the source data: label to use}
country_renames = {'Korea, South': 'South Korea'}
full_table['Country/Region'] = full_table['Country/Region'].replace(country_renames)

In [15]:
# removing
# =======
# In both filters na=False makes rows without a province count as
# "does not contain", so they are kept.

# removing canada's recovered values
full_table = full_table[~full_table['Province/State'].str.contains('Recovered', na=False)]

# removing county wise data to avoid double counting
full_table = full_table[~full_table['Province/State'].str.contains(',', na=False)]

# Fixing incorrect data

In [16]:
# new values
# Maps a Province/State name to the 'Confirmed' count that should replace the
# downloaded figure; it is applied to the 2/12/20 rows via change_val below.
feb_12_conf = {'Hubei' : 34874}

In [17]:
# function to change value
def change_val(date, ref_col, val_col, dtnry, table=None):
    """Overwrite values in `val_col` for selected rows on a single date.

    Parameters
    ----------
    date : value compared for equality with the 'Date' column.
    ref_col : name of the column used to pick rows (e.g. 'Province/State').
    val_col : name of the column whose values are overwritten.
    dtnry : dict mapping a value of `ref_col` to the new `val_col` value.
    table : DataFrame to modify in place. Defaults to the notebook-level
        `full_table`, so existing four-argument calls behave as before.

    Returns
    -------
    None. The table is modified in place.
    """
    if table is None:
        table = full_table
    # the date filter is the same for every key, so build it once
    on_date = table['Date'] == date
    for key, val in dtnry.items():
        table.loc[on_date & (table[ref_col] == key), val_col] = val

In [18]:
# changing values
change_val('2/12/20', 'Province/State', 'Confirmed', feb_12_conf)

In [19]:
# checking values
full_table[(full_table['Date']=='2/12/20') & (full_table['Province/State']=='Hubei')]

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered
5648,Hubei,China,30.9756,112.2707,2/12/20,34874,1068,2686


# Saving final data

In [20]:
full_table.to_csv('covid_19_clean_complete.csv', index=False)

# Country wise data

In [21]:
# china
# =====

# All rows for China, written to their own file.
# (The former bare `.unique()` expression in the middle of the cell was never
# displayed or stored, so it has been dropped.)
china_province_wise = full_table[full_table['Country/Region']=='China']
china_province_wise.to_csv('china_province_wise.csv', index=False)

In [22]:
# Australia
# =========

# All rows for Australia, written to their own file.
# (The former bare `.unique()` expression in the middle of the cell was never
# displayed or stored, so it has been dropped.)
australia_state_wise = full_table[full_table['Country/Region']=='Australia']
australia_state_wise.to_csv('australia_state_wise.csv', index=False)

In [23]:
# Canada
# ======

# All rows for Canada, written to their own file.
# (The former bare `.unique()` expression in the middle of the cell was never
# displayed or stored, so it has been dropped.)
canada_state_wise = full_table[full_table['Country/Region']=='Canada']
canada_state_wise.to_csv('canada_state_wise.csv', index=False)

# USA data

In [24]:
# US time series to fetch: confirmed cases and deaths
urls = [
    'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv',
    'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv',
]

# fetch each file into the working directory
for url in urls:
    filename = wget.download(url)

100% [..........................................................................] 1104566 / 1104566

In [25]:
# Load the two downloaded US time series, one dataframe per file.
us_conf_df, us_deaths_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        'time_series_covid19_confirmed_US.csv',
        'time_series_covid19_deaths_US.csv',
    )
)

In [26]:
# us_conf_df.head()
# us_deaths_df.head()

In [27]:
# us_conf_df.columns
# us_deaths_df.columns

In [28]:
# In the confirmed table the first 11 columns identify a location and every
# later column is a date; the same split is applied to the deaths table.
ids, us_dates = us_conf_df.columns[0:11], us_conf_df.columns[11:]

# melt settings shared by both tables: one row per location per date
melt_opts = dict(id_vars=ids, value_vars=us_dates, var_name='Date')

us_conf_df_long = us_conf_df.melt(value_name='Confirmed', **melt_opts)
us_deaths_df_long = us_deaths_df.melt(value_name='Deaths', **melt_opts)

In [29]:
us_conf_df_long.head()
# us_deaths_df_long.head()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,Date,Confirmed
0,16,AS,ASM,16,60.0,,American Samoa,US,-14.271,-170.132,"American Samoa, US",1/22/20,0
1,316,GU,GUM,316,66.0,,Guam,US,13.4443,144.7937,"Guam, US",1/22/20,0
2,580,MP,MNP,580,69.0,,Northern Mariana Islands,US,15.0979,145.6739,"Northern Mariana Islands, US",1/22/20,0
3,630,PR,PRI,630,72.0,,Puerto Rico,US,18.2208,-66.5901,"Puerto Rico, US",1/22/20,0
4,850,VI,VIR,850,78.0,,Virgin Islands,US,18.3358,-64.8963,"Virgin Islands, US",1/22/20,0


In [30]:
print(us_conf_df_long.shape)
print(us_deaths_df_long.shape)

(375015, 13)
(375015, 13)


In [31]:
ft_ids = us_conf_df_long.columns[:-1]
ft_ids

Index(['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Province_State',
       'Country_Region', 'Lat', 'Long_', 'Combined_Key', 'Date'],
      dtype='object')

In [32]:
# The two long tables are joined by row position, so the Deaths column is only
# correct if both tables hold the same UID/Date in the same row.
# Fail loudly rather than silently pairing counts with the wrong rows.
if not us_conf_df_long[['UID', 'Date']].equals(us_deaths_df_long[['UID', 'Date']]):
    raise ValueError('US confirmed and deaths tables are not row-aligned on UID/Date')

us_full_table = pd.concat([us_conf_df_long, us_deaths_df_long[['Deaths']]], axis=1)
us_full_table.head()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,Date,Confirmed,Deaths
0,16,AS,ASM,16,60.0,,American Samoa,US,-14.271,-170.132,"American Samoa, US",1/22/20,0,0
1,316,GU,GUM,316,66.0,,Guam,US,13.4443,144.7937,"Guam, US",1/22/20,0,0
2,580,MP,MNP,580,69.0,,Northern Mariana Islands,US,15.0979,145.6739,"Northern Mariana Islands, US",1/22/20,0,0
3,630,PR,PRI,630,72.0,,Puerto Rico,US,18.2208,-66.5901,"Puerto Rico, US",1/22/20,0,0
4,850,VI,VIR,850,78.0,,Virgin Islands,US,18.3358,-64.8963,"Virgin Islands, US",1/22/20,0,0


In [33]:
us_full_table.to_csv('usa_county_wise.csv', index=False)

# Tests data from https://www.worldometers.info/coronavirus/

In [34]:
url = 'https://www.worldometers.info/coronavirus/'

# browser-like headers sent with the request
header = {
  "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
  "X-Requested-With": "XMLHttpRequest"
}

# timeout: without one, the call can block indefinitely on an unresponsive server
r = requests.get(url, headers=header, timeout=30)
# stop here on an HTTP error status instead of parsing an error page as data
r.raise_for_status()

In [35]:
# Parse the first HTML table on the downloaded page.
# read_html is given a file-like object because passing the raw HTML string
# directly is deprecated in recent pandas releases.
tests = pd.read_html(StringIO(r.text))[0]
tests.head()

Unnamed: 0,#,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/ 1M pop,Population
0,,World,4628724,7310.0,308645.0,491.0,1759589.0,2560490.0,45008.0,594.0,39.6,,,
1,1.0,USA,1484285,,88507.0,,327751.0,1068027.0,16139.0,4488.0,268.0,11090900.0,33532.0,330758784.0
2,2.0,Spain,274367,,27459.0,,188967.0,57941.0,1320.0,5868.0,587.0,2467761.0,52783.0,46752556.0
3,3.0,Russia,262843,,2418.0,,58226.0,202199.0,2300.0,1801.0,17.0,6413948.0,43953.0,145926781.0
4,4.0,UK,236711,,33998.0,,,,1559.0,3489.0,501.0,2353078.0,34685.0,67841324.0


In [36]:
tests.columns

Index(['#', 'Country,Other', 'TotalCases', 'NewCases', 'TotalDeaths',
       'NewDeaths', 'TotalRecovered', 'ActiveCases', 'Serious,Critical',
       'Tot Cases/1M pop', 'Deaths/1M pop', 'TotalTests', 'Tests/ 1M pop',
       'Population'],
      dtype='object')

In [37]:
# Positional rename of all 14 scraped columns to friendlier labels.
# The list must have exactly one entry per column, in the scraped order.
# (The last label was previously misspelled 'Populatio'.)
tests.columns = ['#', 'Country', 'TotalCases', 'NewCases', 'TotalDeaths', 'NewDeaths',
       'TotalRecovered', 'ActiveCases', 'Serious,Critical', 'Cases per 1M pop',
       'Deaths per 1M pop', 'Total Tests', 'Tests per 1M pop', 'Population']

In [38]:
# Keep the country name plus the per-capita and testing figures.
# The label slice runs from index 1 to len(tests)-2 inclusive; with the
# default 0..n-1 index that drops the first and the last row.
wanted_cols = ['Country', 'Cases per 1M pop', 'Deaths per 1M pop', 'Total Tests', 'Tests per 1M pop']
last_kept = len(tests) - 2
tests = tests.loc[1:last_kept, wanted_cols]

In [39]:
# tests.columns = ['Country', 'Cases/1M Pop', 'Deaths/1M Pop', 'Total Tests', 'Tests/1M Pop']
tests

Unnamed: 0,Country,Cases per 1M pop,Deaths per 1M pop,Total Tests,Tests per 1M pop
1,USA,4488.0,268.0,11090900.0,33532.0
2,Spain,5868.0,587.0,2467761.0,52783.0
3,Russia,1801.0,17.0,6413948.0,43953.0
4,UK,3489.0,501.0,2353078.0,34685.0
5,Italy,3702.0,523.0,2875680.0,47553.0
...,...,...,...,...,...
211,Western Sahara,10.0,,,
212,Anguilla,200.0,,,
213,Lesotho,0.5,,,
214,Saint Pierre Miquelon,173.0,,,


In [40]:
tests.to_csv('tests.csv', index=False)