# Imports & Loading Data

In [2]:
import numpy as np
import pandas as pd
#import pycountry_convert as pc

pd.set_option('display.max_rows', 1000)
# dti = pd.to_datetime(['1/1/2018', np.datetime64('2018-01-01'),datetime.datetime(2018, 1, 1)])
%run ./data_fetching_part01.ipynb # download latest data available
covid_data = pd.read_csv('./alldays_data.csv', parse_dates= ['Last_Update'],
date_parser = pd.to_datetime) # adjust later code for parsing date here

class display(object):
    """Display HTML representation of multiple objects.

    Each positional argument is a string holding a Python expression
    (typically a variable name). The expression text is shown as a caption
    next to the evaluated object's representation.

    Expressions are evaluated in the namespace of the module that defines
    this class (the notebook's namespace when the class is defined in a
    cell), not in the scope of the rendering method. A variable called
    ``a`` or ``self`` therefore resolves to the user's object and is not
    shadowed by a local of this class.
    """
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""

    def __init__(self, *args):
        self.args = args

    @staticmethod
    def _resolve(expr):
        """Evaluate ``expr`` against the defining module's globals only."""
        # NOTE(review): eval executes arbitrary code; pass only expressions
        # you wrote yourself, never text taken from data or user input.
        return eval(expr, globals())

    def _repr_html_(self):
        return '\n'.join(self.template.format(expr, self._resolve(expr)._repr_html_())
                         for expr in self.args)

    def __repr__(self):
        return '\n\n'.join(expr + '\n' + repr(self._resolve(expr))
                           for expr in self.args)


Successfully saved: 08-16-2021.csv
Successfully saved: 08-17-2021.csv
Successfully saved: 08-18-2021.csv
Successfully saved: 08-19-2021.csv
Successfully saved: 08-20-2021.csv
Successfully saved: 08-21-2021.csv
Successfully saved: 08-22-2021.csv
Successfully saved: 08-23-2021.csv
Successfully saved: 08-24-2021.csv
Successfully saved: 08-25-2021.csv
Successfully saved: 08-26-2021.csv
Successfully saved: 08-27-2021.csv
Successfully saved: 08-28-2021.csv
Successfully saved: 08-29-2021.csv
Successfully saved: 08-30-2021.csv
Successfully saved: 08-31-2021.csv
Successfully saved: 09-01-2021.csv
Successfully saved: 09-02-2021.csv
Successfully saved: 09-03-2021.csv
Successfully saved: 09-04-2021.csv
Successfully saved: 09-05-2021.csv
Successfully saved: 09-06-2021.csv
Successfully saved: 09-07-2021.csv
Successfully saved: 09-08-2021.csv
Successfully saved: 09-09-2021.csv
Successfully saved: 09-10-2021.csv
Successfully saved: 09-11-2021.csv
Successfully saved: 09-12-2021.csv
Successfully saved: 

  exec(code_obj, self.user_global_ns, self.user_ns)


## Fixing String columns 
* apply country-name corrections
* strip leading and trailing whitespace
* fill missing province with the country name
* fill missing combined_key

In [3]:
# Country-name corrections: each alternative spelling found in the source data
# is mapped onto the single name used in the rest of this notebook.
some_corrections = {
    'Mainland China': 'China',
    'US': 'USA',
    'Korea, South': 'South Korea',
    'Taiwan*': 'Taiwan',
    'Congo (Kinshasa)': "Democratic Republic of the Congo",
    "Cote d'Ivoire": "Côte d'Ivoire",
    "Reunion": "Réunion",
    'UK': 'United Kingdom',
    'Congo (Brazzaville)': 'Republic of the Congo',
    'Bahamas, The': 'Bahamas',
    'Gambia, The': 'Gambia',
    'The Gambia': 'Gambia',
    'West Bank and Gaza': 'Palestine',
    'Burma': "Myanmar",
    'Timor-Leste': "East Timor",
    'Republic of Korea': 'South Korea',
    'Iran (Islamic Republic of)': 'Iran',
    'Viet Nam': 'Vietnam',
    'Hong Kong SAR': 'Hong Kong',
    'Russian Federation': 'Russia',
    'occupied Palestinian territory': 'Palestine',
    'The Bahamas': 'Bahamas',
    'Macao SAR': 'Macau',
    'Republic of Ireland': 'Ireland',
}

covid_data['Country'] = covid_data['Country'].replace(to_replace=some_corrections)

# The literal string 'None' in Province is treated as a missing value.
covid_data.loc[covid_data['Province'] == 'None', 'Province'] = np.nan
# Strip leading and trailing whitespace from the string columns.
text_cols = ['Country', 'Province', 'Combined_Key']
covid_data[text_cols] = covid_data[text_cols].apply(lambda col: col.str.strip())
# Fill missing provinces with the name of the country. `fillna` with an aligned
# Series does this column-wise, instead of calling a Python lambda once per row.
covid_data['Province'] = covid_data['Province'].fillna(covid_data['Country'])
def fillna_combined_key(row):
    """Fill a missing 'Combined_Key' of one record and return that record.

    `row` is anything indexable by the keys 'Country', 'Province' and
    'Combined_Key' (e.g. one DataFrame row). A key that is already present is
    left alone. Otherwise it becomes the country alone when province and
    country are equal, and "<country>, <province>" in every other case.
    The record is modified in place and the same object is returned.
    """
    if not pd.isna(row['Combined_Key']):
        return row

    country = row['Country']
    province = row['Province']
    if country == province:
        # no province-level data: the country name alone is the key
        row['Combined_Key'] = country
    else:
        row['Combined_Key'] = country + ', ' + province
    return row

# Fill only the missing combined keys, column-wise rather than by rebuilding the
# whole frame one row at a time: the country alone when province equals country,
# "<country>, <province>" otherwise. Rows whose country is missing keep a missing
# key instead of raising a TypeError on the string concatenation.
_country = covid_data['Country']
_province = covid_data['Province']
_composed_key = _country.where(_country == _province, _country + ', ' + _province)
covid_data['Combined_Key'] = covid_data['Combined_Key'].fillna(_composed_key)

In [4]:
# JHU (the data source) switched to more disaggregated reporting from 1 Feb in
# some countries, e.g. USA, Canada, Australia. To keep that from distorting the
# new-case figures derived from the accumulated counts, everything before 1 Feb
# is dropped, except for China (plus Macau and Hong Kong), which accounted for
# the majority of cases at the time.
from_february = covid_data['Last_Update'] >= "2020-02-01 00:00:00"
china_region = covid_data['Country'].isin(['China', 'Macau', 'Hong Kong'])
covid_data = covid_data[from_february | china_region]

## Arranging data
* Arranging Columns
* Two dataframes (USA vs. rest of the world)


In [5]:
# Lower-case every column name, give three columns shorter names, then keep the
# columns used below in a fixed order.
covid_data = (
    covid_data
    .rename(columns=str.lower)
    .rename(columns={'province': 'state', 'last_update': 'date', 'combined_key': 'location'})
    [['country', 'state', 'date', 'confirmed', 'deaths', 'recovered', 'active', 'location', 'fips']]
)

#### World data

In [6]:
# Every non-USA row; `fips` is only relevant for USA, so it is dropped here.
not_usa = covid_data['country'] != 'USA'
df_world = covid_data.loc[not_usa].drop(columns='fips')
df_world.head()

Unnamed: 0,country,state,date,confirmed,deaths,recovered,active,location
0,China,Anhui,2020-01-22 17:00:00,1.0,,,,"China, Anhui"
1,China,Beijing,2020-01-22 17:00:00,14.0,,,,"China, Beijing"
2,China,Chongqing,2020-01-22 17:00:00,6.0,,,,"China, Chongqing"
3,China,Fujian,2020-01-22 17:00:00,1.0,,,,"China, Fujian"
4,China,Gansu,2020-01-22 17:00:00,,,,,"China, Gansu"


In [6]:
# One row per (country, state, timestamp): when the same timestamp was reported
# more than once, the last report is kept.
df_world = df_world.drop_duplicates(subset=['country', 'state', 'date'], keep='last')
# numeric columns
num_cols = ['confirmed', 'deaths', 'recovered', 'active']
df_world[num_cols] = df_world[num_cols].fillna(0)

# The former per-group loop only ever saw rows with a non-missing country and
# state (groupby skips missing keys); drop the others explicitly so the result
# contains the same rows.
df_world = df_world.dropna(subset=['country', 'state'])

# The differences below are taken between consecutive rows of a group, so the
# rows must be in chronological order within each (country, state).
df_world = df_world.sort_values(by=['country', 'state', 'date']).reset_index(drop=True)

# Per-report new cases from the accumulated counts. `diff` within each group
# leaves the group's first row empty; that row's new-case value equals its
# accumulated value, which `fillna` takes from the accumulated column.
# 'active' (the last entry of num_cols) gets no daily column.
grouped = df_world.groupby(['country', 'state'])
for col in num_cols[:-1]:
    df_world['daily_' + col] = grouped[col].diff(1).fillna(df_world[col])

#### Per Country Cases

In [8]:
# Per-country cases, built in two grouping steps.
# Step 1: one row per (country, state) -- last accumulated values, median of
# 'active', and the sum of each daily column.
state_level_spec = {
    'confirmed': 'last',
    'deaths': 'last',
    'recovered': 'last',
    'active': 'median',
    'daily_confirmed': 'sum',
    'daily_deaths': 'sum',
    'daily_recovered': 'sum',
}
per_state_cases = df_world.groupby(['country', 'state'], as_index=False).agg(state_level_spec)
# Step 2: add the states up to country level (a country with a single state
# row keeps that row's values).
country_totals = per_state_cases.groupby('country', as_index=False).sum()
per_country_cases = country_totals[['country', 'confirmed', 'deaths', 'recovered', 'active']]

In [9]:
per_country_cases.head(n=5)

Unnamed: 0,country,confirmed,deaths,recovered,active
0,Afghanistan,58730.0,2572.0,52392.0,5273.0
1,Albania,130409.0,2372.0,105016.0,5295.0
2,Algeria,120736.0,3198.0,84167.0,13555.5
3,Andorra,13024.0,124.0,12458.0,413.0
4,Angola,25492.0,577.0,23092.0,1125.0


#### daily cases

In [10]:
# Drop the time-of-day part so that all reports of one day share the same date.
df_world['date'] = df_world['date'].dt.normalize()

daily_cols = ['daily_confirmed', 'daily_deaths', 'daily_recovered']
case_cols = [name[len('daily_'):] for name in daily_cols]  # 'confirmed', 'deaths', 'recovered'

# Sum per (country, state, date), keep the daily columns plus 'active', and
# strip the 'daily_' prefix from the column names.
df_daily = (
    df_world
    .groupby(['country', 'state', 'date'], as_index=False)
    .sum()
    [['country', 'state', 'date'] + daily_cols + ['active']]
    .rename(columns=dict(zip(daily_cols, case_cols)))
)

# Negative daily numbers (presumably reporting errors) are blanked out, and any
# row with a blank in one of the three case columns is then removed.
df_daily[case_cols] = df_daily[case_cols].mask(df_daily[case_cols] < 0)
df_daily = df_daily.dropna(subset=case_cols)

In [11]:
df_daily.head(n=5)

Unnamed: 0,country,state,date,confirmed,deaths,recovered,active
0,Afghanistan,Afghanistan,2020-02-24,1.0,0.0,0.0,0.0
1,Afghanistan,Afghanistan,2020-03-08,3.0,0.0,0.0,0.0
2,Afghanistan,Afghanistan,2020-03-10,1.0,0.0,0.0,0.0
3,Afghanistan,Afghanistan,2020-03-11,2.0,0.0,0.0,0.0
4,Afghanistan,Afghanistan,2020-03-14,4.0,0.0,0.0,0.0


#### Adding continent column
* Better after aggregation

In [12]:
# Continent code -> continent name; 'na' is the code used for countries that
# could not be matched to a continent.
continents = dict([
    ('NA', 'North America'),
    ('SA', 'South America'),
    ('AS', 'Asia'),
    ('OC', 'Australia'),
    ('AF', 'Africa'),
    ('EU', 'Europe'),
    ('na', 'Others'),
])

def country_to_continent_code(country):
    """Return the continent code for a country name, or "na" if unavailable.

    The lookup goes through the `pc` module (pycountry_convert): country name
    -> alpha-2 code -> continent code. Any lookup failure yields "na".

    When `pc` itself is not defined (its import is missing or commented out),
    the function still returns "na" so the notebook keeps running, but it
    emits a RuntimeWarning: in that situation *every* country ends up as "na",
    which used to go unnoticed behind a bare ``except:``.
    """
    try:
        alpha2 = pc.country_name_to_country_alpha2(country)
        return pc.country_alpha2_to_continent_code(alpha2)
    except NameError:
        import warnings
        warnings.warn(
            "`pc` (pycountry_convert) is not imported; "
            "every country will get the continent code 'na'",
            RuntimeWarning,
            stacklevel=2,
        )
        return "na"
    except Exception:
        # Unknown or unmapped country (presumably a KeyError from the library;
        # kept broad to stay best-effort). Unlike a bare `except:`, this does
        # not swallow KeyboardInterrupt or SystemExit.
        return "na"

# Insert a leading 'continent' column into both frames, derived from 'country'.
def _continent_name(country):
    """Continent name for a country name, via its continent code."""
    return continents[country_to_continent_code(country)]

for _frame in (df_daily, per_country_cases):
    _frame.insert(0, "continent", _frame['country'].map(_continent_name))

### per country cases: from worldometer

In [13]:
# worldometer column name -> column name used in this notebook
columns = {
    'Continent': 'continent',
    'Country Other': 'country',
    'TotalCases': 'confirmed',
    'TotalDeaths': 'deaths',
    'TotalRecovered': 'recovered',
    'ActiveCases': 'active',
    'TotalTests': 'tests',
    'Population': 'population',
}
df_other = (
    pd.read_csv('./world_worldometer.csv')
    .rename(columns=columns)
    [list(columns.values())]   # keep only the renamed columns, in mapping order
    .iloc[1:]                  # skip the first row -- presumably an aggregate row; TODO confirm
    .sort_values('country')
)
df_other.head()

Unnamed: 0,continent,country,confirmed,deaths,recovered,active,tests,population
103,Asia,Afghanistan,59021,2592.0,52489,3940,395439.0,39638567.0
83,Europe,Albania,130537,2378.0,105728,22431,622711.0,2875230.0
84,Africa,Algeria,120922,3207.0,84299,33416,230861.0,44493653.0
137,Europe,Andorra,13060,124.0,12491,445,193595.0,77366.0
120,Africa,Angola,25609,579.0,23092,1938,455499.0,33691594.0


#### USA
* Stopped working on the JHU data for the USA: some of it seems to be cumulative while the rest is new cases, and it is not clear how to handle that.
* Using worldometer data instead, as a cross-section of the latest USA data.

In [14]:
# worldometer column name -> column name used in this notebook
columns = {
    'USAState': 'state',
    'TotalCases': 'confirmed',
    'TotalDeaths': 'deaths',
    'ActiveCases': 'active',
    'TotalTests': 'tests',
}
df_us = (
    pd.read_csv('./usa_worldometer.csv')
    .rename(columns=columns)
    .assign(
        date=pd.Timestamp.today().normalize(),  # stamped with the day the notebook is run
        continent='North America',
        country='USA',
    )
    [['date', 'continent', 'country', 'state', 'confirmed', 'deaths', 'active', 'tests']]
    .iloc[1:]  # skip the first row -- presumably an aggregate row; TODO confirm
)
df_us.head() # can be used for state-level analysis

Unnamed: 0,date,continent,country,state,confirmed,deaths,active,tests
1,2021-04-26,North America,USA,California,3732256,61479,1687173.0,59095717
2,2021-04-26,North America,USA,Texas,2877774,50176,91177.0,27678766
3,2021-04-26,North America,USA,Florida,2208584,34861,414122.0,27309151
4,2021-04-26,North America,USA,New York,2077439,52242,585139.0,50361096
5,2021-04-26,North America,USA,Illinois,1321033,24139,97476.0,22269555


In [15]:
# Country-level USA figures: the state rows added up.
us_row = df_us[['confirmed', 'deaths', 'active']].sum()
us_row['country'] = 'USA'
us_row['continent'] = 'North America'

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the supported replacement. The Series becomes a one-row frame first, and
# infer_objects restores numeric dtypes lost by mixing numbers and strings.
us_frame = us_row.to_frame().T.infer_objects()
per_country_cases = pd.concat([per_country_cases, us_frame], ignore_index=True, sort=False)
per_country_cases = per_country_cases.sort_values('country')

# NOTE(review): df_us holds worldometer totals (see its column mapping), while
# the case columns of df_daily were derived as per-day differences -- confirm
# that mixing both in one frame is intended.
df_daily = pd.concat([df_daily, df_us], sort=False, ignore_index=True)
df_daily = df_daily.sort_values(['country', 'state', 'date'])

# Saving Cleaned Data to csv

In [16]:
import os

# Create the output folder on first use.
if not os.path.exists('./cleaned_data'):
    os.mkdir('cleaned_data')

# (frame, destination) pairs, listed in the order their confirmations are printed.
_outputs = [
    (df_other, './cleaned_data/allcountries_worldometer.csv'),
    (per_country_cases, './cleaned_data/per_country_aggregate.csv'),
    (df_daily, './cleaned_data/daily_disagg.csv'),
    (df_us, './cleaned_data/usa_states.csv'),
]
for _frame, _path in _outputs:
    _frame.to_csv(_path, index=False)
    print('Successfully saved: ' + _path)

Successfully saved: ./cleaned_data/allcountries_worldometer.csv
Successfully saved: ./cleaned_data/per_country_aggregate.csv
Successfully saved: ./cleaned_data/daily_disagg.csv
Successfully saved: ./cleaned_data/usa_states.csv


#   ------------------------------ DRAFT ---------------------------------

In [16]:
#### maybe not needed after fillna state
# #knowing which countries have province level data
# countries = covid_data['country'].unique()
# cntry_level = []
# province_level = []
# for cntry in countries:
#     filt = (covid_data['country'] == cntry)
#     only_cntry_level = covid_data.loc[filt, 'Province'].isna().all()
#     if only_cntry_level:
#         cntry_level.append(cntry)
#     else:
#         province_level.append(cntry)

# province_level.remove('USA')    # has different data source, look below
# #province_level.remove('Macao SAR')
# province_level.remove('Macau')
# province_level.remove('Taipei and environs')
# province_level.remove('Taiwan')
# cntry_level.append('Taiwan')
# province_level.remove('Hong Kong')
# cntry_level.append('Hong Kong')
# province_level.remove('Israel')
# cntry_level.append('Israel')
# province_level.remove('Cruise Ship')
# province_level.remove('Others')

    
# print('Countries with no province data: \n', sorted(cntry_level), 
#     '\n\n', 'Countries with province data: \n', sorted(province_level))

# Function returning data per country, for countries with no subnational levels

In [17]:
# def covid_by_country(country_='Egypt'):
#     """ return a dataframe of daily cases in country_"""
#     if country_ in cntry_level:
#         covid_country = covid_data.loc[covid_data['Country']==country_]
#         covid_country.sort_values(by='Date', inplace=True, ignore_index=True)
#         covid_country['report_id'] = covid_country.groupby('Date').ngroup()
#         #old way
#         # covid_country['report_id'] = covid_country.groupby('Date').cumcount()==0).astype(int).cumsum()
#         covid_country = covid_country.groupby('report_id').tail(1) # last update within a day nth(-1) or nth([-1])
#         # covid_country = covid_country.groupby([pd.Grouper(freq='1d', key='date')]).tail() # if 'date' is datetime not object
#         covid_country.rename(columns={'Confirmed':'AccConfirmed',
#                             'Deaths': 'AccDeaths', 'Recovered':'AccRecovered'}, inplace=True)
#         covid_country.set_index('Date', inplace=True)

#         covid_country['confirmed'] = covid_country['AccConfirmed'].diff(1)
#         covid_country.loc[covid_country.index[0], 'confirmed'] = covid_country.loc[covid_country.index[0], 'AccConfirmed']
#         covid_country['deaths'] = covid_country['AccDeaths'].diff(1)
#         covid_country.loc[covid_country.index[0], 'deaths'] = covid_country.loc[covid_country.index[0], 'AccDeaths']
#         covid_country['recovered'] = covid_country['AccRecovered'].diff(1)
#         covid_country.loc[covid_country.index[0], 'recovered'] = covid_country.loc[covid_country.index[0], 'AccRecovered']

#         covid_country.loc[covid_country['confirmed'] < 0, 'confirmed'] = np.nan
#         covid_country.loc[covid_country['deaths'] < 0, 'deaths'] = np.nan
#         covid_country.loc[covid_country['recovered'] < 0, 'recovered'] = np.nan

#         covid_country.dropna(thresh=2, inplace=True, subset=['AccConfirmed', 'AccDeaths',
#                                             'AccRecovered', 'confirmed', 'deaths', 'recovered'])
#         covid_country = covid_country.reindex(columns = ['AccConfirmed', 'AccDeaths', 'AccRecovered', 'confirmed', 'deaths', 'recovered', 'Latitude', 'Longitude'])

#         return covid_country

# egy_covid = covid_by_country()
# egy_covid.head()

In [18]:
# egy_covid.groupby('Date').expanding().agg('count') # check later to create id

# Function returning data per country, for countries with subnational levels

In [19]:
# def covid_by_country_with_states(country_='Australia'):
#     """ return a dataframe of daily cases in country_"""
#     if country_ in province_level:
#         covid_country = covid_data.loc[covid_data['Country']==country_]
#         # a placeholder for nan in provinces
#         covid_country['Province']=covid_country.apply(lambda x: x['Country'] if pd.isnull(x['Province']) else x['Province'],axis=1)
        
#         covid_country.sort_values(by=['Province', 'Date'], inplace=True)
#         covid_country['report_id'] = (covid_country.groupby(['Province', 'Date']).cumcount()==0).astype(int).cumsum()
#         covid_country = covid_country.groupby('report_id', as_index=False).last() # last update within a day #last or first could be ok
#         # instead of reindexing as_index argument for groupby is better
#         ##covid_country.reset_index(drop=True, inplace=True)
#         covid_country.rename(columns={'Confirmed':'AccConfirmed',
#                              'Deaths': 'AccDeaths', 'Recovered':'AccRecovered'}, inplace=True)
#         covid_country.set_index(['Province', 'Date'], inplace=True)
    
#         grouped_data = covid_country.groupby(level='Province')
#         all_provinces_df = []
#         for prov, df in grouped_data:
#             df['confirmed'] = df['AccConfirmed'].diff(1)
#             df.loc[df.index[0], 'confirmed'] = df.loc[df.index[0], 'AccConfirmed']
#             df['deaths'] = df['AccDeaths'].diff(1)
#             df.loc[df.index[0], 'deaths'] = df.loc[df.index[0], 'AccDeaths']
#             df['recovered'] = df['AccRecovered'].diff(1)
#             df.loc[df.index[0], 'recovered'] = df.loc[df.index[0], 'AccRecovered']

#             # df.loc[df['confirmed'] < 0, 'new_confirmed'] = np.nan
#             # df.loc[df['deaths'] < 0, 'new_deaths'] = np.nan
#             # df.loc[df['recovered'] < 0, 'new_recovered'] = np.nan

#             all_provinces_df.append(df)
        
#         all_df = pd.concat(all_provinces_df, sort=True, verify_integrity=True)
#         # all_df.dropna(thresh=2, inplace=True, subset=['AccConfirmed', 'AccDeaths',
#         #                                         'AccRecovered', 'confirmed', 'deaths', 'recovered'])
#         all_df = all_df.reindex(columns = ['AccConfirmed', 'AccDeaths', 'AccRecovered', 
#                                           'confirmed', 'deaths', 'recovered', 'Latitude', 'Longitude'])

#         return all_df
# ausy = covid_by_country_with_states()
# ausy.head()

In [20]:
# # It seems working for all countries except USA as it has more disaggregated data ; tried with Italy, China, Australia
# # Australia
# ## disaggregated
# # aus_covid_disagg = covid_by_country_with_states()
# ## by state
# # check grouping with NaN as it is dropped
# aus_covid_by_state = aus_covid_disagg.groupby(level='Province')['confirmed', 'deaths', 'recovered'].sum()
# ## by date
# # aus_covid_by_date = aus_covid_disagg.groupby(level='Date')['confirmed', 'deaths', 'recovered'].sum()
# # aus_covid_disagg.head(1000)

# # {'Australian Capital Territory': 'ACT',
# #  'External territories': 'ET',
# #  'From Diamond Princess': 'FDP',
# #  'Jervis Bay Territory': 'JBT',
# #  'New South Wales': 'NSW',
# #  'Northern Territory': 'NT',
# #  'Queensland': 'QLD',
# #  'South Australia': 'SA',
# #  'Tasmania': 'TAS',
# #  'Victoria': 'VIC',
# #  'Western Australia': 'WA',
# #  'Australia': 'AUS'}

In [21]:
# # Final dataset for all countries showing daily cases
# xs = covid_final.groupby('Country').sum().astype(int).sort_values(by=['confirmed', 'deaths'], ascending=False)
# filt_out = (xs['confirmed']< 1000) #gathering countries with covid < 1000 cases
# add_sum = xs.loc[filt_out].sum() # their sum
# xs = xs.loc[- filt_out] # removing them
# add_sum.name= 'Others' # giving the series a name (index)
# xs.append(add_sum)
# xs.head(1000)

In [22]:
# # dates (having month of date index)
# vic_covid = aus_covid_disagg.loc['Victoria']
# vic_covid['Date'] = vic_covid.index
# vic_covid['Month'] = vic_covid['Date'].dt.month
# aus_covid = aus_covid_disagg.groupby(level='Date')['confirmed', 'deaths', 'recovered'].sum()
# vic_covid = vic_covid.reindex(columns = aus_covid.columns)

# aus_covid = aus_covid.rename({col: 'nat_' + col for col in aus_covid.columns}, axis=1)
# df_vic_aus = pd.merge(vic_covid, aus_covid, how="outer", left_index=True, right_index=True)
# df_vic_aus[['confirmed', 'nat_confirmed']].resample('W').sum().plot(kind='bar', figsize=(15,10));

In [23]:
# # bad dates (from Mukti)
# import pandas as pd
# import numpy as np
# df = pd.read_excel(r'C:\Users\khalil\OneDrive - Deakin University\shared_folder\PhD Project\Programming\extra\bad_dates.xlsx')
# df[['day', 'month', 'year']] = df['TnC'].str.split('.', expand=True).astype(int)
# df['year'] = np.where(df['year'] <21, df['year'] + 2000, df['year'])
# df['year'] = np.where(df['year'] <99, df['year'] + 1900, df['year'])
# df['year'] = np.where(df['year'] <999, df['year'] + 1000, df['year'])
# df['date'] = pd.to_datetime(df[['day', 'month', 'year']].astype(str), dayfirst=True)
# df.head()

In [24]:
# parsing date from components
# df = pd.read_csv('temp.csv', parse_dates=[['day', 'month', 'year']], dayfirst=True, usecols= lambda x: x not in ['DD'], index_col=0)
# df.head()