In [8]:
import csv
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from datetime import date 
import os
from os import listdir
from os.path import isfile, join

In [2]:
url_xls = "https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide"
url_csv = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
url_countries_csv = "https://raw.githubusercontent.com/dbouquin/IS_608/master/NanosatDB_munging/Countries-Continents.csv"

In [3]:
page = requests.get(url_xls).text

In [4]:
soup = BeautifulSoup(page, 'html.parser')

In [5]:
data_attribute = soup.find_all("a", attrs={"data-toggle": "tooltip"})

In [6]:
for link in data_attribute:
    data_link = link.get('href')

In [21]:
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(os.path.abspath(''))))
timeseries_combined_data_path = os.path.join(BASE_DIR, 'covid/data/timeseries/daily_combined')
timeseries_split_data_path = os.path.join(BASE_DIR, 'covid/data/timeseries/daily_split')

timeseries_combined_files = [f for f in listdir(timeseries_combined_data_path) if isfile(join(timeseries_combined_data_path, f))]
timeseries_split_files = [f for f in listdir(timeseries_split_data_path) if isfile(join(timeseries_split_data_path, f))]
latest_data_file = max(timeseries_combined_files)
latest_data_date = str(latest_data_file)[0:10]

today = str(date.today())

In [36]:
#Create dataset paths
combined_path=str(timeseries_combined_data_path) + '/' + str(latest_data_date) + '-combined.csv'
confirmed_split_path=str(timeseries_split_data_path) + '/' + str(latest_data_date) + 'confirmed.csv'
deaths_split_path=str(timeseries_split_data_path) + '/' + str(latest_data_date) + 'deaths.csv'
recovered_split_path=str(timeseries_split_data_path) + '/' + str(latest_data_date) + 'recovered.csv'
print(combined_path, confirmed_split_path, deaths_split_path, recovered_split_path)

/home/bking/Projects/poetry/covid/covid/data/timeseries/daily_combined/2020-04-07-combined.csv /home/bking/Projects/poetry/covid/covid/data/timeseries/daily_split/2020-04-07confirmed.csv /home/bking/Projects/poetry/covid/covid/data/timeseries/daily_split/2020-04-07deaths.csv /home/bking/Projects/poetry/covid/covid/data/timeseries/daily_split/2020-04-07recovered.csv


In [25]:
timeseries_split_data_path

'/home/bking/Projects/poetry/covid/covid/data/timeseries/daily_split'

In [20]:
print(timeseries_combined_files)
print(max(timeseries_combined_files))

['2020-04-07-combined.csv', '2020-03-24-combined.csv']
2020-04-07-combined.csv


In [74]:
class CovidDataset:
    
    ### INITIALIZE ###
    def __init__(self):
        # Create dataset paths and get latest data
        self.BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(os.path.abspath(''))))
        self.reference_data_path = os.path.join(BASE_DIR, 'covid/data/reference')
        self.timeseries_combined_data_path = os.path.join(BASE_DIR, 'covid/data/timeseries/daily_combined')
        self.timeseries_split_data_path = os.path.join(BASE_DIR, 'covid/data/timeseries/daily_split')
        self.timeseries_combined_files = [f for f in listdir(timeseries_combined_data_path) if isfile(join(timeseries_combined_data_path, f))]
        self.timeseries_split_files = [f for f in listdir(timeseries_split_data_path) if isfile(join(timeseries_split_data_path, f))]
        latest_data_file = max(timeseries_combined_files)
        self.latest_data_date = str(latest_data_file)[0:10]
        
        # Compile dictionary of data sources
        self.sources = {
            "confirmed": {
                "url": "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv",
                "local_path": str(self.timeseries_split_data_path) + '/' + str(self.latest_data_date) + 'confirmed.csv'
            },
            "recovered": {
                "url": "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv",
                "local_path": str(self.timeseries_split_data_path) + '/' + str(self.latest_data_date) + 'recovered.csv'    
            },
            "deaths": {
                "url": "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv",
                "local_path": str(self.timeseries_split_data_path) + '/' + str(self.latest_data_date) + 'deaths.csv'
            },
            "combined": {
                "local_path": str(timeseries_combined_data_path) + '/' + str(latest_data_date) + '-combined.csv'
            },
        }
        self.confirmed_data_raw = {
            'new': self.getNewData('confirmed'),
            'old': self.getOldData('confirmed')
            }
        self.recovered_data_raw = {
            'new': self.getNewData('recovered'),
            'old': self.getOldData('recovered')
            }
        self.deaths_data_raw = {
            'new': self.getNewData('deaths'),
            'old': self.getOldData('deaths')
            }
        self.full_dataset_raw = [self.confirmed_data_raw, self.recovered_data_raw, self.deaths_data_raw]

        self.reference_data = self.loadReferenceData()
        self.needs_reference_data_refresh = False
        self.needs_timeseries_data_refresh = False
    
    ### CREATE DATASETS (REFERENCE AND TIMESERIES) ###
    
    def createNewReferenceData(self):
        # Continents reference data
        continents_df = []
        continents = ["Africa", "Antarctica", "Asia", "Europe", "North America", "Oceania", "South America", "None"]
        continents_df = pd.DataFrame(continents)
        continent_ids = list(range(0, len(continents)))
        continent_id_column_name = 'continent_id'
        continents_df[continent_id_column_name] = continent_ids
        continents_df.columns = ['name', 'continent_id']
        # Countries-continents reference data
        base_path = self.reference_data_path
        countries_continents_path = base_path + "/country_continent.csv"
        countries_continents_df=pd.read_csv(countries_continents_path, index_col=0)
        # Types reference data
        types_df = pd.DataFrame(["Confirmed", "Recovered", "Deaths"])
        type_ids = list(range(0, len(types_df)))
        type_id_column_name = 'type_id'
        types_df[type_id_column_name] = type_ids
        types_df.columns = ['name', 'type_id']
        # Province/State reference data
        province_state_df = self.deduplicate("Province/State", by_column_range=True, up_to_column=4)
        # Country reference data
        country_df = self.deduplicate("Country/Region", by_column_range=True, up_to_column=4, summarize=True, summary_column='Country/Region')
        province_state_df_merged = pd.merge(province_state_df, country_df, how='inner', on='Country/Region', left_index=False, right_index=False, sort=False,
         suffixes=('_x', '_y'), copy=True, indicator=False, validate=None)
        province_state_df_merged = province_state_df_merged.drop('Country/Region', axis=1).drop('Lat_y', axis=1).drop('Long_y', axis=1)
        province_state_df_merged.columns = ['name', 'lat', 'long', 'province_state_id', 'country_id']
        country_df_merged = pd.merge(country_df, countries_continents_df, how='inner', on='Country/Region', left_index=False, right_index=False, sort=False,
         suffixes=('_x', '_y'), copy=True, indicator=False, validate=None)
        country_df_merged = country_df_merged.drop('Continent', axis=1)
        country_df_merged.columns = ['name', 'lat', 'long', 'country_id', 'continent_id']
        # Combine all reference data into a list
        reference_data_list = [continents_df, countries_continents_df, country_df_merged, province_state_df_merged, types_df]
        keys = ['continent', 'country_continent','country', 'province_state', 'types']
        values = reference_data_list
        reference_data_dict = dict(zip(keys, values))
        self.new_reference_data = reference_data_dict
        return reference_data_dict
    
    def loadReferenceData(self):
        base_path = self.reference_data_path
        continent_path = base_path + "/continent.csv"
        country_path = base_path + "/country.csv"
        province_state_path = base_path + "/province_state.csv"
        countries_continents_path = base_path + "/country_continent.csv"
        types_path = base_path + "/types.csv"
        reference_data_paths = [continent_path, countries_continents_path, country_path, province_state_path, types_path]
        reference_data_list = []
        for path in reference_data_paths:
            df = pd.read_csv(path, index_col=0)
            reference_data_list.append(df)
        keys = ['continent', 'country_continent','country', 'province_state', 'types']
        values = reference_data_list
        reference_data_dict = dict(zip(keys, values))
        return reference_data_dict
    
    ### ACTIONS ###
    def getNewData(self, data_type):
        """Input url of csv, returns dataframe"""
        df = pd.read_csv(self.sources[str(data_type)]['url'])
        return df

    def getOldData(self, data_type):
        """Input filepath of csv, returns dataframe"""
        df = pd.read_csv(self.sources[str(data_type)]['local_path'])
        return df
    
    def deduplicate(self, column_name: str, by_column_range=False, up_to_column=4, summarize=False, summary_column='test'):
        item_list = []
        if by_column_range==True and summarize==False:
            for dataset in self.full_dataset_raw:
                items = pd.DataFrame(dataset['new'].iloc[:,0:up_to_column].drop_duplicates(keep='first').dropna().reset_index().drop(["index"],axis=1))
                item_list.append(items)
        elif by_column_range==True and summarize==True:
            for dataset in self.full_dataset_raw:
                data_by_column = pd.DataFrame(dataset['new'].iloc[:,0:up_to_column]).groupby(summary_column, as_index=False).mean()
                items = data_by_column.iloc[:,0:up_to_column].drop_duplicates(subset=summary_column, keep='first').dropna().reset_index()
                item_list.append(data_by_column)
        else:
            for dataset in self.full_dataset_raw:
                items = pd.DataFrame(dataset['new'][[column_name]].drop_duplicates(keep='first').dropna().reset_index().drop(["index"],axis=1))
                item_list.append(items)
        concatenated_df = pd.concat([item_list[0], item_list[1], item_list[2]], ignore_index=True)
        item_df = concatenated_df.drop_duplicates(subset=column_name, keep='first')
        item_ids = list(range(0, len(item_df)))
        id_column_name = column_name + '_id'
        item_df[id_column_name] = item_ids
        return item_df
    
    def saveData(self):
        timeseries_filenames = ["confirmed", "recovered", "deaths"]
        reference_filenames = ['continent', 'country_continent', 'country', 'province_state', 'types']
        base_dir_split = self.timeseries_split_data_path
        base_dir_combined = self.timeseries_combined_data_path
        base_dir_reference = self.reference_data_path
        for i, item in enumerate(self.full_dataset_cleaned_list):
            filename = base_dir_split + '/' + str(date.today()) + str(timeseries_filenames[i]) + ".csv" 
            item.to_csv(filename)
        filename_combined = base_dir_combined + '/' + str(date.today()) + "-combined.csv" 
        self.full_dataset_cleaned_combined.to_csv(filename_combined)
        
        for i, item in enumerate(self.new_reference_data.items()):
            filename = base_dir_reference + '/' + str(reference_filenames[i]) + ".csv"
            if reference_filenames[i]=='country_continent':
                continue
            item[1].to_csv(filename, index=False)
            
    def standardizeNewData(self):
        full_dataset_cleaned = []
        for i, dataset in enumerate(self.full_dataset_raw):
            # Join country_id, prov_state_id
            dataset_merged_country = pd.merge(dataset['new'], self.new_reference_data['country'].iloc[:,[0,3]], how='inner', on=None, left_on='Country/Region', right_on='name', sort=False,
                               suffixes=('_x', '_y'), copy=False, indicator=False, validate=None)
            dataset_merged_province = pd.merge(dataset_merged_country, self.new_reference_data['province_state'].iloc[:,[0,3]], how='left', on=None, left_on='Province/State', right_on='name', sort=False,
                               suffixes=('_x', '_y'), copy=False, indicator=False, validate=None)
            # Drop Lat, Long, Province/State, Country/Region
            cases = dataset_merged_province.drop('Country/Region', axis=1).drop("name_x", axis=1).drop("name_y", axis=1).drop("Lat", axis=1).drop("Long", axis=1).drop("Province/State", axis=1)
            # Melt dataframe using date columns as rows
            cases_melt = pd.melt(cases, id_vars=['country_id', 'province_state_id'], value_vars=['1/22/20'])
            # Join all melted time series columns
            for j, column in enumerate(cases.iloc[:,1:-2]):
                melted_df = pd.melt(cases, id_vars=['country_id', 'province_state_id'], value_vars=[column])
                cases_melt = cases_melt.append(melted_df)
            cases_melt.columns = ['country_id', 'province_state_id', 'date', 'count']
            cases_melt['date'] = pd.to_datetime(cases_melt['date'],infer_datetime_format=True)
            cases_melt["country_id"] = pd.to_numeric(cases_melt["country_id"], downcast='integer')
            cases_melt["province_state_id"] = pd.to_numeric(cases_melt["province_state_id"], downcast='integer')
            cases_melt["count"] = pd.to_numeric(cases_melt["count"], downcast='integer')
            cases_melt['case_type'] = i
            full_dataset_cleaned.append(cases_melt)
        self.full_dataset_cleaned_list = full_dataset_cleaned
        full_dataset_cleaned_combined = pd.concat(full_dataset_cleaned, ignore_index=True)
        full_dataset_cleaned_combined['province_state_id'] = full_dataset_cleaned_combined['province_state_id'].astype('Int64')
        full_dataset_cleaned_combined['country_id'] = full_dataset_cleaned_combined['country_id'].astype('Int64')
        full_dataset_cleaned_combined['count'] = full_dataset_cleaned_combined['count'].astype('Int64')
        self.full_dataset_cleaned_combined = full_dataset_cleaned_combined

In [75]:
# Start JHU data prep
# self = covid_data
covid_data = CovidDataset()

In [76]:
covid_data.full_dataset_raw[1]['new']

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,4/4/20,4/5/20,4/6/20,4/7/20,4/8/20,4/9/20,4/10/20,4/11/20,4/12/20,4/13/20
0,,Afghanistan,33.000000,65.000000,0,0,0,0,0,0,...,10,15,18,18,29,32,32,32,32,32
1,,Albania,41.153300,20.168300,0,0,0,0,0,0,...,99,104,116,131,154,165,182,197,217,232
2,,Algeria,28.033900,1.659600,0,0,0,0,0,0,...,90,90,90,113,237,347,405,460,591,601
3,,Andorra,42.506300,1.521800,0,0,0,0,0,0,...,21,26,31,39,52,58,71,71,128,128
4,,Angola,-11.202700,17.873900,0,0,0,0,0,0,...,2,2,2,2,2,2,2,4,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,Saint Pierre and Miquelon,France,46.885200,-56.315900,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
246,,South Sudan,6.877000,31.307000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
247,,Western Sahara,24.215500,-12.885800,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
248,,Sao Tome and Principe,0.186360,6.613081,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [77]:
new_ref_data_dict = covid_data.createNewReferenceData()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [78]:
new_ref_data_dict['country']

Unnamed: 0,name,lat,long,country_id,continent_id
0,Afghanistan,33.000000,65.000000,0,2
1,Albania,41.153300,20.168300,1,3
2,Algeria,28.033900,1.659600,2,0
3,Andorra,42.506300,1.521800,3,3
4,Angola,-11.202700,17.873900,4,0
...,...,...,...,...,...
174,Venezuela,6.423800,-66.589700,178,6
175,Vietnam,16.000000,108.000000,179,2
176,Yemen,15.552727,48.516388,182,2
177,Zambia,-15.416700,28.283300,183,0


In [79]:
covid_data.standardizeNewData()

In [80]:
covid_data.saveData()

In [256]:
covid_data.full_dataset_cleaned_combined

Unnamed: 0,country_id,province_state_id,date,count,case_type
0,0,,2020-01-22,0,0
1,1,,2020-01-22,0,0
2,2,,2020-01-22,0,0
3,3,,2020-01-22,0,0
4,4,,2020-01-22,0,0
...,...,...,...,...,...
57527,28,,2020-04-06,0,2
57528,149,,2020-04-06,0,2
57529,104,,2020-04-06,0,2
57530,155,,2020-04-06,0,2


In [12]:
country_continent = covid_data.reference_data_dict['country_continent']

In [18]:
country_continent.columns

Index(['Continent', 'Country/Region', 'continent_id', 'Unnamed: 4'], dtype='object')

In [19]:
country_continent = country_continent.drop('Unnamed: 4', axis=1)

In [20]:
country_continent.to_csv("data/reference/country_continent.csv")

In [73]:
old_ref_data_dict['province_state']

Unnamed: 0,Province/State
0,No province/state
1,British Columbia
2,New South Wales
3,Victoria
4,Queensland
...,...
319,New Caledonia
320,Bermuda
321,Sint Maarten
322,Isle of Man


In [68]:
for i, key in enumerate(new_ref_data_dict):
    print(new_ref_data_dict[key])
    print(old_ref_data_dict[key])
    print("________________________")
#     comparison = new_ref_data_dict[key]==old_ref_data_dict[key]
#     if not comparison.all():
#         print("Reference data changed")
#     print(i)

# timeseries_comparison = df.columns[4:]==pd.Index(['Province/State', 'Country/Region', 'Lat', 'Long'])
#         if not location_comparison.all():
#             print("Location columns have changed")

               0
0         Africa
1     Antarctica
2           Asia
3         Europe
4  North America
5        Oceania
6  South America
7           None
               0
0         Africa
1     Antarctica
2           Asia
3         Europe
4  North America
5        Oceania
6  South America
7           None
________________________
         Continent                Country  continent_id
0           Africa                Algeria             0
1           Africa                 Angola             0
2           Africa                  Benin             0
3           Africa               Botswana             0
4           Africa                Burkina             0
..             ...                    ...           ...
218         Europe                 Jersey             3
219  North America            Puerto Rico             4
220         Africa  Republic of the Congo             0
221  North America            The Bahamas             4
222         Africa             The Gambia            

In [64]:
for key in old_ref_data_dict:
    print(key)

continent
country_continent
country
province_state
types


In [30]:
print(province_state_df.columns)
print(country_df.columns)

Index(['Province/State', 'Country/Region', 'province_id'], dtype='object')
Index(['Country/Region', 'country_id'], dtype='object')


In [31]:
merged_df = pd.merge(province_state_df, country_df, how='inner', on='Country/Region', left_index=False, right_index=False, sort=True,
         suffixes=('_x', '_y'), copy=True, indicator=False,
         validate=None)

In [32]:
merged_df

Unnamed: 0,Province/State,Country/Region,province_id,country_id
0,Australian Capital Territory,Australia,0,8
1,New South Wales,Australia,1,8
2,Northern Territory,Australia,2,8
3,Queensland,Australia,3,8
4,South Australia,Australia,4,8
...,...,...,...,...
74,Isle of Man,United Kingdom,70,154
75,Montserrat,United Kingdom,71,154
76,Anguilla,United Kingdom,76,154
77,British Virgin Islands,United Kingdom,77,154


In [113]:
data_by_country = df_github.groupby("Country/Region").mean()

In [114]:
comparison = df_github.columns[0:4]==pd.Index(['Province/State', 'Country/Region', 'Lat', 'Long'])
comparison.all()

True

In [49]:
provstate_country = pd.DataFrame(dataset.iloc[:, 0:4].drop_duplicates(keep='first').dropna().reset_index().drop(["index"],axis=1))

In [50]:
provstate_country.index[prov_state.iloc[:,0]=='Northwest Territories'].tolist()

[74]

In [51]:
provstate_country

Unnamed: 0,Province/State,Country/Region,Lat,Long
0,Australian Capital Territory,Australia,-35.4735,149.0124
1,New South Wales,Australia,-33.8688,151.2093
2,Northern Territory,Australia,-12.4634,130.8456
3,Queensland,Australia,-28.0167,153.4000
4,South Australia,Australia,-34.9285,138.6007
...,...,...,...,...
71,Montserrat,United Kingdom,16.7425,-62.1874
72,Diamond Princess,Canada,0.0000,0.0000
73,Recovered,Canada,0.0000,0.0000
74,Northwest Territories,Canada,64.8255,-124.8457


In [14]:
dataset_countries = dataset.iloc[:, 0:4]

In [15]:
dataset_countries

Unnamed: 0,Province/State,Country/Region,Lat,Long
0,,Afghanistan,33.000000,65.000000
1,,Albania,41.153300,20.168300
2,,Algeria,28.033900,1.659600
3,,Andorra,42.506300,1.521800
4,,Angola,-11.202700,17.873900
...,...,...,...,...
243,,Mali,17.570692,-3.996166
244,,Saint Kitts and Nevis,17.357822,-62.782998
245,Northwest Territories,Canada,64.825500,-124.845700
246,Yukon,Canada,64.282300,-135.000000


In [118]:
country_df = df_github_countries.iloc[:, 1:4].drop_duplicates(subset=['Country/Region'],keep='last',ignore_index=False)

In [120]:
country_df

Unnamed: 0,Country/Region,Lat,Long
0,Thailand,15.0000,101.0000
1,Japan,36.0000,138.0000
2,Singapore,1.2833,103.8333
3,Nepal,28.1667,84.2500
4,Malaysia,2.5000,112.5000
...,...,...,...
496,Jersey,49.1900,-2.1100
497,Puerto Rico,18.2000,-66.5000
498,Republic of the Congo,-1.4400,15.5560
499,The Bahamas,24.2500,-76.0000


In [119]:
len(country_df)

183

In [19]:
country_df.to_csv("data/reference/country.csv")

In [68]:
continents = ["Africa", "Antarctica", "Asia", "Europe", "North America", "Oceania", "South America", "None"]

In [69]:
continents_df = pd.DataFrame(continents)

In [83]:
continents_df

Unnamed: 0,0
0,Africa
1,Antarctica
2,Asia
3,Europe
4,North America
5,Oceania
6,South America
7,


In [85]:
continents_df.iloc[0,0]

'Africa'

In [263]:
continents_df.to_csv("data/reference/continent.csv")

In [4]:
df_github

NameError: name 'df_github' is not defined

In [205]:
prov_state = df_github.iloc[:, 0:2].drop_duplicates(keep='first').dropna().reset_index().drop(["index"],axis=1)

In [206]:
prov_state

Unnamed: 0,Province/State,Country/Region
0,British Columbia,Canada
1,New South Wales,Australia
2,Victoria,Australia
3,Queensland,Australia
4,South Australia,Australia
...,...,...
322,Sint Maarten,Netherlands
323,Isle of Man,United Kingdom
324,Northwest Territories,Canada
325,United States Virgin Islands,US


In [207]:
prov_state.to_csv("data/reference/province_state.csv")

In [123]:
import math
region_codes = []
for region in df_github_countries.iloc[:,0]:
    if isinstance(region, str):
        pass
    elif np.isnan(region):
        region_codes.append(0)
        continue
    for i, item in enumerate(prov_state.iloc[:,0]):
        if item == region:
            region_codes.append(i)

print(len(region_codes))

501


In [124]:
prov_state.iloc[:,0]

0                               NaN
5                  British Columbia
6                   New South Wales
7                          Victoria
8                        Queensland
                   ...             
472                    Sint Maarten
475                     Isle of Man
477           Northwest Territories
491    United States Virgin Islands
492                              US
Name: Province/State, Length: 326, dtype: object

In [125]:
df_github_countries["region_codes"] = region_codes

In [126]:
df_github_countries

Unnamed: 0,Province/State,Country/Region,Lat,Long,region_codes
0,,Thailand,15.0000,101.0000,0
1,,Japan,36.0000,138.0000,0
2,,Singapore,1.2833,103.8333,0
3,,Nepal,28.1667,84.2500,0
4,,Malaysia,2.5000,112.5000,0
...,...,...,...,...,...
496,,Jersey,49.1900,-2.1100,0
497,,Puerto Rico,18.2000,-66.5000,0
498,,Republic of the Congo,-1.4400,15.5560,0
499,,The Bahamas,24.2500,-76.0000,0


In [7]:
df_countries_continents = pd.read_csv(url_countries_csv)
df_countries = pd.read_csv(url_csv)

In [8]:
new_country_data = {
    "Continent": [
        "Europe",
        "North America",
        "Africa",
        "Europe",
        "None",
        "Europe",
        "Asia",
        "Asia",
        "Africa",
        "Africa",
        "Africa",
        "Europe",
        "Africa",
        "Africa",
        "North America",
        "Africa",
        "Asia",
        "North America",
        "Africa",
        "South America",
        "Africa",
        "North America",
        "Oceania",
        "Europe",
        "Europe",
        "North America",
        "Africa",
        "North America",
        "Africa",
                 ],
    "Country": [
        "North Macedonia",
        "Martinique",
        "Burkina Faso",
        "Holy See",
        "Cruise Ship",
        "Czechia",
        "Taiwan*",
        "Russia",
        "Congo (Kinshasa)",
        "Cote d'Ivoire",
        "Eswatini",
        "Kosovo",
        "Congo (Brazzaville)",
        "Gambia, The",
        "Bahamas, The",
        "Cabo Verde",
        "Timor-Leste",
        "Guadeloupe",
        "Reunion",
        "French Guiana",
        "Mayotte",
        "Greenland",
        "Guam",
        "Guernsey",
        "Jersey",
        "Puerto Rico",
        "Republic of the Congo",
        "The Bahamas",
        "The Gambia"
    ]
}
new_country_data_df = pd.DataFrame(new_country_data)

In [9]:
df_countries_continents=df_countries_continents.append(new_country_data_df, ignore_index=True)

In [10]:
df_countries_continents.to_csv("data/reference/countries_continents.csv")

In [76]:
df_countries_continents

Unnamed: 0,Continent,Country
0,Africa,Algeria
1,Africa,Angola
2,Africa,Benin
3,Africa,Botswana
4,Africa,Burkina
...,...,...
218,Europe,Jersey
219,North America,Puerto Rico
220,Africa,Republic of the Congo
221,North America,The Bahamas


In [127]:
i=1
continent_id_list = []
for line in df_countries_continents.iterrows():
    for j, continent in enumerate(continents_df.iterrows()):
        if line[1]['Continent']==continent[1][0]:
#             print("MATCH")
            continent_id_list.append(j)

In [128]:
df_countries_continents['continent_id'] = continent_id_list

In [129]:
df_countries_continents

Unnamed: 0,Continent,Country,continent_id
0,Africa,Algeria,0
1,Africa,Angola,0
2,Africa,Benin,0
3,Africa,Botswana,0
4,Africa,Burkina,0
...,...,...,...
218,Europe,Jersey,3
219,North America,Puerto Rico,4
220,Africa,Republic of the Congo,0
221,North America,The Bahamas,4


In [121]:
country_df

Unnamed: 0,Country/Region,Lat,Long
0,Thailand,15.0000,101.0000
1,Japan,36.0000,138.0000
2,Singapore,1.2833,103.8333
3,Nepal,28.1667,84.2500
4,Malaysia,2.5000,112.5000
...,...,...,...
496,Jersey,49.1900,-2.1100
497,Puerto Rico,18.2000,-66.5000
498,Republic of the Congo,-1.4400,15.5560
499,The Bahamas,24.2500,-76.0000


In [165]:
country_df = country_df.reset_index()
country_df = country_df.drop(["index"],axis=1)
country_df

Unnamed: 0,Country/Region,Lat,Long
0,Thailand,15.0000,101.0000
1,Japan,36.0000,138.0000
2,Singapore,1.2833,103.8333
3,Nepal,28.1667,84.2500
4,Malaysia,2.5000,112.5000
...,...,...,...
178,Jersey,49.1900,-2.1100
179,Puerto Rico,18.2000,-66.5000
180,Republic of the Congo,-1.4400,15.5560
181,The Bahamas,24.2500,-76.0000


In [177]:
i=1
continent_id_list = []
for line in country_df.iterrows():
    for j, continent in enumerate(df_countries_continents.iterrows()):
        if line[1]['Country/Region']==continent[1]['Country']:
#             print("MATCH")
            continent_id_list.append(continent[1]['continent_id'])

In [178]:
df_countries_continents

Unnamed: 0,Continent,Country,continent_id
0,Africa,Algeria,0
1,Africa,Angola,0
2,Africa,Benin,0
3,Africa,Botswana,0
4,Africa,Burkina,0
...,...,...,...
218,Europe,Jersey,3
219,North America,Puerto Rico,4
220,Africa,Republic of the Congo,0
221,North America,The Bahamas,4


In [179]:
country_df['continent_id'] = continent_id_list

In [180]:
country_df

Unnamed: 0,Country/Region,Lat,Long,continent_id
0,Thailand,15.0000,101.0000,2
1,Japan,36.0000,138.0000,2
2,Singapore,1.2833,103.8333,2
3,Nepal,28.1667,84.2500,2
4,Malaysia,2.5000,112.5000,2
...,...,...,...,...
178,Jersey,49.1900,-2.1100,3
179,Puerto Rico,18.2000,-66.5000,4
180,Republic of the Congo,-1.4400,15.5560,0
181,The Bahamas,24.2500,-76.0000,4


In [181]:
country_df.to_csv("data/reference/country.csv")

In [174]:
for j, continent in enumerate(df_countries_continents.iterrows()):
    print(continent[1]['Country'])
    print(line[1]['Country/Region'])
    if line[1]['Country/Region']==continent[1]['Country']:
        print("MATCH")

Algeria
Thailand
Angola
Thailand
Benin
Thailand
Botswana
Thailand
Burkina
Thailand
Burundi
Thailand
Cameroon
Thailand
Cape Verde
Thailand
Central African Republic
Thailand
Chad
Thailand
Comoros
Thailand
Congo
Thailand
Congo, Democratic Republic of
Thailand
Djibouti
Thailand
Egypt
Thailand
Equatorial Guinea
Thailand
Eritrea
Thailand
Ethiopia
Thailand
Gabon
Thailand
Gambia
Thailand
Ghana
Thailand
Guinea
Thailand
Guinea-Bissau
Thailand
Ivory Coast
Thailand
Kenya
Thailand
Lesotho
Thailand
Liberia
Thailand
Libya
Thailand
Madagascar
Thailand
Malawi
Thailand
Mali
Thailand
Mauritania
Thailand
Mauritius
Thailand
Morocco
Thailand
Mozambique
Thailand
Namibia
Thailand
Niger
Thailand
Nigeria
Thailand
Rwanda
Thailand
Sao Tome and Principe
Thailand
Senegal
Thailand
Seychelles
Thailand
Sierra Leone
Thailand
Somalia
Thailand
South Africa
Thailand
South Sudan
Thailand
Sudan
Thailand
Swaziland
Thailand
Tanzania
Thailand
Togo
Thailand
Tunisia
Thailand
Uganda
Thailand
Zambia
Thailand
Zimbabwe
Thailand
Afgh

In [166]:
df_countries_continents

Unnamed: 0,Continent,Country,continent_id
0,Africa,Algeria,0
1,Africa,Angola,0
2,Africa,Benin,0
3,Africa,Botswana,0
4,Africa,Burkina,0
...,...,...,...
218,Europe,Jersey,3
219,North America,Puerto Rico,4
220,Africa,Republic of the Congo,0
221,North America,The Bahamas,4


In [104]:
l = 1
for line in df_countries_continents.iterrows():
    print(line[1]['Continent'])
    l+=1
    if l==3:
        break

Africa
Africa


In [131]:
continent_codes = []
for item in df_github_countries.iloc[:,1]:
    old_size = len(continent_codes)
    for country in df_countries_continents.iterrows():
        if item==country[1][1]:
            continent_codes.append(country[1][0])
            
    new_size = len(continent_codes)
    if old_size==new_size:
        continent_codes.append("NA")

In [132]:
df_github_countries['continent'] = continent_codes

In [133]:
for i, line in enumerate(df_github_countries.iterrows()):
    if line[1][5]=="NA":
        print(line)

In [134]:
df_github_countries

Unnamed: 0,Province/State,Country/Region,Lat,Long,region_codes,continent
0,,Thailand,15.0000,101.0000,0,Asia
1,,Japan,36.0000,138.0000,0,Asia
2,,Singapore,1.2833,103.8333,0,Asia
3,,Nepal,28.1667,84.2500,0,Asia
4,,Malaysia,2.5000,112.5000,0,Asia
...,...,...,...,...,...,...
496,,Jersey,49.1900,-2.1100,0,Europe
497,,Puerto Rico,18.2000,-66.5000,0,North America
498,,Republic of the Congo,-1.4400,15.5560,0,Africa
499,,The Bahamas,24.2500,-76.0000,0,North America


In [135]:
import math
continent_codes = []
for continent in df_github_countries.iloc[:,-1]:
    if isinstance(continent, str):
        pass
    elif np.isnan(continent):
        print("WHAT")
    for i, item in enumerate(continents_df.iloc[:,0]):
        if item == continent:
            continent_codes.append(i)

print(len(continent_codes))

501


In [137]:
continents_df_csv = pd.read_csv("data/reference/continent.csv", index_col=0)
continents_df_csv

Unnamed: 0,0
0,Africa
1,Antarctica
2,Asia
3,Europe
4,North America
5,Oceania
6,South America
7,


In [136]:
df_github_countries['continent_codes'] = continent_codes

In [160]:
df_github_countries

Unnamed: 0,Province/State,Country/Region,Lat,Long,region_codes,continent,continent_codes
0,,Thailand,15.0000,101.0000,0,Asia,2
1,,Japan,36.0000,138.0000,0,Asia,2
2,,Singapore,1.2833,103.8333,0,Asia,2
3,,Nepal,28.1667,84.2500,0,Asia,2
4,,Malaysia,2.5000,112.5000,0,Asia,2
...,...,...,...,...,...,...,...
496,,Jersey,49.1900,-2.1100,0,Europe,3
497,,Puerto Rico,18.2000,-66.5000,0,North America,4
498,,Republic of the Congo,-1.4400,15.5560,0,Africa,0
499,,The Bahamas,24.2500,-76.0000,0,North America,4


In [138]:
cases = df_github_countries.drop("continent", axis=1).drop("Lat", axis=1).drop("Long", axis=1).drop("Province/State", axis=1)

In [139]:
cases

Unnamed: 0,Country/Region,region_codes,continent_codes
0,Thailand,0,2
1,Japan,0,2
2,Singapore,0,2
3,Nepal,0,2
4,Malaysia,0,2
...,...,...,...
496,Jersey,0,3
497,Puerto Rico,0,4
498,Republic of the Congo,0,0
499,The Bahamas,0,4


In [163]:
df_github_countries_nodup = df_github_countries.loc[:].drop_duplicates(keep='first')

In [164]:
df_github_countries_nodup

Unnamed: 0,Province/State,Country/Region,Lat,Long,region_codes,continent,continent_codes
0,,Thailand,15.0000,101.0000,0,Asia,2
1,,Japan,36.0000,138.0000,0,Asia,2
2,,Singapore,1.2833,103.8333,0,Asia,2
3,,Nepal,28.1667,84.2500,0,Asia,2
4,,Malaysia,2.5000,112.5000,0,Asia,2
...,...,...,...,...,...,...,...
496,,Jersey,49.1900,-2.1100,0,Europe,3
497,,Puerto Rico,18.2000,-66.5000,0,North America,4
498,,Republic of the Congo,-1.4400,15.5560,0,Africa,0
499,,The Bahamas,24.2500,-76.0000,0,North America,4


In [289]:
import math
country_codes = []
country_df = pd.read_csv("data/reference/country.csv", index_col=0)

for case in df_github_countries.iloc[:,1]:
    for i, item in enumerate(country_df.iloc[:,0]):
        if item == case:
            country_codes.append(i)
            
print(len(country_codes))

501


In [290]:
country_df

Unnamed: 0,Country/Region,Lat,Long
0,Thailand,15.0000,101.0000
1,Japan,36.0000,138.0000
2,Singapore,1.2833,103.8333
3,Nepal,28.1667,84.2500
4,Malaysia,2.5000,112.5000
...,...,...,...
496,Jersey,49.1900,-2.1100
497,Puerto Rico,18.2000,-66.5000
498,Republic of the Congo,-1.4400,15.5560
499,The Bahamas,24.2500,-76.0000


In [291]:
cases['country_code'] = country_codes

In [292]:
cases = cases.drop('Country/Region', axis=1)

In [293]:
cases

Unnamed: 0,region_codes,continent_codes,country_code
0,322,2,0
1,322,2,1
2,322,2,2
3,322,2,3
4,322,2,4
...,...,...,...
496,322,3,178
497,322,4,179
498,322,0,180
499,322,4,181


In [294]:
cases_full = pd.concat([cases, df_github.iloc[:,4:]], axis=1)

In [295]:
cases_full

Unnamed: 0,region_codes,continent_codes,country_code,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,...,3/14/20,3/15/20,3/16/20,3/17/20,3/18/20,3/19/20,3/20/20,3/21/20,3/22/20,3/23/20
0,322,2,0,2,3,5,7,8,8,14,...,82,114,147,177,212,272,322,411,599,599.0
1,322,2,1,2,1,2,2,4,4,7,...,773,839,825,878,889,924,963,1007,1086,1086.0
2,322,2,2,0,1,3,3,4,5,7,...,212,226,243,266,313,345,385,432,455,455.0
3,322,2,3,0,0,0,1,1,1,1,...,1,1,1,1,1,1,1,1,2,2.0
4,322,2,4,0,0,0,3,4,4,4,...,238,428,566,673,790,900,1030,1183,1306,1306.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496,322,3,178,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
497,322,4,179,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
498,322,0,180,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
499,322,4,181,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0


In [296]:
cases_full_melt = pd.melt(cases_full, id_vars=['region_codes', 'continent_codes', 'country_code'], value_vars=['1/22/20'])

In [297]:
cases_full_melt

Unnamed: 0,region_codes,continent_codes,country_code,variable,value
0,322,2,0,1/22/20,2
1,322,2,1,1/22/20,2
2,322,2,2,1/22/20,0
3,322,2,3,1/22/20,0
4,322,2,4,1/22/20,0
...,...,...,...,...,...
496,322,3,178,1/22/20,0
497,322,4,179,1/22/20,0
498,322,0,180,1/22/20,0
499,322,4,181,1/22/20,0


In [298]:
cases_full.shape

(501, 65)

In [299]:
for i, column in enumerate(cases_full.iloc[:,4:]):
    melted_df = pd.melt(cases_full, id_vars=['region_codes', 'continent_codes', 'country_code'], value_vars=[column])
    cases_full_melt = cases_full_melt.append(melted_df)

In [300]:
cases_full_melt.shape

(31062, 5)

In [92]:
cases_full_melt = cases_full_melt.reset_index()

In [93]:
cases_full_melt = cases_full_melt.drop("index", axis=1)

In [95]:
cases_full_melt.head()

Unnamed: 0,region_codes,continent_codes,country_code,variable,value
0,322,2,0,1/22/20,2.0
1,322,2,1,1/22/20,2.0
2,322,2,2,1/22/20,0.0
3,322,2,3,1/22/20,0.0
4,322,2,4,1/22/20,0.0


In [96]:
cases_full_melt['variable'] = pd.to_datetime(cases_full_melt['variable'],infer_datetime_format=True)

In [98]:
cases_full_melt.head()

Unnamed: 0,region_codes,continent_codes,country_code,variable,value
0,322,2,0,2020-01-22,2.0
1,322,2,1,2020-01-22,2.0
2,322,2,2,2020-01-22,0.0
3,322,2,3,2020-01-22,0.0
4,322,2,4,2020-01-22,0.0


In [99]:
cases_full_melt.to_csv('data/timeseries/cases_confirmed.csv')

In [1]:
sources = {
    "confirmed": {
        "url": 
    },
    
}

In [2]:
sources['type']

'confirmed'

In [27]:
sources = {
    "confirmed": {
        "url": "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv",
        "local_path": "data/cases_confirmed.csv"
    },
    "recovered": {
        "url": "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Recovered.csv",
        "local_path": "data/cases_recovered.csv"    
    },
    "deaths": {
        "url": "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Deaths.csv",
        "local_path": "data/cases_deaths.csv"
    },
}

In [38]:
for key, value in sources.items():
    print(value['url'])

https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv
https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Recovered.csv
https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Deaths.csv


In [11]:
sources['confirmed']['url']

'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv'

In [15]:
import pandas as pd

In [22]:
def checkDataChange(data_type):
    old_data = pd.read_csv(sources[data_type]['local_path'])
    old_size = old_data.size
    
    new_data = pd.read_csv(sources[data_type]['url'])
    new_size = new_data.size
    if old_size!=new_size:
        print("Data size mismatch: Old: ",  str(old_size), "; New: ", str(new_size))

In [23]:
checkDataChange('confirmed')

Data size mismatch: Old:  178242 ; New:  33066


In [3]:
class CovidDataset:
    def __init__(self):
        self.sources = {
            "confirmed": {
                "url": "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv",
                "local_path": "data/timeseries/cases_confirmed.csv"
            },
            "recovered": {
                "url": "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv",
                "local_path": "data/timeseries/cases_confirmed.csv"    
            },
            "deaths": {
                "url": "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv",
                "local_path": "data/timeseries/cases_confirmed.csv"
            },
        }
        self.confirmed_data_raw = {
            'new': self.getNewData('confirmed'),
            'old': self.getOldData('confirmed')
            }
        self.recovered_data_raw = {
            'new': self.getNewData('recovered'),
            'old': self.getOldData('recovered')
            }
        self.deaths_data_raw = {
            'new': self.getNewData('deaths'),
            'old': self.getOldData('deaths')
            }
        self.full_dataset_raw = [self.confirmed_data_raw, self.recovered_data_raw, self.deaths_data_raw]

        self.old_reference_data = self.loadOldReferenceData()
        self.needs_reference_data_refresh = False
        self.needs_timeseries_data_refresh = False


    def getNewData(self, data_type):
        """Input url of csv, returns dataframe"""
        df = pd.read_csv(self.sources[str(data_type)]['url'])
        return df

    def getOldData(self, data_type):
        """Input filepath of csv, returns dataframe"""
        df = pd.read_csv(self.sources[str(data_type)]['local_path'])
        return df

    def createNewReferenceData(self):
        # Continents reference data
        continents = ["Africa", "Antarctica", "Asia", "Europe", "North America", "Oceania", "South America", "None"]
        continents_df = pd.DataFrame(continents)
        # Types reference data
        types_df = pd.DataFrame(["Confirmed", "Recovered", "Deaths"])
        # Province/State reference data
        prov_state_list = []
        for dataset in self.full_dataset_raw:
            prov_state = pd.DataFrame(dataset['new'].iloc[:, 0:2].drop_duplicates(keep='first').dropna().reset_index().drop(["index"],axis=1))
            prov_state_list.append(prov_state)
        dfps = pd.concat([prov_state_list[0], prov_state_list[1], prov_state_list[2]], ignore_index=True)
        province_state_df = dfps.drop_duplicates(keep='first')
        province_state_df.iloc[0,0] = "None"
        # Countries-continents reference data
        base_path = "data/reference/"
        countries_continents_path = base_path + "countries_continents.csv"
        countries_continents_df=pd.read_csv(countries_continents_path, index_col=0)
        # Country reference data
        base_path = "data/reference/"
        country_path = base_path + "country.csv"
        country_df=pd.read_csv(country_path, index_col=0)
        # Combine all reference data into a list
        reference_data_list = [continents_df, country_df, province_state_df, countries_continents_df, types_df]
        return reference_data_list
    
    def loadOldReferenceData(self):
        base_path = "data/reference/"
        continent_path = base_path + "continent.csv"
        country_path = base_path + "country.csv"
        province_state_path = base_path + "province_state.csv"
        countries_continents_path = base_path + "countries_continents.csv"
        types_path = base_path + "types.csv"
        reference_data_paths = [continent_path, country_path, province_state_path, countries_continents_path, types_path]
        reference_data_list = []
        for path in reference_data_paths:
            df = pd.read_csv(path, index_col=0)
            reference_data_list.append(df)
        return reference_data_list

    def standardizeNewData(self):
        full_dataset_cleaned = []
        for i, dataset in enumerate(self.full_dataset_raw):
            countries = dataset['new'].iloc[:,:4]
            timeseries = dataset['new'].iloc[:,4:]
#             print(i)
            # Create list of region codes present in the dataset and add column
            region_codes = []
            prov_state = countries.iloc[:, 0].drop_duplicates(keep='first').reset_index().drop(["index"],axis=1)
            for region in countries.iloc[:,0]:
                if isinstance(region, str):
                    pass
                elif np.isnan(region):
                    region_codes.append(0)
                    continue
                for i, item in enumerate(prov_state.iloc[:,0]):
                    if item == region:
                        region_codes.append(i)
            countries["region_codes"] = region_codes

            # Create list of continent codes and add column
            continents = []
            df_countries_continents = pd.read_csv("data/reference/countries_continents.csv", index_col=0)
            for item in countries.iloc[:,1]:
                old_size = len(continents)
                for country in df_countries_continents.iterrows():
                    if item==country[1][1]:
                        continents.append(country[1][0])
            
                new_size = len(continents)
                if old_size==new_size:
                    continents.append("NA")
            for i, line in enumerate(countries.iterrows()):
                if line[1][-1]=="NA":
                    print(line)
            countries["continent"] = continents
            
            continent_codes = []
            continents_df = pd.read_csv("data/reference/continent.csv", index_col=0)
            for continent in countries.iloc[:,-1]:
                if isinstance(continent, str):
                    pass
                elif np.isnan(continent):
                    print("Not a continent")
                for i, item in enumerate(continents_df.iloc[:,0]):
                    if item == continent:
                        continent_codes.append(i)
            countries["continent_codes"] = continent_codes

            # Create and append list of country codes
            country_codes = []
            country_df = pd.read_csv("data/reference/country.csv", index_col=0)
            for case in countries.iloc[:,1]:
                for i, item in enumerate(country_df.iloc[:,0]):
                    if item == case:
                        country_codes.append(i)
            countries['country_code'] = country_codes
            
            # Concatenate country and timeseries dataframes       
            cases = countries.drop('Country/Region', axis=1).drop("continent", axis=1).drop("Lat", axis=1).drop("Long", axis=1).drop("Province/State", axis=1)
            cases_full = pd.concat([cases, timeseries], axis=1)
            cases_full_melt = pd.melt(cases_full, id_vars=['region_codes', 'continent_codes', 'country_code'], value_vars=['1/22/20'])
            
            # Join all melted time series columns
            for i, column in enumerate(cases_full.iloc[:,4:]):
                melted_df = pd.melt(cases_full, id_vars=['region_codes', 'continent_codes', 'country_code'], value_vars=[column])
                cases_full_melt = cases_full_melt.append(melted_df)
            cases_full_melt['variable'] = pd.to_datetime(cases_full_melt['variable'],infer_datetime_format=True)
            cases_full_melt.columns = ['region_code', 'continent_code', 'country_code', 'date', 'count']
            cases_full_melt['case_type'] = i
            full_dataset_cleaned.append(cases_full_melt)
        self.full_dataset_cleaned = full_dataset_cleaned
    
    def combineCleansedData(self):
        full_dataset_combined = pd.DataFrame()
        case_types = ["Confirmed", "Recovered", "Deaths"]
        case_types_df = pd.DataFrame(case_types)
        for i, dataset in enumerate(self.full_dataset_cleaned):
            dataset['case_type'] = i
            full_dataset_combined = full_dataset_combined.append(dataset, ignore_index=True)
        self.full_dataset_combined = full_dataset_combined
    
    def saveData(self):
        filenames = ["confirmed", "recovered", "deaths"]
        base_dir_split = "data/timeseries/daily_split/"
        base_dir_combined = "data/timeseries/daily_combined/"
        for i, item in enumerate(self.full_dataset_cleaned):
            filename = base_dir_split + str(date.today()) + str(filenames[i]) + ".csv" 
            item.to_csv(filename)
        filename_combined = base_dir_combined + str(date.today()) + "-combined.csv" 
        self.full_dataset_combined.to_csv(filename_combined)

In [4]:
cases = CovidDataset()

In [5]:
ref = cases.old_reference_data

In [7]:
ref

[               0
 0         Africa
 1     Antarctica
 2           Asia
 3         Europe
 4  North America
 5        Oceania
 6  South America
 7           None,
             Country/Region      Lat      Long  continent_id
 0                 Thailand  15.0000  101.0000             2
 1                    Japan  36.0000  138.0000             2
 2                Singapore   1.2833  103.8333             2
 3                    Nepal  28.1667   84.2500             2
 4                 Malaysia   2.5000  112.5000             2
 ..                     ...      ...       ...           ...
 178                 Jersey  49.1900   -2.1100             3
 179            Puerto Rico  18.2000  -66.5000             4
 180  Republic of the Congo  -1.4400   15.5560             0
 181            The Bahamas  24.2500  -76.0000             4
 182             The Gambia  13.4667  -16.6000             0
 
 [183 rows x 4 columns],
                    Province/State  Country/Region
 0                British C

In [232]:
cases.full_dataset_raw

[{'new':     Province/State         Country/Region        Lat       Long  1/22/20  \
  0              NaN            Afghanistan  33.000000  65.000000        0   
  1              NaN                Albania  41.153300  20.168300        0   
  2              NaN                Algeria  28.033900   1.659600        0   
  3              NaN                Andorra  42.506300   1.521800        0   
  4              NaN                 Angola -11.202700  17.873900        0   
  ..             ...                    ...        ...        ...      ...   
  240            NaN                  Libya  26.335100  17.228331        0   
  241            NaN     West Bank and Gaza  31.952200  35.233200        0   
  242            NaN          Guinea-Bissau  11.803700 -15.180400        0   
  243            NaN                   Mali  17.570692  -3.996166        0   
  244            NaN  Saint Kitts and Nevis  17.357822 -62.782998        0   
  
       1/23/20  1/24/20  1/25/20  1/26/20  1/27/20  ..

In [45]:
prov_state_list = []
for dataset in cases.full_dataset_raw:
    prov_state = pd.DataFrame(dataset['new'].iloc[:, 0].drop_duplicates(keep='first'))
    prov_state_list.append(prov_state)

In [50]:
dfps = pd.concat([prov_state_list[0], prov_state_list[1], prov_state_list[2]], ignore_index=True)

In [63]:
dfps = dfps.drop_duplicates(keep='first')
value = dfps.iloc[0,0]

In [64]:
type(value)

float

In [65]:
dfps.loc[dfps['Province/State'] == np.nan]

Unnamed: 0,Province/State


In [234]:
### NEED TO FIX THIS
cases.standardizeNewData()

ValueError: Length of values does not match length of index

In [418]:
cases.combineCleansedData()

In [419]:
cases.full_dataset_combined

Unnamed: 0,region_code,continent_code,country_code,date,count,case_type
0,0,2,0,2020-01-22,2.0,0
1,0,2,1,2020-01-22,2.0,0
2,0,2,2,2020-01-22,0.0,0
3,0,2,3,2020-01-22,0.0,0
4,0,2,4,2020-01-22,0.0,0
...,...,...,...,...,...,...
93181,0,3,178,2020-03-23,0.0,2
93182,0,4,179,2020-03-23,1.0,2
93183,0,0,180,2020-03-23,0.0,2
93184,0,4,181,2020-03-23,0.0,2


In [420]:
cases.saveData()

In [396]:
cases.full_dataset_cleaned[0]['case_type'] = 0

In [397]:
cases.full_dataset_cleaned[0]

Unnamed: 0,region_code,continent_code,country_code,date,count,case_type
0,0,2,0,2020-01-22,2.0,0
1,0,2,1,2020-01-22,2.0,0
2,0,2,2,2020-01-22,0.0,0
3,0,2,3,2020-01-22,0.0,0
4,0,2,4,2020-01-22,0.0,0
...,...,...,...,...,...,...
496,0,3,178,2020-03-23,0.0,0
497,0,4,179,2020-03-23,0.0,0
498,0,0,180,2020-03-23,0.0,0
499,0,4,181,2020-03-23,0.0,0


In [17]:
d = {'types': ["Confirmed", "Recovered", "Deaths"], 'type_id': [0,1,2]}
types_df = pd.DataFrame(data=d)
types_df.to_csv("data/reference/types.csv")

In [390]:
np.issubdtype(cases.full_dataset_cleaned[2]['count'].dtype, np.number)

True