# Data Understanding

In [1]:
import pandas as pd
pd.set_option('display.max_rows', 200)
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt

from fuzzywuzzy import fuzz, process

## Import Data

### Corona

In [86]:
df = pd.read_csv("https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/csse_covid_19_daily_reports/03-11-2020.csv")

In [2]:
confirmed = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv")
deaths = pd.read_csv("https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Deaths.csv")
recovered = pd.read_csv("https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Recovered.csv")

In [3]:
df.head()

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,Latitude,Longitude
0,Hubei,China,2020-03-11T10:53:02,67773,3046,49134,30.9756,112.2707
1,,Italy,2020-03-11T21:33:02,12462,827,1045,43.0,12.0
2,,Iran,2020-03-11T18:52:03,9000,354,2959,32.0,53.0
3,,"Korea, South",2020-03-11T21:13:18,7755,60,288,36.0,128.0
4,France,France,2020-03-11T22:53:03,2281,48,12,46.2276,2.2137


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216 entries, 0 to 215
Data columns (total 8 columns):
Province/State    109 non-null object
Country/Region    216 non-null object
Last Update       216 non-null object
Confirmed         216 non-null int64
Deaths            216 non-null int64
Recovered         216 non-null int64
Latitude          216 non-null float64
Longitude         216 non-null float64
dtypes: float64(2), int64(3), object(3)
memory usage: 13.6+ KB


In [103]:
country_dict = {'Mainland China': 'China',
#                 'Korea, South': 'Korea, Rep.',
                'US':'United States',
                'Taiwan*': "Taiwan",
                "Bosnia and Herzegovina":"Bosnia-Herzegovina"
#                 "Iran": "Iran, Islamic Rep.",
#                 "Russia": "Russian Federation",
#                 "Egypt": "Egypt, Arab Rep.",
#                 "Brunei":"Brunei Darussalam",
#                 "Czechia":"Czech Republic",
#                 "Slovakia": "Slovak Republic"
               }
# Replacing "Mainland China" with "China" 
df['Country/Region'].replace(country_dict, inplace=True)

### Corona Time Series

In [198]:
ts_confirmed = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv")
ts_deaths = pd.read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Deaths.csv")
ts_recovered = pd.read_csv("https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Recovered.csv")

In [192]:
ts.to_excel("data/time-series/"+file+".xlsx")

### Population

In [10]:
pop = pd.read_csv("data/pop/population.csv", skiprows=4).iloc[:, [0,1, -3]]
pop = pop.rename(columns={'2018':'population', "Country Name":"Country"})

In [11]:
pop.head()

Unnamed: 0,Country,Country Code,population
0,Aruba,ABW,105845.0
1,Afghanistan,AFG,37172386.0
2,Angola,AGO,30809762.0
3,Albania,ALB,2866376.0
4,Andorra,AND,77006.0


In [12]:
pop[pop.Country.str.contains("United", case=False)]

Unnamed: 0,Country,Country Code,population
6,United Arab Emirates,ARE,9630959.0
79,United Kingdom,GBR,66488991.0
249,United States,USA,327167434.0


In [20]:
pop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264 entries, 0 to 263
Data columns (total 3 columns):
Country         264 non-null object
Country Code    264 non-null object
population      262 non-null float64
dtypes: float64(1), object(2)
memory usage: 6.3+ KB


### Population 2019

In [83]:
pop = pd.read_csv("data/pop/prb_pop.csv", skiprows=4)
pop.head(10)

Unnamed: 0,FIPS,Name,Type,TimeFrame,Data
0,WORLD,WORLD,World,2019,7691.463
1,AFRICA,AFRICA,Sub-Region,2019,1305.215
2,NORTHERN AFRICA,NORTHERN AFRICA,Sub-Region,2019,239.895
3,DZ,Algeria,Country,2019,43.406
4,EG,Egypt,Country,2019,99.064
5,LY,Libya,Country,2019,6.777
6,MA,Morocco,Country,2019,35.587
7,SD,Sudan,Country,2019,42.813
8,TN,Tunisia,Country,2019,11.665
9,EH,Western Sahara,Country,2019,0.582


### Country names

In [16]:
country = pd.read_csv("data/pop/WDICountry.csv").iloc[:,:10]

In [17]:
country.sample(5)

Unnamed: 0,Country Code,Short Name,Table Name,Long Name,2-alpha code,Currency Unit,Special Notes,Region,Income Group,WB-2 code
250,VCT,St. Vincent and the Grenadines,St. Vincent and the Grenadines,St. Vincent and the Grenadines,VC,East Caribbean dollar,,Latin America & Caribbean,Upper middle income,VC
7,ARG,Argentina,Argentina,Argentine Republic,AR,Argentine peso,,Latin America & Caribbean,Upper middle income,AR
56,DNK,Denmark,Denmark,Kingdom of Denmark,DK,Danish krone,,Europe & Central Asia,High income,DK
71,EUU,European Union,European Union,European Union,EU,,European Union aggregate.,,,EU
22,BIH,Bosnia and Herzegovina,Bosnia and Herzegovina,Bosnia and Herzegovina,BA,Bosnia and Herzegovina convertible mark,,Europe & Central Asia,Upper middle income,BA


In [18]:
country.shape

(263, 10)

## Match

In [28]:
%%time

def match_country(query, choices, scorer=fuzz.QRatio, score_cutoff=80):

    match = process.extractOne(query=query, choices=choices, scorer=scorer, score_cutoff=score_cutoff)
    if match:
        return query, match[0], match[1], match[2]
    else:
        return (np.nan, np.nan, np.nan, np.nan)

    
result = df['Country/Region'].drop_duplicates().apply(match_country, choices=pop.Country,  
                                              scorer=fuzz.QRatio, score_cutoff=0)

result = pd.DataFrame(result.tolist(), columns=['country', 'matched_country', 'score', 'key'])

CPU times: user 299 ms, sys: 4.35 ms, total: 303 ms
Wall time: 323 ms


In [27]:
result

Unnamed: 0,country,matched_country,score,key
0,China,China,100,38
1,Italy,Italy,100,114
2,Iran,Iraq,75,111
3,"Korea, South","Korea, Rep.",64,124
4,France,France,100,75
5,Spain,Spain,100,68
6,Germany,Germany,100,53
7,Cruise Ship,Cyprus,47,51
8,Switzerland,Switzerland,100,35
9,Japan,Japan,100,117


In [134]:
def merge_match(df, result, fonds_column="fonds_clean",ISIN_column="ISIN", result_isin="ISIN", result_fonds="fonds"):
    '''Fill some missing ISIN values with matching the fund names.'''

    # Create a dictionary from current fond-ISIN pairs
    match_dict = dict(zip(result[result_fonds], result[result_isin]))

    # Fill ISIN column with mapped values
    df[ISIN_column] = df[ISIN_column].fillna(df[fonds_column].map(match_dict))
    
    return df

# Merge the previously matched result with the df
df = preprocessing.merge_match(df, result[result.score>70])

NameError: name 'preprocessing' is not defined

## Merge

In [72]:
merged = df.merge(pop, how="left", left_on="Country/Region", right_on="Country")

In [104]:
merged = df.merge(pop, how="left", left_on="Country/Region", right_on="Name")

In [105]:
merged.sample(5)

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,Latitude,Longitude,FIPS,Name,Type,TimeFrame,Data
132,Kentucky,United States,2020-03-11T03:13:08,8,0,0,37.6681,-84.6701,US,United States,Country,2019.0,329.153
116,,Latvia,2020-03-11T13:53:24,10,0,1,56.8796,24.6032,LV,Latvia,Country,2019.0,1.913
100,Qinghai,China,2020-03-11T02:18:14,18,0,18,35.7452,95.9956,CN,China,Country,2019.0,1398.03
66,,Finland,2020-03-11T13:33:10,59,0,1,64.0,26.0,FI,Finland,Country,2019.0,5.521
198,Delaware,United States,2020-03-11T23:13:06,1,0,0,39.3185,-75.5071,US,United States,Country,2019.0,329.153


In [106]:
merged.isna().sum()

Province/State    107
Country/Region      0
Last Update         0
Confirmed           0
Deaths              0
Recovered           0
Latitude            0
Longitude           0
FIPS                5
Name                5
Type                5
TimeFrame           5
Data                5
dtype: int64

In [107]:
merged[merged.Name.isna()]

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,Latitude,Longitude,FIPS,Name,Type,TimeFrame,Data
14,Diamond Princess,Cruise Ship,2020-03-11T22:13:12,696,7,325,35.4437,139.638,,,,,
135,,Bosnia and Herzegovina,2020-03-11T22:13:12,7,0,0,43.9159,17.6791,,,,,
184,,Congo (Kinshasa),2020-03-11T21:13:24,1,0,0,-4.0383,21.7587,,,,,
187,,Holy See,2020-03-10T19:13:21,1,0,0,41.9029,12.4534,,,,,
215,,occupied Palestinian territory,2020-03-11T20:53:02,0,0,0,31.9522,35.2332,,,,,


In [108]:
merged[merged.Country.isna()]

AttributeError: 'DataFrame' object has no attribute 'Country'

In [114]:
country[country['Special Notes'].str.contains("Palestine", case=False)]

ValueError: cannot index with vector containing NA / NaN values

In [116]:
pop[pop.Name.str.contains("Bosnia", case=False)]

Unnamed: 0,FIPS,Name,Type,TimeFrame,Data
203,BA,Bosnia-Herzegovina,Country,2019,3.493


In [38]:
pop[pop["Country Code"].str.contains("NC", case=False)]

Unnamed: 0,Country,Country Code,population
170,New Caledonia,NCL,284060.0


## Analysis

In [60]:
agg = merged.groupby("Country/Region", as_index=False)[['Confirmed', 'Deaths','Recovered']].sum()

In [61]:
agg = agg.merge(pop, how="left", left_on="Country/Region", right_on="Country")

In [62]:
agg.population = agg.population / 1000000

In [63]:
agg['Active'] = agg.Confirmed - (agg.Deaths + agg.Recovered)

In [64]:
agg['Confirmed_per_Cap'] = agg.Confirmed/ agg.population
agg['Deaths_per_Cap'] = agg.Deaths/ agg.population
agg['Recovered_per_Cap'] = agg.Recovered/ agg.population
agg['Active_per_Cap'] = agg.Active/ agg.population

In [65]:
agg.head()

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Country,Country Code,population,Active,Confirmed_per_Cap,Deaths_per_Cap,Recovered_per_Cap,Active_per_Cap
0,Afghanistan,7,0,0,Afghanistan,AFG,37.172386,7,0.188312,0.0,0.0,0.188312
1,Albania,12,1,0,Albania,ALB,2.866376,11,4.186471,0.348873,0.0,3.837598
2,Algeria,20,0,0,Algeria,DZA,42.228429,20,0.473615,0.0,0.0,0.473615
3,Andorra,1,0,0,Andorra,AND,0.077006,1,12.986001,0.0,0.0,12.986001
4,Argentina,19,1,0,Argentina,ARG,44.494502,18,0.427019,0.022475,0.0,0.404544


In [66]:
agg.head()

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Country,Country Code,population,Active,Confirmed_per_Cap,Deaths_per_Cap,Recovered_per_Cap,Active_per_Cap
0,Afghanistan,7,0,0,Afghanistan,AFG,37.172386,7,0.188312,0.0,0.0,0.188312
1,Albania,12,1,0,Albania,ALB,2.866376,11,4.186471,0.348873,0.0,3.837598
2,Algeria,20,0,0,Algeria,DZA,42.228429,20,0.473615,0.0,0.0,0.473615
3,Andorra,1,0,0,Andorra,AND,0.077006,1,12.986001,0.0,0.0,12.986001
4,Argentina,19,1,0,Argentina,ARG,44.494502,18,0.427019,0.022475,0.0,0.404544


In [67]:
sort = agg.sort_values('Active_per_Cap', ascending=False).iloc[:20,:]

In [68]:
agg.to_excel("corona with population.xlsx")

In [222]:
agg[agg['Country/Region'].str.contains(r"San Marino", regex=True)]

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Country,Country Code,population,Active,Confirmed_per_Cap,Deaths_per_Cap,Recovered_per_Cap,Active_per_Cap
92,San Marino,62,2,0,San Marino,SMR,33.785,60,1.835134,0.059198,0.0,1.775936
