In [79]:
covid_url = "https://opendata.ecdc.europa.eu/covid19/casedistribution/json/"
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
import json
import urllib
import pandas as pd
import numpy as np
import plotly.express as px

In [47]:
covid_json_unformated = urllib.request.urlopen(covid_url).read().decode("utf-8")
covid_json = json.loads(covid_json_unformated)
cdf = pd.DataFrame(covid_json['records'])

In [71]:
cdf.head(10)

Unnamed: 0,year_week,cases_weekly,deaths_weekly,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp,14d-incidence,date_reported,deltaTime_since_start_of_recording
0,2021-04,267,16,Afghanistan,AF,AFG,38041757.0,Asia,2.58,2021-01-02,362 days
1,2021-03,713,43,Afghanistan,AF,AFG,38041757.0,Asia,3.34,2021-01-25,385 days
2,2021-02,557,45,Afghanistan,AF,AFG,38041757.0,Asia,3.24,2021-01-18,378 days
3,2021-01,675,71,Afghanistan,AF,AFG,38041757.0,Asia,4.15,2021-11-01,665 days
4,2020-53,902,60,Afghanistan,AF,AFG,38041757.0,Asia,7.61,2021-04-01,451 days
5,2020-52,1994,88,Afghanistan,AF,AFG,38041757.0,Asia,7.19,2020-12-28,357 days
6,2020-51,740,111,Afghanistan,AF,AFG,38041757.0,Asia,6.56,2020-12-21,350 days
7,2020-50,1757,71,Afghanistan,AF,AFG,38041757.0,Asia,9.01,2020-12-14,343 days
8,2020-49,1672,137,Afghanistan,AF,AFG,38041757.0,Asia,7.22,2020-07-12,188 days
9,2020-48,1073,68,Afghanistan,AF,AFG,38041757.0,Asia,6.42,2020-11-30,329 days


Rename columns to something more Pythonian. If you think they look already great, then at least rename notification_rate_per_100000_population_14-days to 14d-incidence

Identify which columns have not been casted to an appropriate type during loading!

We did not cover datetime objects in pandas, however they are quite powerful!

Try:

In [49]:
cdf.rename(
    columns={"notification_rate_per_100000_population_14-days": "14d-incidence"},
    inplace=True
)

In [50]:
cdf['date_reported'] = pd.to_datetime(cdf['dateRep'])
cdf.drop(columns = "dateRep", inplace = True)

Unnamed: 0,year_week,cases_weekly,deaths_weekly,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp,14d-incidence,date_reported
0,2021-04,267,16,Afghanistan,AF,AFG,38041757.0,Asia,2.58,2021-01-02
1,2021-03,713,43,Afghanistan,AF,AFG,38041757.0,Asia,3.34,2021-01-25
2,2021-02,557,45,Afghanistan,AF,AFG,38041757.0,Asia,3.24,2021-01-18
3,2021-01,675,71,Afghanistan,AF,AFG,38041757.0,Asia,4.15,2021-11-01
4,2020-53,902,60,Afghanistan,AF,AFG,38041757.0,Asia,7.61,2021-04-01


Now you can treat the column as a datetime objects using df[col].dt , e.g. https://docs.python.org/3/library/datetime.html#datetime.date.year

In [14]:
cdf['date_reported'].dt.day.head()

0     2
1    25
2    18
3     1
4     1
Name: date_reported, dtype: int64

Create a new column deltaTime_since_start_of_recording

Create histograms for different columns or describe the df. Can you spot the inconsistency in the data? Fix it! :)

Identify those countries (grouped by continent) which showed the most drastic increase most drastic and decrease of the 14d-incidence within the different years since recording. Visualize intuitively!

Which country showed the highest/lowest fluctuation in 14d-incidence within a year?

In [57]:
cdf["deltaTime_since_start_of_recording"] =cdf["date_reported"] - cdf["date_reported"].min()

In [108]:
for variable in list(cdf.describe().columns):
    fig = px.histogram(cdf, x=variable)
    fig.show()

In [119]:
print(cdf["popData2019"].max())
print(cdf["deaths_weekly"].min())
print(cdf["cases_weekly"].min())
cdf.info()

1433783692.0
0
0
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10423 entries, 0 to 10432
Data columns (total 11 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   year_week                           10423 non-null  object 
 1   cases_weekly                        10423 non-null  int64  
 2   deaths_weekly                       10423 non-null  int64  
 3   countriesAndTerritories             10423 non-null  object 
 4   geoId                               10423 non-null  object 
 5   countryterritoryCode                10398 non-null  object 
 6   popData2019                         10398 non-null  float64
 7   continentExp                        10423 non-null  object 
 8   14d-incidence                       10423 non-null  object 
 9   date_reported                       10423 non-null  object 
 10  deltaTime_since_start_of_recording  10423 non-null  object 
dtypes: float64(1), int64(2),

In [132]:
cdf[cdf["deaths_weekly"]<0] = 0
cdf[cdf["cases_weekly"]<0] = 0
cdf["14d-incidence"] = cdf["14d-incidence"].fillna(0)
cdf["14d-incidence"] = cdf["14d-incidence"].replace("", 0)
cdf["14d-incidence"] = cdf["14d-incidence"].astype(float)
cdf[cdf["14d-incidence"]<0] = 0
cdf = cdf[cdf["popData2019"]!=0]



In [163]:
cdf = cdf[cdf["date_reported"] < pd.to_datetime("2021-02-09")]

In [174]:
cdf.pivot_table(
    index=[
        "continentExp",
        "countriesAndTerritories", 

    ], 
    aggfunc={
        '14d-incidence' : [np.max, np.min]
    }
)

Unnamed: 0_level_0,Unnamed: 1_level_0,14d-incidence,14d-incidence
Unnamed: 0_level_1,Unnamed: 1_level_1,amax,amin
continentExp,countriesAndTerritories,Unnamed: 2_level_2,Unnamed: 3_level_2
Africa,Algeria,33.78,0.0
Africa,Angola,10.50,0.0
Africa,Benin,5.75,0.0
Africa,Botswana,170.51,0.0
Africa,Burkina_Faso,11.39,0.0
...,...,...,...
Oceania,Papua_New_Guinea,2.43,0.0
Oceania,Solomon_Islands,1.34,0.0
Oceania,Vanuatu,0.33,0.0
Oceania,Wallis_and_Futuna,0.00,0.0


Create a line plot showing the 14-incidence for all European countries. Use groupby operation to generate the data list for the plotly plot.

Create a smoothed version of the 14d-incidence by averaging 3 months.

Create a radial plot of death rate / 100000 people (see popData2019), where one year completes a circle, i.e. 360˚. Visualize the recored years for Italy, Germany, Sweden and Greece. Hint you might need to turn the dateTime into day within the year (%j) and adjust 365 to 360 degrees.

Optional: Find "regular" mortality rates for those countries and visualize it in the plot as well.

In [164]:
df = cdf[["continentExp", "countriesAndTerritories", "14d-incidence"]].groupby("continentExp")

fig = px.line(cdf.loc[df.groups["Europe"]].sort_values("date_reported"), x="date_reported", y="14d-incidence", color='countriesAndTerritories')
fig.show()

In [177]:
from collections import deque 

grp = cdf[["continentExp", "countriesAndTerritories", "14d-incidence"]].groupby("continentExp")
sorted_df = cdf.loc[grp.groups["Europe"]].sort_values("date_reported")

for name, grp in sorted_df.groupby("countriesAndTerritories"):
    window = deque([], 90)
    averaged_list = []
    for day in grp["deltaTime_since_start_of_recording"]:
        window.append(1)
        average = sum(window)/len(window)
        averaged_list.append(average)
        
sorted_df.head()
#fig = px.line(sorted_df, x="date_reported", y="14d-incidence", color='countriesAndTerritories')
#fig.show()

Unnamed: 0,year_week,cases_weekly,deaths_weekly,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp,14d-incidence,date_reported,deltaTime_since_start_of_recording
5699,2020-22,0,0,Liechtenstein,LI,LIE,38378.0,Europe,0.0,2020-01-06 00:00:00,0 days 00:00:00
92,2020-22,139,1,Albania,AL,ALB,2862427.0,Europe,6.67,2020-01-06 00:00:00,0 days 00:00:00
2504,2020-22,9,0,Cyprus,CY,CYP,875899.0,Europe,3.2,2020-01-06 00:00:00,0 days 00:00:00
4335,2020-22,0,0,Holy_See,VA,VAT,815.0,Europe,0.0,2020-01-06 00:00:00,0 days 00:00:00
7210,2020-22,102,1,Norway,NO,NOR,5328212.0,Europe,4.02,2020-01-06 00:00:00,0 days 00:00:00
