# This Jupyter Notebook will only cover tasks which have nothing to do with the Story Telling

## API

The team tried working with an API but sadly the requests lead to an overhead and for other APIs which would have provided the data needed, subscriptions or licenses had to be bought.

In [None]:
import pandas as pd
import requests
import json
import datetime
import time

In [None]:
# we need the country list
# country list is parted in three seperate lists to keep the rate limit low


# keep only in eu since those are also the regulation countries and bigger servers are located there as well
europe_country_list = ["Austria", "Belgium", "Bulgaria", "Croatia", "Cyprus", "Czechia", "Denmark", "Estonia", "Finland", "France", "Germany", "Greece", "Hungary", "Iceland", "Ireland", "Italy", "Latvia", "Liechtenstein", "Lithuania", "Luxembourg", "Malta", "Netherlands", "Norway", "Poland", "Portugal" , "Romania", "Slovakia", "Slovenia", "Spain", "Sweden" ]
# europe_country_list = ["Albania", "Andorra", "Armenia", "Austria", "Azerbaijan", "Belarus", "Belgium", "Bosnia and Herzegovina", "Bulgaria", "Croatia", "Cyprus", "Czech Republic", "Denmark", "Estonia", "Finland", "France", "Georgia", "Germany", "Greece", "Hungary", "Iceland", "Ireland", "Italy", "Kazakhstan", "Latvia", "Liechtenstein", "Lithuania", "Luxembourg", "Macedonia", "Malta", "Moldova", "Monaco", "Montenegro", "Netherlands", "Norway", "Poland","Portugal","Romania", "Russia", "San Marino", "Serbia", "Slovakia", "Slovenia", "Spain", "Sweden", "Switzerland", "Turkey", "Ukraine","United Kingdom", "Vatican City"]

print(f"Number of Countries: {len(europe_country_list)}" )

In [None]:
url = "https://covid-193.p.rapidapi.com/history"

headers = {
	"X-RapidAPI-Key": "ba735a5542msh0672a6a248c9225p1ba64djsnfd32c8982360",
	"X-RapidAPI-Host": "covid-193.p.rapidapi.com"
}

### March Data

In [None]:
# collect the results for each country in that specific timeframe
# march
start_date = datetime.date(2020, 3, 1)
end_date = datetime.date(2020, 4, 1)
delta = datetime.timedelta(days=1)

for country in europe_country_list:
    j = []
    while start_date <= end_date:
        # add a sleep into the while loop to let requests go through better
        time.sleep(10)
        querystring = {"country":f"{country}","day":f"{start_date}"}
        result = requests.request("GET", url, headers=headers, params=querystring)
        country_json = result.json()
        # put it into the countries json
        j.append(country_json)
        start_date += delta
    # save the json
    with open(f"covid_data/api_data/march/{country}_march.json", "w") as jsonfile:
        json.dump(j, jsonfile)
    # set start date again
    start_date = datetime.date(2020, 3, 1)

### April Data

In [None]:
# april
start_date = datetime.date(2020, 4, 1)
end_date = datetime.date(2020, 5, 1)
delta = datetime.timedelta(days=1)

for country in europe_country_list:
    j = []
    while start_date <= end_date:
        time.sleep(10)
        querystring = {"country":f"{country}","day":f"{start_date}"}
        result = requests.request("GET", url, headers=headers, params=querystring)
        country_json = result.json()
        # put it into the countries json
        j.append(country_json)
        start_date += delta
    # save the json
    with open(f"covid_data/api_data/april/{country}_april.json", "w") as jsonfile:
        json.dump(j, jsonfile)
    start_date = datetime.date(2020, 4, 1)

### Mai Data

In [None]:
# mai
start_date = datetime.date(2020, 5, 1)
end_date = datetime.date(2020, 6, 1)
delta = datetime.timedelta(days=1)

for country in europe_country_list:
    j = []
    while start_date <= end_date:
        querystring = {"country":f"{country}","day":f"{start_date}"}
        result = requests.request("GET", url, headers=headers, params=querystring)
        country_json = result.json()
        # put it into the countries json
        j.append(country_json)
        start_date += delta
    time.sleep(20)
    # save the json
    with open(f"covid_data/api_data/mai/{country}_mai.json", "w") as jsonfile:
        json.dump(j, jsonfile)
    start_date = datetime.date(2020, 5, 1)

Now that all the data has been collected we should also take a look upon what exactly has been collected and what the data consists of. For each country in our Europe List we collected the *History* of each country. When collecting the history one gets the following data for each date passed:
- day
- number of results
- response : continent, country, population, cases, deaths, tests

We are most interested in the cases parameter alongside with which regulations where present during that time.
When looking at the cases for each date per country we get the following:
- new cases
- active cases
- critical cases
- recovered cases
- total cases

Let's take a look at one example:

In [None]:
import json

with open('covid_data/api_data/april/Albania_april.json', 'r') as handle:
    parsed = json.load(handle)

print(json.dumps(parsed, indent=4))

As one can also see: for each date given we get multiple statistical responses since the covid data was obviously updated regularly over the day. We will use the latest given values of the day for our inspections. Furthermore we will only use the number of total cases, since the number of new cases has been stated differently per country ( either they say how much in comparison to before or the total number + the new number ).

So to make it easier for us we will create 3 Dataframes, in which each dataframe represents one month with countries, dates and the total cases of that date.

### April DF

In [None]:
# iterate through the folder, parse each file and add the information into the dataframe
import os
# assign directory
directory = 'covid_data/api_data/april'
# empty dataframe
april_df = pd.DataFrame(columns = ["Country", "DateTime", "Total Cases"])

# iterate over files in
# that directory
for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    with open(f, 'r') as handle:
        parsed = json.load(handle)
        for info in parsed:
            # country
            country = info['parameters']["country"]
            # total numbers
            total_cases = info['response'][0]["cases"]["total"]
            # date and time
            date_time = info['response'][0]["time"]
            # add this into the dataframe
            april_df = april_df.append({'Country' : country, 'DateTime' : date_time, 'Total Cases' : total_cases}, ignore_index = True)
    print(country)

### March DF

In [None]:
# iterate through the folder, parse each file and add the information into the dataframe
import os
# assign directory
directory = 'covid_data/api_data/march'

march_df = pd.DataFrame(columns = ["Country", "DateTime", "Total Cases"])

# iterate over files in
# that directory
for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    with open(f, 'r') as handle:
        parsed = json.load(handle)
        for info in parsed:
            # country
            country = info['parameters']["country"]
            # total numbers
            total_cases = info['response'][0]["cases"]["total"]
            # date and time
            date_time = info['response'][0]["time"]
            # add this into the dataframe
            march_df = march_df.append({'Country' : country, 'DateTime' : date_time, 'Total Cases' : total_cases}, ignore_index = True)
    print(country)

### Mai DF

In [None]:
# iterate through the folder, parse each file and add the information into the dataframe
import os
# assign directory
directory = 'covid_data/api_data/mai'

mai_df = pd.DataFrame(columns = ["Country", "DateTime", "Total Cases"])

# iterate over files in
# that directory
for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    with open(f, 'r') as handle:
        parsed = json.load(handle)
        for info in parsed:
            # country
            country = info['parameters']["country"]
            # total numbers
            total_cases = info['response'][0]["cases"]["total"]
            # date and time
            date_time = info['response'][0]["time"]
            # add this into the dataframe
            mai_df = mai_df.append({'Country' : country, 'DateTime' : date_time, 'Total Cases' : total_cases}, ignore_index = True)
    print(country)

## Map Reduce and Spark

### Map Reduce

We tried out tiny jobs with Map Reduce on the dataframe of regulations.
For that we will focus only on the three specific Months starting with March.

In [1]:
pip install mrjob

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3 -> 22.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import matplotlib as plt
import pandas as pd
from mrjob.job import MRJob
import datetime

How often does each country call upon regulations:

In [10]:
%%file mrjob_job2.py
from mrjob.job import MRJob
from mrjob.step import MRStep

class MovieLensData_Analysis(MRJob):
    
    def mapper(self, _, line):
        words = line.split(',')
        yield words[0], 1
    
    def reducer(self, country, values):
        yield (country, sum(values))
        
        
if __name__ == '__main__':
    MovieLensData_Analysis.run()

Writing mrjob_job2.py


In [11]:
%run mrjob_job2.py covid_data/country_regulations.csv

No configs found; falling back on auto-configuration
No configs found; falling back on auto-configuration
No configs found; falling back on auto-configuration
No configs found; falling back on auto-configuration
No configs specified for inline runner
No configs specified for inline runner
No configs specified for inline runner
No configs specified for inline runner
Creating temp directory C:\Users\chiar\AppData\Local\Temp\mrjob_job2.chiar.20221115.030646.697936
Creating temp directory C:\Users\chiar\AppData\Local\Temp\mrjob_job2.chiar.20221115.030646.697936
Creating temp directory C:\Users\chiar\AppData\Local\Temp\mrjob_job2.chiar.20221115.030646.697936
Creating temp directory C:\Users\chiar\AppData\Local\Temp\mrjob_job2.chiar.20221115.030646.697936
Running step 1 of 1...
Running step 1 of 1...
Running step 1 of 1...
Running step 1 of 1...
job output is in C:\Users\chiar\AppData\Local\Temp\mrjob_job2.chiar.20221115.030646.697936\output
job output is in C:\Users\chiar\AppData\Local\Temp

"\"Austria\""	83
"\"Belgium\""	67
"\"Bulgaria\""	67
"\"Country\""	1
"\"Croatia\""	44
"\"Cyprus\""	70
"\"Czechia\""	97
"\"Denmark\""	72
"\"Estonia\""	79
"\"Finland\""	38
"\"France\""	65
"\"Germany\""	63
"\"Greece\""	68
"\"Hungary\""	55
"\"Iceland\""	72
"\"Ireland\""	75
"\"Italy\""	76
"\"Latvia\""	62
"\"Liechtenstein\""	57
"\"Lithuania\""	88
"\"Luxembourg\""	56
"\"Malta\""	49
"\"Netherlands\""	112
"\"Norway\""	60
"\"Poland\""	81
"\"Portugal\""	57
"\"Romania\""	59
"\"Slovakia\""	75
"\"Slovenia\""	80
"\"Spain\""	61
"\"Sweden\""	33


Removing temp directory C:\Users\chiar\AppData\Local\Temp\mrjob_job2.chiar.20221115.030646.697936...
Removing temp directory C:\Users\chiar\AppData\Local\Temp\mrjob_job2.chiar.20221115.030646.697936...
Removing temp directory C:\Users\chiar\AppData\Local\Temp\mrjob_job2.chiar.20221115.030646.697936...
Removing temp directory C:\Users\chiar\AppData\Local\Temp\mrjob_job2.chiar.20221115.030646.697936...


### Spark

In [15]:
pip install pyspark

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3 -> 22.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [16]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnan, count, when, col, desc, udf, col, sort_array, asc, avg
from pyspark.sql.functions import sum as Fsum
from pyspark.sql.window import Window
from pyspark.sql.types import IntegerType

In [17]:
spark = SparkSession \
    .builder \
    .appName("task") \
    .getOrCreate()

In [25]:
df = spark.read.csv("covid_data/country_regulations.csv")

In [26]:
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)



In [27]:
# get the number of how often all Events were prohibited
df.filter(df._c1 == 'BanOnAllEvents') \
    .select('_c0', '_c1') \
    .dropDuplicates() \
    .count()

25

In [28]:
# get the number of how often all Mass Gatherings were prohibited
df.filter(df._c1 == 'MassGatherAll') \
    .select('_c0', '_c1') \
    .dropDuplicates() \
    .count()

30

In [None]:
# get the number of how often all Entertainment Venues were prohibited
df.filter(df._c1 == 'EntertainmentVenues') \
    .select('_c0', '_c1') \
    .dropDuplicates() \
    .count()