In [1]:
import subprocess
import os

import pandas as pd

import requests
from bs4 import BeautifulSoup

import json


pd.set_option('display.max_rows', 500)

![CRISP_DM](../reports/figures/CRISP_DM.png)

# Data Understanding

* RKI, webscrape (webscraping) https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html
* John Hopkins (GITHUB) https://github.com/CSSEGISandData/COVID-19.git
* REST API services to retreive data https://npgeo-corona-npgeo-de.hub.arcgis.com/

## GITHUB csv data

git clone/pull https://github.com/CSSEGISandData/COVID-19.git

In [2]:

git_pull = subprocess.Popen( "/usr/bin/git pull" , 
                     cwd = os.path.dirname( '../data/raw/COVID-19/' ), 
                     shell = True, 
                     stdout = subprocess.PIPE, 
                     stderr = subprocess.PIPE )
(out, error) = git_pull.communicate()


print("Error : " + str(error)) 
print("out : " + str(out))

Error : b'The system cannot find the path specified.\r\n'
out : b''


In [3]:
data_path='../data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
pd_raw=pd.read_csv(data_path)

In [4]:
pd_raw.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,6/10/22,6/11/22,6/12/22,6/13/22,6/14/22,6/15/22,6/16/22,6/17/22,6/18/22,6/19/22
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,180864,180864,180864,181120,181178,181236,181465,181534,181574,181666
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,276638,276690,276731,276731,276821,276821,276821,277141,277141,277409
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,265925,265925,265927,265937,265943,265952,265964,265968,265971,265975
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,43224,43224,43224,43224,43224,43449,43449,43449,43449,43449
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,99761,99761,99761,99761,99761,99761,99761,99761,99761,99761


## Webscrapping

In [5]:
page = requests.get("https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html")

In [6]:
soup = BeautifulSoup(page.content, 'html.parser')

In [14]:
html_table=soup.find('table') # find the table, attention this works if one table exists


In [8]:
all_rows=html_table.find_all('tr')

In [17]:
final_data_list=[]


In [19]:
for pos,rows in enumerate(all_rows):
   
    col_list=[each_col.get_text(strip=True) for each_col in rows.find_all('td')] #td for data element
    final_data_list.append(col_list)
    

    

In [22]:
pd_daily_status=pd.DataFrame(final_data_list).dropna().rename(columns={0:'state',
                                                       1:'cases',
                                                       2:'changes',
                                                       3:'cases_per_100k',
                                                       4:'fatal',
                                                       5:'comment'})
print(pd_daily_status)

                       state       cases changes cases_per_100k  fatal  \
2         Baden-Württem­berg   3.756.405       0         30.514  274,8   
3                     Bayern   5.019.301       0         46.998  357,7   
4                     Berlin   1.075.132       0         10.776  294,1   
5               Branden­burg     806.027       0          7.123  281,4   
6                     Bremen     208.715     248          3.632  534,0   
7                    Hamburg     608.227       0          7.688  415,0   
8                     Hessen   1.948.045       0         35.461  563,5   
9   Meck­lenburg-Vor­pommern     499.015       0          5.045  313,2   
10            Nieder­sachsen   2.496.938       0         52.451  655,4   
11     Nord­rhein-West­falen   5.502.630   6.693         85.229  475,5   
12          Rhein­land-Pfalz   1.199.467       0         18.020  439,7   
13                  Saarland     325.175       0          4.391  446,2   
14                   Sachsen   1.532.9

In [21]:
pd_daily_status.head()

Unnamed: 0,state,cases,changes,cases_per_100k,fatal,comment
2,Baden-Württem­berg,3.756.405,0,30.514,2748,16.225
3,Bayern,5.019.301,0,46.998,3577,24.232
4,Berlin,1.075.132,0,10.776,2941,4.636
5,Branden­burg,806.027,0,7.123,2814,5.703
6,Bremen,208.715,248,3.632,5340,782.0


## REST API calls

In [23]:
## data request for Germany
data=requests.get('https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/Coronaf%C3%A4lle_in_den_Bundesl%C3%A4ndern/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json')

In [14]:
json_object=json.loads(data.content) 

In [15]:
type(json_object)

dict

In [16]:
json_object.keys()

dict_keys(['objectIdFieldName', 'uniqueIdField', 'globalIdFieldName', 'geometryProperties', 'serverGens', 'geometryType', 'spatialReference', 'fields', 'features'])

In [17]:
full_list=[]
for pos,each_dict in enumerate (json_object['features'][:]):
    full_list.append(each_dict['attributes'])
    

In [18]:
pd_full_list=pd.DataFrame(full_list)
pd_full_list.head()

Unnamed: 0,OBJECTID_1,LAN_ew_AGS,LAN_ew_GEN,LAN_ew_BEZ,LAN_ew_EWZ,OBJECTID,Fallzahl,Aktualisierung,AGS_TXT,GlobalID,faelle_100000_EW,Shape__Area,Shape__Length,Death
0,1,1,Schleswig-Holstein,Land,2896712,15,2414,1587420000000,1,fc5ba936-c95c-432c-8a33-9eb2f30b660f,83.335865,45737310000.0,2881496.0,70
1,2,2,Hamburg,Freie und Hansestadt,1841179,6,4204,1587420000000,2,0f3e860c-5181-4d3f-a421-1d51f50315ea,228.331955,2089396000.0,418800.2,91
2,3,3,Niedersachsen,Land,7982448,9,9098,1587420000000,3,3fd77024-c29b-4843-9be8-682ad48e60c9,113.975061,129983600000.0,4008988.0,306
3,4,4,Bremen,Freie Hansestadt,682986,5,609,1587420000000,4,4132268b-54de-4327-ac1e-760e915112f1,89.167274,1119157000.0,335717.7,25
4,5,5,Nordrhein-Westfalen,Land,17932651,10,29389,1587420000000,5,561d658f-3ee5-46e3-bc95-3528c6558ab9,163.885418,87829360000.0,2648673.0,896


In [19]:

pd_full_list.to_csv('../data/raw/NPGEO/GER_state_data.csv',sep=';')

In [20]:
pd_full_list.shape[0]

16

# API access via REST service, e.g. USA data 

example of a REST conform interface (attention registration mandatory)

www.smartable.ai

In [26]:


# US for full list
headers = {
    'Cache-Control': 'no-cache',
    'Subscription-Key': '28ee4219700f48718be78b057beb7eb4',
}

response = requests.get('https://api.smartable.ai/coronavirus/stats/US', headers=headers)
print(response)

ConnectionError: HTTPSConnectionPool(host='api.smartable.ai', port=443): Max retries exceeded with url: /coronavirus/stats/US (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002144A19B670>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))

In [25]:

US_dict=json.loads(response.content) # imports string
with open('../data/raw/SMARTABLE/US_data.json', 'w') as outfile:
    json.dump(US_dict, outfile,indent=2)

NameError: name 'response' is not defined

In [23]:
print(json.dumps(US_dict,indent=2)) #string dump

{
  "location": {
    "long": -95.712891,
    "countryOrRegion": "United States",
    "provinceOrState": null,
    "county": null,
    "isoCode": "US",
    "lat": 37.09024
  },
  "updatedDateTime": "2020-04-21T06:45:14.5523523Z",
  "stats": {
    "totalConfirmedCases": 789584,
    "newlyConfirmedCases": 28176,
    "totalDeaths": 42451,
    "newDeaths": 1770,
    "totalRecoveredCases": 67605,
    "newlyRecoveredCases": 1872,
    "history": [
      {
        "date": "2020-01-22T00:00:00",
        "confirmed": 1,
        "deaths": 0,
        "recovered": 0
      },
      {
        "date": "2020-01-23T00:00:00",
        "confirmed": 1,
        "deaths": 0,
        "recovered": 0
      },
      {
        "date": "2020-01-24T00:00:00",
        "confirmed": 2,
        "deaths": 0,
        "recovered": 0
      },
      {
        "date": "2020-01-25T00:00:00",
        "confirmed": 2,
        "deaths": 0,
        "recovered": 0
      },
      {
        "date": "2020-01-26T00:00:00",
        "con

# Individual States US

In [24]:
US_dict['stats']['breakdowns'][0]

{'location': {'long': -86.902298,
  'countryOrRegion': 'United States',
  'provinceOrState': 'Alabama',
  'county': None,
  'isoCode': 'US-AL',
  'lat': 32.318231},
 'totalConfirmedCases': 5078,
 'newlyConfirmedCases': 175,
 'totalDeaths': 164,
 'newDeaths': 4,
 'totalRecoveredCases': 0,
 'newlyRecoveredCases': 0}

In [25]:
full_list_US_country=[]
for pos,each_dict in enumerate (US_dict['stats']['breakdowns'][:]):
    flatten_dict=each_dict['location']
    flatten_dict.update(dict(list(US_dict['stats']['breakdowns'][pos].items())[1: 7]) 
    )
    full_list_US_country.append(flatten_dict)

In [26]:
pd.DataFrame(full_list_US_country).to_csv('../data/raw/SMARTABLE/full_list_US_country.csv',sep=';',index=False)