In [None]:
import json
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

pd.set_option('display.max_rows', 500)

# Data Understanding

## Data Sources
* RKI, web scraping https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html
* Johns Hopkins (Git) https://github.com/CSSEGISandData/COVID-19.git
* REST API to retrieve COVID data from NPGEO https://npgeo-corona-npgeo-de.hub.arcgis.com/


### Johns Hopkins Source

In [None]:
# Global confirmed-cases time series, read from the COVID-19 folder under ../data/raw.
data_path = (
    '../data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/'
    'time_series_covid19_confirmed_global.csv'
)
pd_raw = pd.read_csv(data_path)

In [None]:
# Preview the first rows of the raw time series to check its layout.
pd_raw.head()

### Web Scraping from RKI


In [None]:
# Download the RKI case-number page. The timeout keeps the cell from hanging on an
# unresponsive server, and raise_for_status() turns an HTTP error response into an
# exception here instead of letting an error page be handed to the HTML parser.
page = requests.get(
    "https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Fallzahlen.html",
    timeout=30,
)
page.raise_for_status()

In [None]:
# Parse the downloaded HTML and collect every <tr> row of the first <table> found.
soup = BeautifulSoup(page.content, 'html.parser')

html_table = soup.find('table')
if html_table is None:
    # find() returns None when nothing matches; fail with a clear message instead of
    # an AttributeError on the find_all() call below.
    raise ValueError('No <table> element found in the downloaded RKI page')
all_rows = html_table.find_all('tr')
print(all_rows)

In [None]:
# Turn each table row into the list of stripped texts of its <td> (data) cells.
# A row without any <td> cell becomes an empty list.
final_table_data = [
    [cell.get_text(strip=True) for cell in table_row.find_all('td')]
    for table_row in all_rows
]
print(final_table_data)

In [None]:
# Positional column labels of the scraped table mapped to readable names.
rki_column_names = {
    0: 'state',
    1: 'cases',
    2: 'changes',
    3: 'cases_per_100k',
    4: 'fatality',
    5: 'comment',
}

# Build the frame, drop rows that contain any missing cell, rename, and preview.
pd.DataFrame(final_table_data).dropna().rename(columns=rki_column_names).head()

## REST API CALLS

In [None]:
## data request for Germany
# Query the ArcGIS feature service for all records and all output fields as JSON.
# The timeout keeps the cell from hanging on an unresponsive server, and
# raise_for_status() stops here on an HTTP error instead of passing an error body
# on to the JSON decoding step.
data = requests.get(
    'https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/Coronaf%C3%A4lle_in_den_Bundesl%C3%A4ndern/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json',
    timeout=30,
)
data.raise_for_status()

In [None]:
# Decode the JSON response body into a Python object and show its top-level keys.
json_object=json.loads(data.content) 
json_object.keys()

In [None]:
# Keep only the 'attributes' dictionary of every returned feature.
full_list = [feature['attributes'] for feature in json_object['features']]

In [None]:
# Build a DataFrame from the list of attribute dictionaries (one row per dictionary)
# and preview the first rows.
pd_full_list=pd.DataFrame(full_list)
pd_full_list.head()

In [None]:
# Save the table as a semicolon-separated CSV. The DataFrame index is written as
# well, because index=False is not passed. Then display the whole table.
pd_full_list.to_csv('../data/raw/NPGEO/GER_state_data.csv',sep=';')
pd_full_list

In [None]:
# Number of rows in the retrieved table.
len(pd_full_list)

## API access via REST service, e.g. USA data 


In [None]:
# US for full list
# The subscription key is read from the SMARTABLE_SUBSCRIPTION_KEY environment
# variable so that no credential has to be stored in the notebook. When the
# variable is unset the header value is an empty string, exactly as before.
headers = {
    'Cache-Control': 'no-cache',
    'Subscription-Key': os.environ.get('SMARTABLE_SUBSCRIPTION_KEY', ''),
}

# The timeout keeps the cell from blocking indefinitely on an unresponsive server.
response = requests.get(
    'https://api.smartable.ai/coronavirus/stats/US',
    headers=headers,
    timeout=30,
)
if response.status_code != 200:
    print("Something Wrong with Request!!")
else:
    print("Request Success!")

In [None]:
# Parse the response body and keep a pretty-printed copy of it on disk.
US_dict = json.loads(response.content)
with open('../data/raw/SMARTABLE/US_data.json', 'w') as json_file:
    json_file.write(json.dumps(US_dict, indent=2))

In [None]:
# Pretty-print the parsed response with two-space indentation for inspection.
print(json.dumps(US_dict,indent=2))

In [None]:
# Flatten every breakdown into one record: the fields of its 'location' dictionary
# plus the breakdown's 2nd to 7th key/value pairs (in the dictionary's own order).
# NOTE(review): the positional [1:7] slice presumably skips a leading 'location' key
# and keeps the statistics that follow it -- confirm against the API response layout.
full_list_US_country = []
for each_dict in US_dict['stats']['breakdowns']:
    # Copy the location dictionary so that update() below does not write the
    # statistics back into US_dict itself.
    flatten_dict = dict(each_dict['location'])
    flatten_dict.update(list(each_dict.items())[1:7])
    full_list_US_country.append(flatten_dict)

In [None]:
# Persist the flattened records as a semicolon-separated CSV without the index column.
us_breakdown_frame = pd.DataFrame(full_list_US_country)
us_breakdown_frame.to_csv('../data/raw/SMARTABLE/full_list_US_country.csv', sep=';', index=False)