# Assignment 2.1
### Scrape and Analyse

* Beautiful Soup API documentation: [https://beautiful-soup-4.readthedocs.io/en/latest/](https://beautiful-soup-4.readthedocs.io/en/latest/)

In [None]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

### Tasks
Scrape data from the website [http://www.nationmaster.com](http://www.nationmaster.com/), convert it into Pandas data frames and use pandas queries to answer the following questions: 

#### 1
Get the number of internet users per country, remove all NaN entries and return the top 10 countries with the highest absolute number of internet users. 

In [None]:
# Download the NationMaster "Internet users" statistics page.
# HTTPS is used to match the other NationMaster requests in this notebook, and
# the timeout keeps an unresponsive server from hanging the kernel.
res = requests.get(
    "https://www.nationmaster.com/country-info/stats/Media/Internet-users",
    timeout=30,
)
# Fail loudly on an HTTP error instead of parsing an error page as data.
res.raise_for_status()
# Parse the response body as HTML.
soup = BeautifulSoup(res.content, 'lxml')
# Keep only the first <table> element found on the page.
table = soup.find_all('table')[0]
# Convert the table to pandas. read_html returns a *list* of DataFrames (one
# per table found). The markup is wrapped in StringIO because passing a literal
# HTML string to read_html is deprecated in recent pandas releases.
df = pd.read_html(StringIO(str(table)))

In [None]:
# Take the first scraped table and discard the GRAPH and HISTORY columns.
internet_users_table = df[0]
df0 = internet_users_table.drop(['GRAPH', 'HISTORY'], axis='columns')

In [None]:
# Discard every row that has a missing value in any of these columns.
required_columns = ['#', 'COUNTRY', 'AMOUNT']
df0 = df0.dropna(axis='index', how='any', subset=required_columns)

In [None]:
# The '#' column is not needed beyond the NaN filtering step; remove it.
df0 = df0.drop('#', axis='columns')

In [None]:
# Preview the first rows instead of rendering the whole frame.
df0.head()

In [None]:
# Rewrite the magnitude words as multiplication factors so that each value
# becomes an arithmetic expression (e.g. "... million" -> "... *1e6").
# 'billion' is handled too, consistent with the population clean-up in task 2;
# without it such a value would reach the evaluation step as invalid syntax.
df0['AMOUNT'] = df0['AMOUNT'].replace(
    {'million': '*1e6', 'billion': '*1e9'},
    regex=True,
)

In [None]:
# Evaluate each AMOUNT expression to obtain a number.
# NOTE(review): pd.eval is being applied to text scraped from a web page;
# consider replacing it with an explicit numeric parser.
amount_expressions = df0['AMOUNT']
df0['AMOUNT'] = amount_expressions.map(pd.eval)

In [None]:
# Order by AMOUNT, largest first, and show the ten leading rows.
internet_users_ranked = df0.sort_values('AMOUNT', ascending=False)
internet_users_ranked.iloc[:10]

#### 2
Get the number of internet users per country, remove all NaN entries and return the top 10 countries with the highest number of internet users relative to the population. (Hint: you need to scrape the population numbers from another page.)

In [None]:
# Download the NationMaster "Population" statistics page.
# HTTPS is used to match the other NationMaster requests in this notebook, and
# the timeout keeps an unresponsive server from hanging the kernel.
res = requests.get(
    "https://www.nationmaster.com/country-info/stats/People/Population",
    timeout=30,
)
# Fail loudly on an HTTP error instead of parsing an error page as data.
res.raise_for_status()
# Parse the response body as HTML.
soup = BeautifulSoup(res.content, 'lxml')
# Keep only the first <table> element found on the page.
table = soup.find_all('table')[0]
# Convert the table to pandas. read_html returns a list of DataFrames (one per
# table found). The markup is wrapped in StringIO because passing a literal
# HTML string to read_html is deprecated in recent pandas releases.
df_pop_list = pd.read_html(StringIO(str(table)))

In [None]:
# The scraped result is a list of tables; the population data is its first entry.
population_table_index = 0
df_pop = df_pop_list[population_table_index]

In [None]:
# Build the cleaned population frame in one chain, starting from the scraped
# table itself rather than from `df_pop`. This keeps the cell idempotent:
# re-running it no longer fails on columns that were already dropped.
df_pop = (
    df_pop_list[0]
    # Discard the GRAPH and HISTORY columns.
    .drop(columns=['GRAPH', 'HISTORY'])
    # Remove rows with a missing rank, country or amount.
    .dropna(subset=['#', 'COUNTRY', 'AMOUNT'])
    # The '#' column is only needed for the NaN filter above.
    .drop(columns=['#'])
)

In [None]:
# Preview the first rows instead of rendering the whole frame.
df_pop.head()

In [None]:
# Step 1: rewrite the magnitude words as multiplication factors.
scale_words = {'million': '*1e6', 'billion': '*1e9'}
population_expressions = df_pop['AMOUNT'].replace(scale_words, regex=True)
# Step 2: evaluate each resulting expression to obtain a number.
# NOTE(review): pd.eval is being applied to text scraped from a web page;
# consider replacing it with an explicit numeric parser.
df_pop['AMOUNT'] = population_expressions.map(pd.eval)

In [None]:
# Join the internet-user frame (left) with the population frame (right) on the
# country name. Columns present in both frames receive the default
# "_x" (left) / "_y" (right) suffixes.
result = df0.merge(df_pop, on="COUNTRY")

In [None]:
# Preview the first rows of the merged frame instead of rendering all of it.
result.head()

In [None]:
# Internet users relative to population. After the merge, AMOUNT_x comes from
# the internet-user frame (left) and AMOUNT_y from the population frame (right).
# A vectorized column division replaces the row-by-row apply/lambda.
result['relative'] = result['AMOUNT_x'] / result['AMOUNT_y']

In [None]:
# Preview the first rows (now including 'relative') instead of the whole frame.
result.head()

In [None]:
# Order by the relative share, largest first, and show the ten leading rows.
ranked_by_share = result.sort_values('relative', ascending=False)
ranked_by_share.iloc[:10]

#### 3
Compute the correlation between the crime rate (murders per 100k) and the education level. Compare this to the correlation between the crime rate and poverty (relative GDP). Hint: use pandas' built-in correlation function: [https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.corr.html](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.corr.html)

In [None]:
# Download the NationMaster "Murder rate" statistics page.
# The timeout keeps an unresponsive server from hanging the kernel.
res = requests.get(
    "https://www.nationmaster.com/country-info/stats/Crime/Violent-crime/Murder-rate",
    timeout=30,
)
# Fail loudly on an HTTP error instead of parsing an error page as data.
res.raise_for_status()
# Parse the response body as HTML.
soup = BeautifulSoup(res.content, 'lxml')
# Keep only the first <table> element found on the page.
table = soup.find_all('table')[0]
# Convert the table to pandas. read_html returns a list of DataFrames (one per
# table found). The markup is wrapped in StringIO because passing a literal
# HTML string to read_html is deprecated in recent pandas releases.
df_murder_list = pd.read_html(StringIO(str(table)))

In [None]:
# The scraped result is a list of tables; the murder-rate data is its first entry.
murder_table_index = 0
df_murder = df_murder_list[murder_table_index]

In [None]:
# Download the NationMaster "High school enrolment rate" statistics page.
# The timeout keeps an unresponsive server from hanging the kernel.
res = requests.get(
    "https://www.nationmaster.com/country-info/stats/Education/High-school-enrolment-rate",
    timeout=30,
)
# Fail loudly on an HTTP error instead of parsing an error page as data.
res.raise_for_status()
# Parse the response body as HTML.
soup = BeautifulSoup(res.content, 'lxml')
# Keep only the first <table> element found on the page.
table = soup.find_all('table')[0]
# Convert the table to pandas. read_html returns a list of DataFrames (one per
# table found). The markup is wrapped in StringIO because passing a literal
# HTML string to read_html is deprecated in recent pandas releases.
df_edu_list = pd.read_html(StringIO(str(table)))

In [None]:
# The scraped result is a list of tables; the enrolment data is its first entry.
education_table_index = 0
df_edu = df_edu_list[education_table_index]

In [None]:
# Build both cleaned frames in one chain each, starting from the scraped tables
# rather than from `df_murder` / `df_edu`. This keeps the cell idempotent:
# re-running it no longer fails on columns that were already dropped.
df_murder = (
    df_murder_list[0]
    # Discard the GRAPH, HISTORY and DATE columns.
    .drop(columns=['GRAPH', 'HISTORY', 'DATE'])
    # Remove rows with a missing rank, country or amount.
    .dropna(subset=['#', 'COUNTRY', 'AMOUNT'])
    # The '#' column is only needed for the NaN filter above.
    .drop(columns=['#'])
)

# Same clean-up steps for the high-school enrolment table.
df_edu = (
    df_edu_list[0]
    .drop(columns=['GRAPH', 'HISTORY', 'DATE'])
    .dropna(subset=['#', 'COUNTRY', 'AMOUNT'])
    .drop(columns=['#'])
)

In [None]:
# Give each AMOUNT column a descriptive name so the two do not collide in the merge.
murder_labels = {"AMOUNT": "murder-100k"}
education_labels = {"AMOUNT": "highschool-rate"}
df_murder = df_murder.rename(columns=murder_labels)
df_edu = df_edu.rename(columns=education_labels)

# Combine both statistics into one frame, matched on the country name.
df_murder_edu = df_murder.merge(df_edu, on="COUNTRY")

In [None]:
# Preview the first rows instead of rendering the whole frame.
df_murder_edu.head()

In [None]:
# Correlate only the numeric columns. The frame still contains the string
# COUNTRY column, and DataFrame.corr() raises on non-numeric data in
# pandas >= 2.0 (older versions silently dropped such columns).
df_murder_edu.select_dtypes(include='number').corr()

### REST API
#### Using data from [https://www.energidataservice.dk](https://www.energidataservice.dk) 

In [None]:
import pandas as pd
import requests
from pandas import json_normalize

In [None]:
# Fetch records of the "nordpoolmarket" resource from the Energi Data Service
# API (the query string limits the result to 500 records).
url = 'https://www.energidataservice.dk/proxy/api/datastore_search?resource_id=nordpoolmarket&limit=500'

# The timeout keeps an unresponsive server from hanging the kernel.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of trying to decode an error body.
response.raise_for_status()
dictr = response.json()  # parse the JSON body into a dict
recs = dictr['result']['records']  # the list of records inside the payload
df = json_normalize(recs)  # flatten the JSON records into a data frame
df.head()

#### 4
Compute overview statistics (mean, variance, quantiles, counts,...) for all variables. Hint: there is a single pandas call to get this ...

In [None]:
# Summary statistics per column; the quartiles listed are describe()'s defaults.
quartiles = [0.25, 0.5, 0.75]
df.describe(percentiles=quartiles)

#### 5 
Compute the average ***SpotSale*** for each day. 

In [None]:
# Parse HourUTC as a timestamp and keep only its calendar day, formatted as a
# 'YYYY-MM-DD' string, in a new 'Date' column.
hour_timestamps = pd.to_datetime(df['HourUTC'])
df['Date'] = hour_timestamps.dt.strftime('%Y-%m-%d')

In [None]:
# Group the SpotSale values by day, then average within each day.
spot_sale_by_day = df.groupby('Date')['SpotSale']
spot_sale_by_day.mean()

#### 6 
Find the day with the highest variance in ***SpotPurchase***.

In [None]:
# Variance of SpotPurchase within each day; idxmax returns the Date label of
# the day whose variance is largest.
spot_purchase_variance = df.groupby('Date')['SpotPurchase'].var()
spot_purchase_variance.idxmax()