In [None]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
sns.set()
# import the KMeans clustering model from scikit-learn
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Remote CSV sources hosted in the project's S3 bucket.
AQUASTAT_URL = 'https://mda-project-poland.s3.eu-west-3.amazonaws.com/ultimate+aquastat+(flat).csv'
GDP_URL = 'https://mda-project-poland.s3.eu-west-3.amazonaws.com/GDP.csv'

# Long-format AQUASTAT data for all years; used for the time series analysis.
df = pd.read_csv(AQUASTAT_URL)
# GDP data, loaded separately and merged with the AQUASTAT tables later.
df_GDP = pd.read_csv(GDP_URL)

# Keep only the 2018 rows; used for classification.
is_2018 = df['Year'] == 2018
df_2018 = df.loc[is_2018]

In [None]:

# GDP for 2018, pivoted to one row per 'Area' and one column per variable.
gdp_is_2018 = df_GDP['Year'] == 2018
df_GDP_2018 = df_GDP.loc[gdp_is_2018]
table_GDP_2018 = df_GDP_2018.pivot_table(values='Value', index='Area', columns='Variable Name')

# AQUASTAT 2018 data in the same wide layout, without the undernourishment
# column, joined with GDP. The inner join keeps only areas present in both.
table_2018 = (
    df_2018
    .pivot_table(values='Value', index='Area', columns='Variable Name')
    # TODO download correct data from AQUASTAT instead of dropping here
    .drop(columns='Prevalence of undernourishment (3-year average)')
    .merge(table_GDP_2018, how='inner', on='Area')
)

table_2018.head()

In [None]:
# Full time-series table: one row per ('Area', 'Year') pair, one column per variable.
panel_index = ['Area', 'Year']

table_GDP = df_GDP.pivot_table(values='Value', index=panel_index, columns='Variable Name')

# Drop undernourishment and add GDP; the inner join keeps only the
# ('Area', 'Year') pairs present in both sources.
table = (
    df
    .pivot_table(values='Value', index=panel_index, columns='Variable Name')
    # TODO download correct data from AQUASTAT instead of dropping here
    .drop(columns='Prevalence of undernourishment (3-year average)')
    .merge(table_GDP, how='inner', on=panel_index)
)
table.head(21)

## Data cleaning

The only missing value in the 2018 dataset is the National Rainfall Index (NRI) in North Korea.

In [None]:
# Impute each missing entry with the mean of its own column.
# TODO maybe replace with some manually calculated value (using other sources of data) instead of mean
column_means = table_2018.mean()
table_2018_preprocessed = table_2018.fillna(value=column_means)

The complete time-series table has missing values for the oldest observations. Looking at the data, one notices a clear trend over the years for each variable and each country. This observation is used to fill the missing values.

Interpolation would be the ideal way to fill the data gaps. Because our dataset uses a MultiIndex, linear interpolation is the only interpolation method available. However, linear interpolation is not useful in this situation, since the missing values occur only in the oldest observations: there is no earlier value on which to base the 'inter'polation, so these gaps are left unfilled.

Hence, 'bfill' (backward fill) is used to fill the data gaps. A possible improvement would be to create a DataFrame with a single index (e.g. one DataFrame per country with 'Year' as index) and to perform a spline interpolation, which can extrapolate NaNs at the tail of a Series/DataFrame.

In [None]:
# Back-fill missing values with the next later observation of the same country.
#
# The fill is done per 'Area' group: a plain back-fill over the whole
# ('Area', 'Year') table would copy the first row of the *next* country into
# any gap that has no later observation for its own country. With the grouped
# fill, such values stay NaN instead of silently taking another country's data.
# The grouped `.bfill()` also avoids `fillna(method='bfill')`, which is
# deprecated in recent pandas versions.
#
# Possible improvement (needs a single 'Year' index, e.g. one frame per country):
#   rev_table = table.iloc[::-1]
#   rev_table_preprocessed = rev_table.interpolate(method='spline', order=2, axis=0)
#   table = rev_table_preprocessed.iloc[::-1]
table_preprocessed = table.groupby(level='Area').bfill()
table_preprocessed.head(21)