In [92]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
sns.set()
# import the KMeans clustering model from scikit-learn
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
%matplotlib inline

In [93]:
df = pd.read_csv('https://mda-project-poland.s3.eu-west-3.amazonaws.com/ultimate+aquastat+(flat).csv')
df_GDP = pd.read_csv('https://mda-project-poland.s3.eu-west-3.amazonaws.com/GDP.csv')
df.shape

(4924, 9)

In [94]:
df_2018 = df.loc[df['Year'] == 2018] 
table_2018=pd.pivot_table(data=df_2018,values='Value',index='Area',columns='Variable Name')

GDP_2018 = df_GDP.loc[df_GDP['Year'] == 2018] 
GDP_table_2018=pd.pivot_table(data=GDP_2018,values='Value',index='Area',columns='Variable Name')

table_2018 = table_2018.merge(GDP_table_2018, how='inner', on='Area') # is used for classification
table_2018.head(14)

Variable Name,Agricultural water withdrawal as % of total renewable water resources,"Agriculture, value added (% GDP)",Environmental Flow Requirements,GDP per capita,"Industry, value added to GDP",Long-term average annual precipitation in volume,MDG 7.5. Freshwater withdrawal as % of total renewable water resources,National Rainfall Index (NRI),Population density,Prevalence of undernourishment (3-year average),SDG 6.4.1. Industrial Water Use Efficiency,SDG 6.4.1. Irrigated Agriculture Water Use Efficiency,SDG 6.4.1. Services Water Use Efficiency,SDG 6.4.1. Water Use Efficiency,SDG 6.4.2. Water Stress,"Services, value added to GDP",Total population with access to safe drinking-water (JMP),Total renewable water resources per capita,Total water withdrawal per capita,Gross Domestic Product (GDP)
Area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Afghanistan,30.613807,22.440899,28.29,563.983834,4798615000.0,213.48522,31.045462,185.0,56.937048,23.0,31.220059,0.102132,57.683901,0.935917,54.757019,10639420000.0,55.3,1757.509385,548.072294,17986970000.0
Argentina,3.187483,6.097129,515.8,11601.890079,107210000000.0,1643.2164,4.301333,1062.0,15.954953,3.1,32.596217,0.152694,62.519941,13.244872,10.456664,300734000000.0,99.1,19752.418501,851.646091,519871700000.0
Australia,2.444073,2.121597,243.3,58689.490194,302321000000.0,4133.81148,2.354494,717.1,3.216309,,91.638244,0.418891,405.62637,70.079929,4.657865,1014570000000.0,100.0,19760.502707,697.30878,1458953000000.0
Azerbaijan,32.406633,5.212381,12.03,4729.900727,23976880000.0,38.7102,35.088681,459.7,114.890727,2.5,50.317916,0.159418,46.604281,3.870952,53.7293,16833890000.0,87.0,3485.086793,1222.870974,47112480000.0
Bolivia (Plurinational State of),0.334495,11.481924,396.6,3592.084429,9721095000.0,1258.97268,0.363763,1503.0,10.334379,12.5,281.746033,0.248811,140.699591,13.71108,1.177001,20631900000.0,90.0,50558.691154,183.913845,40287650000.0
Brazil,0.434255,4.418538,4105.0,8861.531858,302661000000.0,14996.27097,0.747196,1934.0,24.597814,2.6,28.217631,0.451839,68.242864,22.523559,1.422501,1212280000000.0,98.1,41280.507695,308.350641,1885469000000.0
Canada,0.090937,1.834932,1931.0,46343.371942,371962000000.0,5305.42575,1.22674,693.6,3.752581,,13.030472,0.411861,235.05463,43.063121,3.666323,1192630000000.0,99.8,78274.694115,944.771782,1721906000000.0
Central African Republic,0.000284,31.241338,119.4,473.499434,556437300.0,836.66214,0.051418,1331.0,7.490398,46.1,40.205123,0.128814,13.895263,18.174045,0.335648,963152000.0,68.5,30216.219552,15.536709,2220979000.0
China,13.56233,7.164207,1471.0,9855.461832,5546710000000.0,6192.008321,20.836414,1049.0,152.018297,8.5,43.689158,2.18792,93.241371,23.538964,43.221688,7040600000000.0,95.5,1946.18581,409.832243,14311690000000.0
Democratic People's Republic of Korea,8.567725,23.276657,45.94,686.2093,6689776000.0,127.04916,11.222035,,211.959549,42.9,5.227575,0.363584,6.666936,1.664138,27.740468,6727035000.0,99.7,3019.616273,338.862395,17487260000.0


In [95]:
# 1 NaN in NRI
# multiple NaNs in undernourishment 
# used median instead mean because undernourishment is a right-skewed distribution, thus the average would be too much influenced by
# countries with large undernourishment
table_2018_preprocessed = table_2018.fillna(table_2018.median())
table_2018_preprocessed.isna().sum()

Variable Name
Agricultural water withdrawal as % of total renewable water resources     0
Agriculture, value added (% GDP)                                          0
Environmental Flow Requirements                                           0
GDP per capita                                                            0
Industry, value added to GDP                                              0
Long-term average annual precipitation in volume                          0
MDG 7.5. Freshwater withdrawal as % of total renewable water resources    0
National Rainfall Index (NRI)                                             0
Population density                                                        0
Prevalence of undernourishment (3-year average)                           0
SDG 6.4.1. Industrial Water Use Efficiency                                0
SDG 6.4.1. Irrigated Agriculture Water Use Efficiency                     0
SDG 6.4.1. Services Water Use Efficiency                                  

In [96]:
table=pd.pivot_table(data=df,values='Value',index=['Area','Year'],columns='Variable Name') # is used for Time Series Analysis
GDP_table = pd.pivot_table(data=df_GDP,values='Value',index=['Area','Year'],columns='Variable Name')

table = table.merge(GDP_table, how='inner', on=['Area','Year'])
table.head(25)

Unnamed: 0_level_0,Variable Name,Agricultural water withdrawal as % of total renewable water resources,"Agriculture, value added (% GDP)",Environmental Flow Requirements,GDP per capita,"Industry, value added to GDP",Long-term average annual precipitation in volume,MDG 7.5. Freshwater withdrawal as % of total renewable water resources,National Rainfall Index (NRI),Population density,Prevalence of undernourishment (3-year average),SDG 6.4.1. Industrial Water Use Efficiency,SDG 6.4.1. Irrigated Agriculture Water Use Efficiency,SDG 6.4.1. Services Water Use Efficiency,SDG 6.4.1. Water Use Efficiency,SDG 6.4.2. Water Stress,"Services, value added to GDP",Total population with access to safe drinking-water (JMP),Total renewable water resources per capita,Total water withdrawal per capita,Gross Domestic Product (GDP)
Area,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,1992.0,35.498101,54.207106,28.29,248.51889,552849000.0,213.48522,35.91048,304.98,22.187829,29.5,25.311413,0.092143,10.237171,0.2404,63.337788,995251000.0,21.3,4510.012947,1619.567311,3444711000.0
Afghanistan,1997.0,31.427856,62.416768,28.29,164.092893,422434100.0,213.48522,31.852952,233.0,29.649735,45.6,9.091408,0.08582,7.763338,0.19459,56.181246,652758000.0,25.0,3374.984489,1075.032191,2924589000.0
Afghanistan,2002.0,30.613807,45.134344,28.29,194.958382,915134500.0,213.48522,31.045462,185.0,34.618096,47.8,11.426285,0.07477,15.252615,0.285607,54.757019,1465904000.0,33.8,2890.609479,899.030431,4141524000.0
Afghanistan,2007.0,30.613807,35.265942,28.29,389.985586,2599214000.0,213.48522,31.045462,185.0,41.510486,33.3,20.815284,0.110353,26.876683,0.549846,54.757019,4027325000.0,42.6,2410.653428,751.752659,9412162000.0
Afghanistan,2012.0,30.613807,28.556142,28.29,694.885618,4363018000.0,213.48522,31.045462,185.0,47.730564,24.7,23.227115,0.103801,47.639557,0.770774,54.757019,10738430000.0,51.6,2096.505623,653.786919,19136500000.0
Afghanistan,2017.0,30.613807,25.57467,28.29,605.557362,4502380000.0,213.48522,31.045462,185.0,55.595553,22.2,27.851985,0.118946,56.384791,0.911431,54.757019,10937780000.0,55.3,1799.917253,561.297018,18623030000.0
Afghanistan,2018.0,30.613807,22.440899,28.29,563.983834,4798615000.0,213.48522,31.045462,185.0,56.937048,23.0,31.220059,0.102132,57.683901,0.935917,54.757019,10639420000.0,55.3,1757.509385,548.072294,17986970000.0
Argentina,1992.0,,5.193747,515.8,7366.759988,60791320000.0,1643.0,3.243937,938.371429,12.059174,,,,,,7.886104,157832000000.0,94.3,26133.540531,847.773175,247929400000.0
Argentina,1997.0,2.455948,4.527197,515.8,8861.217104,67971350000.0,1643.0,3.343179,887.3,12.824568,,29.856348,0.08077,55.788386,12.184452,8.127364,194058000000.0,95.6,24573.84126,822.584545,317532600000.0
Argentina,2002.0,2.588954,8.84866,515.8,2918.591423,27617730000.0,1643.0,3.617493,1062.0,13.552636,3.0,22.932637,0.109619,39.086531,9.249703,8.794229,60613380000.0,96.7,23253.697699,843.627906,110583600000.0


In [97]:
table.isna().sum()

Variable Name
Agricultural water withdrawal as % of total renewable water resources      7
Agriculture, value added (% GDP)                                           0
Environmental Flow Requirements                                            0
GDP per capita                                                             0
Industry, value added to GDP                                               0
Long-term average annual precipitation in volume                           0
MDG 7.5. Freshwater withdrawal as % of total renewable water resources     5
National Rainfall Index (NRI)                                              7
Population density                                                         0
Prevalence of undernourishment (3-year average)                           83
SDG 6.4.1. Industrial Water Use Efficiency                                 8
SDG 6.4.1. Irrigated Agriculture Water Use Efficiency                     43
SDG 6.4.1. Services Water Use Efficiency                      

In [98]:
table_preprocessed = table.groupby('Area').apply(lambda group: group.interpolate(method='linear',axis=1))
table_preprocessed = table_preprocessed.fillna(method='bfill')
table_preprocessed.head(49)

Unnamed: 0_level_0,Variable Name,Agricultural water withdrawal as % of total renewable water resources,"Agriculture, value added (% GDP)",Environmental Flow Requirements,GDP per capita,"Industry, value added to GDP",Long-term average annual precipitation in volume,MDG 7.5. Freshwater withdrawal as % of total renewable water resources,National Rainfall Index (NRI),Population density,Prevalence of undernourishment (3-year average),SDG 6.4.1. Industrial Water Use Efficiency,SDG 6.4.1. Irrigated Agriculture Water Use Efficiency,SDG 6.4.1. Services Water Use Efficiency,SDG 6.4.1. Water Use Efficiency,SDG 6.4.2. Water Stress,"Services, value added to GDP",Total population with access to safe drinking-water (JMP),Total renewable water resources per capita,Total water withdrawal per capita,Gross Domestic Product (GDP)
Area,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,1992.0,35.498101,54.207106,28.29,248.51889,552849000.0,213.48522,35.91048,304.98,22.187829,29.5,25.31141,0.09214305,10.23717,0.2404004,63.33779,995251000.0,21.3,4510.012947,1619.567,3444711000.0
Afghanistan,1997.0,31.427856,62.416768,28.29,164.092893,422434100.0,213.48522,31.852952,233.0,29.649735,45.6,9.091408,0.08582044,7.763338,0.1945898,56.18125,652758000.0,25.0,3374.984489,1075.032,2924589000.0
Afghanistan,2002.0,30.613807,45.134344,28.29,194.958382,915134500.0,213.48522,31.045462,185.0,34.618096,47.8,11.42629,0.0747696,15.25262,0.285607,54.75702,1465904000.0,33.8,2890.609479,899.0304,4141524000.0
Afghanistan,2007.0,30.613807,35.265942,28.29,389.985586,2599214000.0,213.48522,31.045462,185.0,41.510486,33.3,20.81528,0.110353,26.87668,0.5498465,54.75702,4027325000.0,42.6,2410.653428,751.7527,9412162000.0
Afghanistan,2012.0,30.613807,28.556142,28.29,694.885618,4363018000.0,213.48522,31.045462,185.0,47.730564,24.7,23.22711,0.1038009,47.63956,0.7707739,54.75702,10738430000.0,51.6,2096.505623,653.7869,19136500000.0
Afghanistan,2017.0,30.613807,25.57467,28.29,605.557362,4502380000.0,213.48522,31.045462,185.0,55.595553,22.2,27.85198,0.1189456,56.38479,0.9114308,54.75702,10937780000.0,55.3,1799.917253,561.297,18623030000.0
Afghanistan,2018.0,30.613807,22.440899,28.29,563.983834,4798615000.0,213.48522,31.045462,185.0,56.937048,23.0,31.22006,0.1021315,57.6839,0.9359166,54.75702,10639420000.0,55.3,1757.509385,548.0723,17986970000.0
Argentina,1992.0,2.455948,5.193747,515.8,7366.759988,60791320000.0,1643.0,3.243937,938.371429,12.059174,11.36366,10.66815,9.972639,9.277127,8.581616,7.886104,157832000000.0,94.3,26133.540531,847.7732,247929400000.0
Argentina,1997.0,2.455948,4.527197,515.8,8861.217104,67971350000.0,1643.0,3.343179,887.3,12.824568,21.34046,29.85635,0.08076985,55.78839,12.18445,8.127364,194058000000.0,95.6,24573.84126,822.5845,317532600000.0
Argentina,2002.0,2.588954,8.84866,515.8,2918.591423,27617730000.0,1643.0,3.617493,1062.0,13.552636,3.0,22.93264,0.1096187,39.08653,9.249703,8.794229,60613380000.0,96.7,23253.697699,843.6279,110583600000.0


In [99]:
table_preprocessed.isna().sum()

Variable Name
Agricultural water withdrawal as % of total renewable water resources     0
Agriculture, value added (% GDP)                                          0
Environmental Flow Requirements                                           0
GDP per capita                                                            0
Industry, value added to GDP                                              0
Long-term average annual precipitation in volume                          0
MDG 7.5. Freshwater withdrawal as % of total renewable water resources    0
National Rainfall Index (NRI)                                             0
Population density                                                        0
Prevalence of undernourishment (3-year average)                           0
SDG 6.4.1. Industrial Water Use Efficiency                                0
SDG 6.4.1. Irrigated Agriculture Water Use Efficiency                     0
SDG 6.4.1. Services Water Use Efficiency                                  