# Import climate factors

## Load Modules

In [1]:
import pandas as pd
import numpy as np
import re
import country_converter as coco

## Rainfall

#### Load database

In [2]:
rainfall = pd.read_csv('raw data/WORLDBANK_rainfall.csv',skipinitialspace=True).rename(columns={'Rainfall - (MM)':'Total Rainfall (mm)'})
rainfall.head(12)

Unnamed: 0,Total Rainfall (mm),Year,Statistics,Country
0,64.7765,1991,Jan Average,Afghanistan
1,59.4025,1991,Feb Average,Afghanistan
2,119.625,1991,Mar Average,Afghanistan
3,51.8025,1991,Apr Average,Afghanistan
4,57.2438,1991,May Average,Afghanistan
5,5.58788,1991,Jun Average,Afghanistan
6,4.39142,1991,Jul Average,Afghanistan
7,4.66582,1991,Aug Average,Afghanistan
8,12.6348,1991,Sep Average,Afghanistan
9,4.09568,1991,Oct Average,Afghanistan


#### Set universal country codes

In [3]:
rainfall.replace(to_replace = 'Congo (Republic of the)', value = 'Congo', inplace = True)
rainfall['Country'] = coco.convert(names=rainfall['Country'], to='ISO3')
rainfall.head()

Unnamed: 0,Total Rainfall (mm),Year,Statistics,Country
0,64.7765,1991,Jan Average,AFG
1,59.4025,1991,Feb Average,AFG
2,119.625,1991,Mar Average,AFG
3,51.8025,1991,Apr Average,AFG
4,57.2438,1991,May Average,AFG


#### Monthly rainfall

In [4]:
monthly_rainfall = rainfall[['Country','Year','Statistics','Total Rainfall (mm)']]
monthly_rainfall = monthly_rainfall.rename(columns={'Statistics':'Month'})
monthly_rainfall['Month'] = monthly_rainfall['Month'].map(lambda x: x.split(' ')[0])
monthly_rainfall.set_index(['Country','Year','Month'],inplace=True)
monthly_rainfall.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Total Rainfall (mm)
Country,Year,Month,Unnamed: 3_level_1
AFG,1991,Jan,64.7765
AFG,1991,Feb,59.4025
AFG,1991,Mar,119.625
AFG,1991,Apr,51.8025
AFG,1991,May,57.2438


#### Annual rainfall

In [5]:
annual_rainfall = monthly_rainfall.groupby(['Country','Year']).sum()
annual_rainfall.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Total Rainfall (mm)
Country,Year,Unnamed: 2_level_1
AFG,1991,435.4499
AFG,1992,408.15623
AFG,1993,317.0853
AFG,1994,342.22238
AFG,1995,300.89815


#### Average annual rainfall last 25 years

In [6]:
average_annual_rainfall = annual_rainfall.groupby(['Country']).mean()
average_annual_rainfall["Standard Deviation"] = annual_rainfall.groupby("Country").std()
average_annual_rainfall.head()

Unnamed: 0_level_0,Total Rainfall (mm),Standard Deviation
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
AFG,322.119242,56.667164
AGO,966.806905,70.556696
ALB,992.182656,150.829612
AND,757.658275,127.729671
ARE,56.646966,30.300352


## Temperature

#### Load Database

In [7]:
temperature=pd.read_csv('raw data/WORLDBANK_temperature.csv',skipinitialspace=True).rename(columns={'Temperature - (Celsius)':'Temperature (°C)'})
temperature.head()

Unnamed: 0,Temperature (°C),Year,Statistics,Country
0,-0.0311,1991,Jan Average,Afghanistan
1,1.43654,1991,Feb Average,Afghanistan
2,6.88685,1991,Mar Average,Afghanistan
3,12.9397,1991,Apr Average,Afghanistan
4,17.0755,1991,May Average,Afghanistan


#### Set universal country codes

In [8]:
temperature.replace(to_replace = 'Congo (Republic of the)', value = 'Congo', inplace = True)
temperature['Country'] = coco.convert(names=temperature['Country'], to='ISO3')
temperature.head()

Unnamed: 0,Temperature (°C),Year,Statistics,Country
0,-0.0311,1991,Jan Average,AFG
1,1.43654,1991,Feb Average,AFG
2,6.88685,1991,Mar Average,AFG
3,12.9397,1991,Apr Average,AFG
4,17.0755,1991,May Average,AFG


#### Monthly Temperature

add weights based on days in a month

In [9]:
monthly_temperature = temperature[['Country','Year','Statistics','Temperature (°C)']]
monthly_temperature = monthly_temperature.rename(columns={'Statistics':'Month'})
monthly_temperature['Month'] = monthly_temperature['Month'].map(lambda x: x.split(' ')[0])
weights = {'Jan':31, 'Feb':28, 'Mar':31, 'Apr':30, 'May':31, 'Jun':30,'Jul':31, 'Aug':31, 'Sep':30, 'Oct':31, 'Nov':30, 'Dec':31}
monthly_temperature['Weight'] = monthly_temperature['Month'].map(weights)
monthly_temperature.set_index(['Country','Year','Month'],inplace=True)
monthly_temperature.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Temperature (°C),Weight
Country,Year,Month,Unnamed: 3_level_1,Unnamed: 4_level_1
AFG,1991,Jan,-0.0311,31
AFG,1991,Feb,1.43654,28
AFG,1991,Mar,6.88685,31
AFG,1991,Apr,12.9397,30
AFG,1991,May,17.0755,31


#### Average annual temperature

In [10]:
annual_temperature = monthly_temperature.groupby(['Country','Year']).apply(lambda x: (x['Temperature (°C)'] * x['Weight']).sum() / x['Weight'].sum())
annual_temperature = pd.DataFrame(annual_temperature, columns = ["Average Temperature (°C)"])
annual_temperature.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Average Temperature (°C)
Country,Year,Unnamed: 2_level_1
AFG,1991,12.958152
AFG,1992,12.598444
AFG,1993,13.016964
AFG,1994,13.150314
AFG,1995,13.095001


#### Average annual temperature last 25 years

In [11]:
average_annual_temperature = annual_temperature.groupby("Country").mean()
average_annual_temperature["Standard Deviation"] = annual_temperature.groupby("Country").std()
average_annual_temperature.head()

Unnamed: 0_level_0,Average Temperature (°C),Standard Deviation
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
AFG,13.604332,0.521565
AGO,22.126307,0.293815
ALB,12.156846,0.532746
AND,11.998357,0.503757
ARE,27.596885,0.517522


## Water inflow

- **Total Internal Renewable Water Resources (IRWR)**: Long-term average annual flow of rivers and recharge of aquifers generated from endogenous precipitation. Double counting of surface water and groundwater resources is avoided by deducting the overlap from the sum of the surface water and groundwater resources. --> should correspond more or less with precipitation data
- **Total External Renewable Water Resources (ERWR)**: That part of the country's long-term average annual renewable water resources that are not generated in the country. It includes inflows from upstream countries (groundwater and surface water), and part of the water of border lakes and/or rivers. ERWR take into account the quantity of flow reserved by upstream (incoming flow) and/or downstream (outflow) countries through formal or informal agreements or treaties. Therefore, it may vary with time. In extreme cases, it may be negative when the flow reserved to downstream countries is more than the incoming flow.
- **Total Renewable Water Resources**: The sum of internal renewable water resources (IRWR) and external renewable water resources (ERWR). It corresponds to the maximum theoretical yearly amount of water available for a country at a given moment.
- **Dependency ratio**: Indicator expressing the percent of total renewable water resources originating outside the country. This indicator may theoretically vary between 0% and 100%. A country with a dependency ratio equal to 0% does not receive any water from neighbouring countries. A country with a dependency ratio equal to 100% receives all its renewable water from upstream countries, without producing any of its own. This indicator does not consider the possible allocation of water to downstream countries. 
- **Total Exploitable Water Resources** (also called manageable water resources or water development potential) are considered to be available for development, taking into consideration factors such as: the economic and environmental feasibility of storing floodwater behind dams, extracting groundwater, the physical possibility of storing water that naturally flows out to the sea, and minimum flow requirements (navigation, environmental services, aquatic life, etc). Methods to assess exploitable water resources vary from country to country. --> maybe not as interesting as this allready takes into account socio-economic factors

#### Load database

In [12]:
water_resources = pd.read_csv('raw data/AQUASTAT_water_resources.csv',nrows=835,index_col=False).rename(columns={'Area':'Country'})
water_resources.head()

Unnamed: 0,Country,Area Id,Variable Name,Variable Id,Year,Value,Symbol,Md
0,Afghanistan,4,Total internal renewable water resources (IRWR),4157,2017,47.15,E,
1,Afghanistan,4,Water resources: total external renewable,4182,2017,18.18,E,
2,Afghanistan,4,Total renewable water resources,4188,2017,65.33,E,
3,Afghanistan,4,Dependency ratio,4192,2017,28.7226,E,
4,Albania,8,Total internal renewable water resources (IRWR),4157,2017,26.9,E,


#### Set universal country codes 

In [13]:
water_resources.replace(to_replace = 'Grenade', value = 'Grenada', inplace = True)
water_resources['Country'] = coco.convert(names=water_resources['Country'], to='ISO3')
water_resources.head()

Unnamed: 0,Country,Area Id,Variable Name,Variable Id,Year,Value,Symbol,Md
0,AFG,4,Total internal renewable water resources (IRWR),4157,2017,47.15,E,
1,AFG,4,Water resources: total external renewable,4182,2017,18.18,E,
2,AFG,4,Total renewable water resources,4188,2017,65.33,E,
3,AFG,4,Dependency ratio,4192,2017,28.7226,E,
4,ALB,8,Total internal renewable water resources (IRWR),4157,2017,26.9,E,


pivot table and rename + sort columns

In [14]:
water_resources = water_resources.pivot(index='Country', columns='Variable Name', values='Value').rename(columns={'Water resources: total external renewable':'Total external renewable water resources (ERWR)'})
water_resources = water_resources[['Total internal renewable water resources (IRWR)','Total external renewable water resources (ERWR)','Total renewable water resources','Dependency ratio','Total exploitable water resources']]
water_resources.head()

Variable Name,Total internal renewable water resources (IRWR),Total external renewable water resources (ERWR),Total renewable water resources,Dependency ratio,Total exploitable water resources
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AFG,47.15,18.18,65.33,28.7226,
AGO,148.0,0.4,148.4,0.269542,
ALB,26.9,3.3,30.2,10.927152,13.0
AND,0.3156,,0.3156,,
ARE,0.15,0.0,0.15,0.0,


## Merge Climate factors

In [15]:
climate_factors = pd.merge(average_annual_temperature,average_annual_rainfall,left_index=True,right_index=True)
climate_factors = pd.merge(climate_factors,water_resources,left_index=True,right_index=True,how='outer')
climate_factors.drop(['Standard Deviation_x','Standard Deviation_y'],axis=1,inplace=True)
climate_factors.head()

Unnamed: 0_level_0,Average Temperature (°C),Total Rainfall (mm),Total internal renewable water resources (IRWR),Total external renewable water resources (ERWR),Total renewable water resources,Dependency ratio,Total exploitable water resources
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AFG,13.604332,322.119242,47.15,18.18,65.33,28.7226,
AGO,22.126307,966.806905,148.0,0.4,148.4,0.269542,
ALB,12.156846,992.182656,26.9,3.3,30.2,10.927152,13.0
AND,11.998357,757.658275,0.3156,,0.3156,,
ARE,27.596885,56.646966,0.15,0.0,0.15,0.0,


In [16]:
climate_factors.isna().sum()

Average Temperature (°C)                             7
Total Rainfall (mm)                                  7
Total internal renewable water resources (IRWR)     19
Total external renewable water resources (ERWR)      7
Total renewable water resources                      5
Dependency ratio                                     9
Total exploitable water resources                  138
dtype: int64

#### export to csv

In [17]:
climate_factors.to_csv('clean data/climate_factors.csv')

## Left-overs

In [18]:
print(len(set(average_annual_temperature.index.tolist()) - set(average_annual_rainfall.index.tolist())))
set(average_annual_temperature.index.tolist()) - set(average_annual_rainfall.index.tolist())
# all names here match :-)

0


set()

In [19]:
print(len(climate_factors),len(average_annual_rainfall),len(average_annual_temperature))

202 195 195


In [20]:
climate_factors.index

Index(['AFG', 'AGO', 'ALB', 'AND', 'ARE', 'ARG', 'ARM', 'ATG', 'AUS', 'AUT',
       ...
       'VAT', 'VCT', 'VEN', 'VNM', 'VUT', 'WSM', 'YEM', 'ZAF', 'ZMB', 'ZWE'],
      dtype='object', name='Country', length=202)

In [21]:
water_resources.index

Index(['AFG', 'AGO', 'ALB', 'AND', 'ARE', 'ARG', 'ARM', 'ATG', 'AUS', 'AUT',
       ...
       'VAT', 'VCT', 'VEN', 'VNM', 'VUT', 'WSM', 'YEM', 'ZAF', 'ZMB', 'ZWE'],
      dtype='object', name='Country', length=197)

In [22]:
climate_factors1 = pd.merge(climate_factors,water_resources,left_index=True,right_index=True,how='outer')
climate_factors1.head()

Unnamed: 0_level_0,Average Temperature (°C),Total Rainfall (mm),Total internal renewable water resources (IRWR)_x,Total external renewable water resources (ERWR)_x,Total renewable water resources_x,Dependency ratio_x,Total exploitable water resources_x,Total internal renewable water resources (IRWR)_y,Total external renewable water resources (ERWR)_y,Total renewable water resources_y,Dependency ratio_y,Total exploitable water resources_y
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AFG,13.604332,322.119242,47.15,18.18,65.33,28.7226,,47.15,18.18,65.33,28.7226,
AGO,22.126307,966.806905,148.0,0.4,148.4,0.269542,,148.0,0.4,148.4,0.269542,
ALB,12.156846,992.182656,26.9,3.3,30.2,10.927152,13.0,26.9,3.3,30.2,10.927152,13.0
AND,11.998357,757.658275,0.3156,,0.3156,,,0.3156,,0.3156,,
ARE,27.596885,56.646966,0.15,0.0,0.15,0.0,,0.15,0.0,0.15,0.0,


In [23]:
len(set(climate_factors.index.tolist()) - set(water_resources.index.tolist()))
set(climate_factors.index.tolist()) - set(water_resources.index.tolist())
# 32 countries with different names, shown here

{'GRL', 'MCO', 'MNE', 'MNP', 'NCL'}

In [24]:
namematches = list(set(climate_factors.index.tolist()) & set(water_resources.index.tolist()))

In [25]:
climate_factors2 = pd.merge(climate_factors.loc[namematches],water_resources.loc[namematches],left_index=True,right_index=True,how='outer')
climate_factors2.head()

Unnamed: 0_level_0,Average Temperature (°C),Total Rainfall (mm),Total internal renewable water resources (IRWR)_x,Total external renewable water resources (ERWR)_x,Total renewable water resources_x,Dependency ratio_x,Total exploitable water resources_x,Total internal renewable water resources (IRWR)_y,Total external renewable water resources (ERWR)_y,Total renewable water resources_y,Dependency ratio_y,Total exploitable water resources_y
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ATG,26.163484,2566.520596,0.052,0.0,0.052,0.0,,0.052,0.0,0.052,0.0,
COL,24.730581,2652.839627,2145.0,215.0,2360.0,9.110169,,2145.0,215.0,2360.0,9.110169,
VEN,25.86261,1960.869019,805.0,520.0,1325.0,39.245283,,805.0,520.0,1325.0,39.245283,
LBR,25.706394,2486.609153,200.0,32.0,232.0,13.793103,,200.0,32.0,232.0,13.793103,
AUT,7.25085,1179.828113,55.0,22.7,77.7,29.214929,93.0,55.0,22.7,77.7,29.214929,93.0


In [26]:
print(len(climate_factors1))
print(len(climate_factors2))
print(len(climate_factors))

202
197
202


In [27]:
climate_factors2.isnull().sum() # no missingness anywhere


Average Temperature (°C)                               7
Total Rainfall (mm)                                    7
Total internal renewable water resources (IRWR)_x     14
Total external renewable water resources (ERWR)_x      2
Total renewable water resources_x                      0
Dependency ratio_x                                     4
Total exploitable water resources_x                  133
Total internal renewable water resources (IRWR)_y     14
Total external renewable water resources (ERWR)_y      2
Total renewable water resources_y                      0
Dependency ratio_y                                     4
Total exploitable water resources_y                  133
dtype: int64

In [28]:
diffset1 = np.setdiff1d(climate_factors.index,water_resources.index) # elements in climate not in water
print(diffset1)

['GRL' 'MCO' 'MNE' 'MNP' 'NCL']


In [29]:
diffset2 = np.setdiff1d(water_resources.index,climate_factors.index) # elements in water not in climate
print(diffset2)

[]


In [30]:
diffset2[0]

IndexError: index 0 is out of bounds for axis 0 with size 0

In [None]:
test = re.sub("[\(\[].*?[\)\]]", "", diffset2[0])

In [None]:
re.sub("[ \t]+$","",test)

In [None]:
for country in range(len(diffset2)):
  temp1 = re.sub("[\(\[].*?[\)\]]", "", diffset2[country])
  diffset2[country] = re.sub("[ \t]+$","",temp1)
    
for country in range(len(diffset1)):
  temp1 = re.sub("[\(\[].*?[\)\]]", "", diffset1[country])
  diffset1[country] = re.sub("[ \t]+$","",temp1)

In [None]:
np.setdiff1d(diffset1,diffset2) # elements in climate not in water
