# Import climate factors

## Load Modules

In [1]:
import pandas as pd
import numpy as np
import re
import country_converter as coco

## Rainfall

#### Load database

In [2]:
rainfall = pd.read_csv('raw data/WORLDBANK_rainfall.csv',skipinitialspace=True).rename(columns={'Rainfall - (MM)':'Total Rainfall (mm)'})
rainfall.head(12)

Unnamed: 0,Total Rainfall (mm),Year,Statistics,Country
0,64.7765,1991,Jan Average,Afghanistan
1,59.4025,1991,Feb Average,Afghanistan
2,119.625,1991,Mar Average,Afghanistan
3,51.8025,1991,Apr Average,Afghanistan
4,57.2438,1991,May Average,Afghanistan
5,5.58788,1991,Jun Average,Afghanistan
6,4.39142,1991,Jul Average,Afghanistan
7,4.66582,1991,Aug Average,Afghanistan
8,12.6348,1991,Sep Average,Afghanistan
9,4.09568,1991,Oct Average,Afghanistan


#### Set universal country codes

In [3]:
rainfall.replace(to_replace = 'Congo (Republic of the)', value = 'Congo', inplace = True)
rainfall['Country'] = coco.convert(names=rainfall['Country'], to='ISO3')
rainfall.head()

Unnamed: 0,Total Rainfall (mm),Year,Statistics,Country
0,64.7765,1991,Jan Average,AFG
1,59.4025,1991,Feb Average,AFG
2,119.625,1991,Mar Average,AFG
3,51.8025,1991,Apr Average,AFG
4,57.2438,1991,May Average,AFG


#### Clean rainfall

In [4]:
rainfall = rainfall.rename(columns={'Statistics':'Month'})
rainfall['Month'] = rainfall['Month'].map(lambda x: x.split(' ')[0])
rainfall = rainfall[['Country','Year','Month','Total Rainfall (mm)']]
rainfall.head()

Unnamed: 0,Country,Year,Month,Total Rainfall (mm)
0,AFG,1991,Jan,64.7765
1,AFG,1991,Feb,59.4025
2,AFG,1991,Mar,119.625
3,AFG,1991,Apr,51.8025
4,AFG,1991,May,57.2438


#### Annual rainfall

In [5]:
annual_rainfall = rainfall.groupby(['Country','Year']).sum()
annual_rainfall.reset_index(inplace=True)
annual_rainfall.head()

Unnamed: 0,Country,Year,Total Rainfall (mm)
0,AFG,1991,435.4499
1,AFG,1992,408.15623
2,AFG,1993,317.0853
3,AFG,1994,342.22238
4,AFG,1995,300.89815


#### Average annual rainfall period 2013-2017

In [6]:
average_annual_rainfall = annual_rainfall[(annual_rainfall['Year'] > 2012) & (annual_rainfall['Year'] < 2016)].groupby('Country')['Total Rainfall (mm)'].mean()
average_annual_rainfall.head()

Country
AFG     353.323550
AGO     958.376457
ALB    1074.030807
AND     791.158287
ARE      70.675403
Name: Total Rainfall (mm), dtype: float64

## Temperature

#### Load Database

In [7]:
temperature=pd.read_csv('raw data/WORLDBANK_temperature.csv',skipinitialspace=True).rename(columns={'Temperature - (Celsius)':'Temperature (°C)'})
temperature.head()

Unnamed: 0,Temperature (°C),Year,Statistics,Country
0,-0.0311,1991,Jan Average,Afghanistan
1,1.43654,1991,Feb Average,Afghanistan
2,6.88685,1991,Mar Average,Afghanistan
3,12.9397,1991,Apr Average,Afghanistan
4,17.0755,1991,May Average,Afghanistan


#### Set universal country codes

In [8]:
temperature.replace(to_replace = 'Congo (Republic of the)', value = 'Congo', inplace = True)
temperature['Country'] = coco.convert(names=temperature['Country'], to='ISO3')
temperature.head()

Unnamed: 0,Temperature (°C),Year,Statistics,Country
0,-0.0311,1991,Jan Average,AFG
1,1.43654,1991,Feb Average,AFG
2,6.88685,1991,Mar Average,AFG
3,12.9397,1991,Apr Average,AFG
4,17.0755,1991,May Average,AFG


**Clean temperature**

In [9]:
temperature = temperature.rename(columns={'Statistics':'Month'})
temperature = temperature[['Country','Year','Month','Temperature (°C)']]
temperature['Month'] = temperature['Month'].map(lambda x: x.split(' ')[0])
temperature.head()

Unnamed: 0,Country,Year,Month,Temperature (°C)
0,AFG,1991,Jan,-0.0311
1,AFG,1991,Feb,1.43654
2,AFG,1991,Mar,6.88685
3,AFG,1991,Apr,12.9397
4,AFG,1991,May,17.0755


#### Average annual temperature

add weights based on days in a month

In [10]:
weights = {'Jan':31, 'Feb':28, 'Mar':31, 'Apr':30, 'May':31, 'Jun':30,'Jul':31, 'Aug':31, 'Sep':30, 'Oct':31, 'Nov':30, 'Dec':31}
temperature['Weight'] = temperature['Month'].map(weights)
temperature.head()

Unnamed: 0,Country,Year,Month,Temperature (°C),Weight
0,AFG,1991,Jan,-0.0311,31
1,AFG,1991,Feb,1.43654,28
2,AFG,1991,Mar,6.88685,31
3,AFG,1991,Apr,12.9397,30
4,AFG,1991,May,17.0755,31


In [11]:
annual_temperature = temperature.groupby(['Country','Year']).apply(lambda x: (x['Temperature (°C)'] * x['Weight']).sum() / x['Weight'].sum())
annual_temperature = pd.DataFrame(annual_temperature).rename(columns={0:'Temperature (°C)'})
annual_temperature.reset_index(inplace=True)
annual_temperature.head()

Unnamed: 0,Country,Year,Temperature (°C)
0,AFG,1991,12.958152
1,AFG,1992,12.598444
2,AFG,1993,13.016964
3,AFG,1994,13.150314
4,AFG,1995,13.095001


#### Average annual temperature 2013-2017

In [12]:
average_annual_temperature = annual_temperature[(annual_temperature['Year'] > 2012) & (annual_temperature['Year'] < 2017)].groupby("Country")['Temperature (°C)'].mean()
average_annual_temperature.head()

Country
AFG    14.074742
AGO    22.182196
ALB    12.754647
AND    12.402212
ARE    28.010773
Name: Temperature (°C), dtype: float64

## Water inflow

- **Total Internal Renewable Water Resources (IRWR)**: Long-term average annual flow of rivers and recharge of aquifers generated from endogenous precipitation. Double counting of surface water and groundwater resources is avoided by deducting the overlap from the sum of the surface water and groundwater resources. --> should correspond more or less with precipitation data
- **Total External Renewable Water Resources (ERWR)**: That part of the country's long-term average annual renewable water resources that are not generated in the country. It includes inflows from upstream countries (groundwater and surface water), and part of the water of border lakes and/or rivers. ERWR take into account the quantity of flow reserved by upstream (incoming flow) and/or downstream (outflow) countries through formal or informal agreements or treaties. Therefore, it may vary with time. In extreme cases, it may be negative when the flow reserved to downstream countries is more than the incoming flow.
- **Total Renewable Water Resources**: The sum of internal renewable water resources (IRWR) and external renewable water resources (ERWR). It corresponds to the maximum theoretical yearly amount of water available for a country at a given moment.
- **Dependency ratio**: Indicator expressing the percent of total renewable water resources originating outside the country. This indicator may theoretically vary between 0% and 100%. A country with a dependency ratio equal to 0% does not receive any water from neighbouring countries. A country with a dependency ratio equal to 100% receives all its renewable water from upstream countries, without producing any of its own. This indicator does not consider the possible allocation of water to downstream countries. 
- **Total Exploitable Water Resources** (also called manageable water resources or water development potential) are considered to be available for development, taking into consideration factors such as: the economic and environmental feasibility of storing floodwater behind dams, extracting groundwater, the physical possibility of storing water that naturally flows out to the sea, and minimum flow requirements (navigation, environmental services, aquatic life, etc). Methods to assess exploitable water resources vary from country to country. --> maybe not as interesting as this allready takes into account socio-economic factors

#### Load database

In [13]:
water_resources = pd.read_csv('raw data/AQUASTAT_water_resources.csv',nrows=835,index_col=False).rename(columns={'Area':'Country'})
water_resources.head()

Unnamed: 0,Country,Area Id,Variable Name,Variable Id,Year,Value,Symbol,Md
0,Afghanistan,4,Total internal renewable water resources (IRWR),4157,2017,47.15,E,
1,Afghanistan,4,Water resources: total external renewable,4182,2017,18.18,E,
2,Afghanistan,4,Total renewable water resources,4188,2017,65.33,E,
3,Afghanistan,4,Dependency ratio,4192,2017,28.7226,E,
4,Albania,8,Total internal renewable water resources (IRWR),4157,2017,26.9,E,


#### Set universal country codes 

In [14]:
water_resources.replace(to_replace = 'Grenade', value = 'Grenada', inplace = True)
water_resources['Country'] = coco.convert(names=water_resources['Country'], to='ISO3')
water_resources.head()

Unnamed: 0,Country,Area Id,Variable Name,Variable Id,Year,Value,Symbol,Md
0,AFG,4,Total internal renewable water resources (IRWR),4157,2017,47.15,E,
1,AFG,4,Water resources: total external renewable,4182,2017,18.18,E,
2,AFG,4,Total renewable water resources,4188,2017,65.33,E,
3,AFG,4,Dependency ratio,4192,2017,28.7226,E,
4,ALB,8,Total internal renewable water resources (IRWR),4157,2017,26.9,E,


pivot table and rename + sort columns

In [15]:
water_resources = water_resources.pivot(index='Country', columns='Variable Name', values='Value').rename(columns={'Water resources: total external renewable':'Total external renewable water resources (ERWR)'})
water_resources = water_resources[['Total internal renewable water resources (IRWR)','Total external renewable water resources (ERWR)','Total renewable water resources','Dependency ratio','Total exploitable water resources']]
water_resources.head()

Variable Name,Total internal renewable water resources (IRWR),Total external renewable water resources (ERWR),Total renewable water resources,Dependency ratio,Total exploitable water resources
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AFG,47.15,18.18,65.33,28.7226,
AGO,148.0,0.4,148.4,0.269542,
ALB,26.9,3.3,30.2,10.927152,13.0
AND,0.3156,,0.3156,,
ARE,0.15,0.0,0.15,0.0,


## Merge Climate factors - 2013-2017

In [16]:
climate_factors = pd.merge(average_annual_temperature,average_annual_rainfall,left_index=True,right_index=True)
climate_factors = pd.merge(climate_factors,water_resources,left_index=True,right_index=True,how='outer')
climate_factors.reset_index(inplace=True)
climate_factors.head()

Unnamed: 0,Country,Temperature (°C),Total Rainfall (mm),Total internal renewable water resources (IRWR),Total external renewable water resources (ERWR),Total renewable water resources,Dependency ratio,Total exploitable water resources
0,AFG,14.074742,353.32355,47.15,18.18,65.33,28.7226,
1,AGO,22.182196,958.376457,148.0,0.4,148.4,0.269542,
2,ALB,12.754647,1074.030807,26.9,3.3,30.2,10.927152,13.0
3,AND,12.402212,791.158287,0.3156,,0.3156,,
4,ARE,28.010773,70.675403,0.15,0.0,0.15,0.0,


In [17]:
climate_factors.isna().sum()

Country                                              0
Temperature (°C)                                     7
Total Rainfall (mm)                                  7
Total internal renewable water resources (IRWR)     19
Total external renewable water resources (ERWR)      7
Total renewable water resources                      5
Dependency ratio                                     9
Total exploitable water resources                  138
dtype: int64

#### export to csv

In [18]:
climate_factors.to_csv('clean data/climate_factors.csv')

## Left-overs

In [19]:
print(len(set(average_annual_temperature.index.tolist()) - set(average_annual_rainfall.index.tolist())))
set(average_annual_temperature.index.tolist()) - set(average_annual_rainfall.index.tolist())
# all names here match :-)

0


set()

In [20]:
print(len(climate_factors),len(average_annual_rainfall),len(average_annual_temperature))

202 195 195


In [21]:
climate_factors.index

RangeIndex(start=0, stop=202, step=1)

In [22]:
water_resources.index

Index(['AFG', 'AGO', 'ALB', 'AND', 'ARE', 'ARG', 'ARM', 'ATG', 'AUS', 'AUT',
       ...
       'VAT', 'VCT', 'VEN', 'VNM', 'VUT', 'WSM', 'YEM', 'ZAF', 'ZMB', 'ZWE'],
      dtype='object', name='Country', length=197)

In [23]:
climate_factors1 = pd.merge(climate_factors,water_resources,left_index=True,right_index=True,how='outer')
climate_factors1.head()

Unnamed: 0,Country,Temperature (°C),Total Rainfall (mm),Total internal renewable water resources (IRWR)_x,Total external renewable water resources (ERWR)_x,Total renewable water resources_x,Dependency ratio_x,Total exploitable water resources_x,Total internal renewable water resources (IRWR)_y,Total external renewable water resources (ERWR)_y,Total renewable water resources_y,Dependency ratio_y,Total exploitable water resources_y
0,AFG,14.074742,353.32355,47.15,18.18,65.33,28.7226,,,,,,
1,AGO,22.182196,958.376457,148.0,0.4,148.4,0.269542,,,,,,
2,ALB,12.754647,1074.030807,26.9,3.3,30.2,10.927152,13.0,,,,,
3,AND,12.402212,791.158287,0.3156,,0.3156,,,,,,,
4,ARE,28.010773,70.675403,0.15,0.0,0.15,0.0,,,,,,


In [24]:
len(set(climate_factors.index.tolist()) - set(water_resources.index.tolist()))
set(climate_factors.index.tolist()) - set(water_resources.index.tolist())
# 32 countries with different names, shown here

{0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [25]:
namematches = list(set(climate_factors.index.tolist()) & set(water_resources.index.tolist()))

In [26]:
climate_factors2 = pd.merge(climate_factors.loc[namematches],water_resources.loc[namematches],left_index=True,right_index=True,how='outer')
climate_factors2.head()

Unnamed: 0,Country,Temperature (°C),Total Rainfall (mm),Total internal renewable water resources (IRWR)_x,Total external renewable water resources (ERWR)_x,Total renewable water resources_x,Dependency ratio_x,Total exploitable water resources_x,Total internal renewable water resources (IRWR)_y,Total external renewable water resources (ERWR)_y,Total renewable water resources_y,Dependency ratio_y,Total exploitable water resources_y


In [27]:
print(len(climate_factors1))
print(len(climate_factors2))
print(len(climate_factors))

399
0
202


In [28]:
climate_factors2.isnull().sum() # no missingness anywhere


Country                                              0.0
Temperature (°C)                                     0.0
Total Rainfall (mm)                                  0.0
Total internal renewable water resources (IRWR)_x    0.0
Total external renewable water resources (ERWR)_x    0.0
Total renewable water resources_x                    0.0
Dependency ratio_x                                   0.0
Total exploitable water resources_x                  0.0
Total internal renewable water resources (IRWR)_y    0.0
Total external renewable water resources (ERWR)_y    0.0
Total renewable water resources_y                    0.0
Dependency ratio_y                                   0.0
Total exploitable water resources_y                  0.0
dtype: float64

In [29]:
diffset1 = np.setdiff1d(climate_factors.index,water_resources.index) # elements in climate not in water
print(diffset1)

[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201]


  mask &= (ar1 != a)


In [30]:
diffset2 = np.setdiff1d(water_resources.index,climate_factors.index) # elements in water not in climate
print(diffset2)

['AFG' 'AGO' 'ALB' 'AND' 'ARE' 'ARG' 'ARM' 'ATG' 'AUS' 'AUT' 'AZE' 'BDI'
 'BEL' 'BEN' 'BFA' 'BGD' 'BGR' 'BHR' 'BHS' 'BIH' 'BLR' 'BLZ' 'BOL' 'BRA'
 'BRB' 'BRN' 'BTN' 'BWA' 'CAF' 'CAN' 'CHE' 'CHL' 'CHN' 'CIV' 'CMR' 'COD'
 'COG' 'COK' 'COL' 'COM' 'CPV' 'CRI' 'CUB' 'CYP' 'CZE' 'DEU' 'DJI' 'DMA'
 'DNK' 'DOM' 'DZA' 'ECU' 'EGY' 'ERI' 'ESP' 'EST' 'ETH' 'FIN' 'FJI' 'FRA'
 'FRO' 'FSM' 'GAB' 'GBR' 'GEO' 'GHA' 'GIN' 'GMB' 'GNB' 'GNQ' 'GRC' 'GRD'
 'GTM' 'GUY' 'HND' 'HRV' 'HTI' 'HUN' 'IDN' 'IND' 'IRL' 'IRN' 'IRQ' 'ISL'
 'ISR' 'ITA' 'JAM' 'JOR' 'JPN' 'KAZ' 'KEN' 'KGZ' 'KHM' 'KIR' 'KNA' 'KOR'
 'KWT' 'LAO' 'LBN' 'LBR' 'LBY' 'LCA' 'LIE' 'LKA' 'LSO' 'LTU' 'LUX' 'LVA'
 'MAR' 'MDA' 'MDG' 'MDV' 'MEX' 'MHL' 'MKD' 'MLI' 'MLT' 'MMR' 'MNG' 'MOZ'
 'MRT' 'MUS' 'MWI' 'MYS' 'NAM' 'NER' 'NGA' 'NIC' 'NIU' 'NLD' 'NOR' 'NPL'
 'NRU' 'NZL' 'OMN' 'PAK' 'PAN' 'PER' 'PHL' 'PLW' 'PNG' 'POL' 'PRI' 'PRK'
 'PRT' 'PRY' 'PSE' 'QAT' 'ROU' 'RUS' 'RWA' 'SAU' 'SDN' 'SEN' 'SGP' 'SLB'
 'SLE' 'SLV' 'SOM' 'SRB' 'SSD' 'STP' 'SUR' 'SVK' 'S

In [31]:
diffset2[0]

'AFG'

In [32]:
test = re.sub("[\(\[].*?[\)\]]", "", diffset2[0])

In [33]:
re.sub("[ \t]+$","",test)

'AFG'

In [34]:
for country in range(len(diffset2)):
  temp1 = re.sub("[\(\[].*?[\)\]]", "", diffset2[country])
  diffset2[country] = re.sub("[ \t]+$","",temp1)
    
for country in range(len(diffset1)):
  temp1 = re.sub("[\(\[].*?[\)\]]", "", diffset1[country])
  diffset1[country] = re.sub("[ \t]+$","",temp1)

TypeError: cannot use a string pattern on a bytes-like object

In [None]:
np.setdiff1d(diffset1,diffset2) # elements in climate not in water
