## Libraries

In [None]:
import sys
import os
import datetime as dt
import numpy as np
import pandas as pd
import geopandas as gpd
import altair as alt
from vega_datasets import data

In [None]:
# Resolve the project root once. A `mainDir` that already exists in the
# notebook's globals is left untouched so the cell can be re-run safely.
if 'mainDir' not in globals():
    # NOTE(review): presumably the notebook lives one level below the project
    # root, hence the parent of the current working directory is used.
    mainDir = os.path.dirname(os.getcwd())
print(mainDir)

In [None]:
# Raw inputs are read from <mainDir>/data/raw; results are written to
# <mainDir>/data/processed.
dir_input, dir_output = (
    os.path.join(mainDir, 'data', sub_dir) for sub_dir in ('raw', 'processed')
)

## Settings

In [None]:
# Year window applied to the temperature data; the filters further down treat
# both bounds as inclusive.
first_relevant_year, last_relevant_year = 1850, 2019

## Prepare Berkeley data

In [None]:
# Prefix of the raw-data sub-folder ("<data_source>_regional_average_temperatures").
data_source = "berkeley"

In [None]:
# Country codes to keep; they are matched against the `ISO_A3` column of the
# shapes file. An empty list disables the filter.
list_relevant_iso = (
    'CAN USA NIC BRA ARG '
    'GRL ISL DEU ESP '
    'EGY MRT MDG ZAF COD '
    'SAU RUS CHN JPN IND IDN '
    'AUS NZL'
).split()

### Read shapes

In [None]:
# Load the country shapes from the raw-data folder.
gdf = gpd.read_file(
    os.path.join(dir_input, "countries.geojson")
)
gdf.shape

In [None]:
# Expose the shapes' `ADMIN` column under the name `Country`, which is the
# column name used by the temperature data.
gdf = gdf.rename(columns={'ADMIN': 'Country'})
# An empty ISO list means "keep every country".
if list_relevant_iso:
    gdf = gdf[gdf['ISO_A3'].isin(list_relevant_iso)]
gdf.shape

In [None]:
# Distinct country names of the retained shapes, in order of first appearance.
list_relevant_countries = list(gdf['Country'].unique())

### Read climate data

In [None]:
# Read the per-country temperature table of the currently selected data source.
df = pd.read_csv(
    os.path.join(
        dir_input,
        f"{data_source}_regional_average_temperatures",
        "GlobalLandTemperaturesByCountry.csv",
    )
)
df.shape

In [None]:
# Keep only the columns used downstream. The explicit copy detaches the
# selection from the frame returned by `read_csv`, so the column assignments
# that follow do not trigger pandas' SettingWithCopyWarning.
df = df[['dt', 'AverageTemperature', 'Country']].copy()

In [None]:
# Parse the date strings. `assign` returns a new frame, so the column writes
# below never target a slice of another frame (no SettingWithCopyWarning).
df = df.assign(dt=pd.to_datetime(df['dt'], format="%Y-%m-%d"))
df['Year'] = df['dt'].dt.year
df['Month'] = df['dt'].dt.month
# Keep only the configured year window (both bounds inclusive). The explicit
# copy mirrors the DWD section and keeps later in-place edits of `df` from
# writing into a slice.
df = df.loc[
    (df['Year'] >= first_relevant_year) & (df['Year'] <= last_relevant_year)
].copy()
df.head(3)

In [None]:
# Maps country names as spelled in the temperature CSV (keys) to the spelling
# they are replaced with before matching against the shapes' `Country` names.
country_names_dict = {
    'Antigua And Barbuda': 'Antigua and Barbuda',
    'Bahamas': 'The Bahamas',
    'Bosnia And Herzegovina': 'Bosnia and Herzegovina',
    'Burma': 'Myanmar',
    'Congo': 'Republic of Congo',
    'Congo (Democratic Republic Of The)': 'Democratic Republic of the Congo',
    "Côte D'Ivoire": 'Ivory Coast',
    'Falkland Islands (Islas Malvinas)': 'Falkland Islands',
    'Federated States Of Micronesia': 'Federated States of Micronesia',
    'French Southern And Antarctic Lands': 'French Southern and Antarctic Lands',
    'Heard Island And Mcdonald Islands': 'Heard Island and McDonald Islands',
    'Hong Kong': 'Hong Kong S.A.R.',
    'Isle Of Man': 'Isle of Man',
    'Macau': 'Macao S.A.R',
    'Palestina': 'Palestine',
    'Saint Barthélemy': 'Saint Barthelemy',
    'Saint Kitts And Nevis': 'Saint Kitts and Nevis',
    'Saint Pierre And Miquelon': 'Saint Pierre and Miquelon',
    'Saint Vincent And The Grenadines': 'Saint Vincent and the Grenadines',
    'Sao Tome And Principe': 'Sao Tome and Principe',
    'Serbia': 'Republic of Serbia',
    'South Georgia And The South Sandwich Isla': 'South Georgia and South Sandwich Islands',
    'Tanzania': 'United Republic of Tanzania',
    'Timor Leste': 'East Timor',
    'Trinidad And Tobago': 'Trinidad and Tobago',
    'Turks And Caicas Islands': 'Turks and Caicos Islands',
    'United States': 'United States of America',
    'Virgin Islands': 'United States Virgin Islands',
    'Åland': 'Aland',
}

In [None]:
# Harmonise the country names so they can be matched against
# `list_relevant_countries`. A single dictionary-based `replace` performs all
# exact-match substitutions in one pass instead of scanning the column once
# per dictionary entry; names that are not keys of `country_names_dict` are
# left unchanged.
df['Country'] = df['Country'].replace(country_names_dict)

In [None]:
# Restrict the temperature rows to the selected countries; an empty selection
# means "keep everything".
if list_relevant_countries:
    df = df[df['Country'].isin(list_relevant_countries)]
df.shape

In [None]:
# Independent snapshot of the Berkeley data in the common column layout
# (Country, Year, Month, AverageTemperature).
df_berkeley = df.loc[:, ['Country', 'Year', 'Month', 'AverageTemperature']].copy()

## Prepare DWD data

In [None]:
# Switch the raw-data folder prefix to the DWD files.
data_source = "dwd"

In [None]:
# Lower bound (inclusive) of the year window used when filtering the DWD data
# below; same value as assigned in the Settings section.
first_relevant_year = 1850

In [None]:
# The DWD section only concerns Germany.
# NOTE(review): this list is not read again in the visible part of the notebook.
list_relevant_iso = ['DEU']

### Read climate data

In [None]:
# Read the twelve monthly DWD files (one file per calendar month) and stack
# them into a single frame. The frames are collected first and concatenated
# once: `DataFrame.append` was removed in pandas 2.0, and a single `concat`
# also avoids re-copying the accumulated frame on every iteration.
monthly_frames = []
for iter_month in range(1, 13):
    df_temp = pd.read_csv(
        os.path.join(
            dir_input,
            f"{data_source}_regional_average_temperatures",
            f"regional_averages_tm_{iter_month:02d}.txt",
        ),
        # The first line of each file is skipped so that the second line
        # provides the column names.
        skiprows=1, sep=";", skipinitialspace=True,
    )
    # Keep only the Germany-wide series and translate the German column names
    # to the layout used for the Berkeley data.
    df_temp = df_temp[['Jahr', 'Monat', 'Deutschland']].rename(
        columns={'Jahr': 'Year', 'Monat': 'Month', 'Deutschland': 'AverageTemperature'}
    )
    monthly_frames.append(df_temp)
df = pd.concat(monthly_frames, ignore_index=True)
df['Country'] = 'Germany'
df

In [None]:
# Normalise the key columns to integers before filtering on them.
df = df.astype({'Year': int, 'Month': int})
# Same inclusive year window as used for the Berkeley data; the copy keeps the
# filtered frame independent.
df = df[
    (df['Year'] >= first_relevant_year) & (df['Year'] <= last_relevant_year)
].copy()
df = df.astype({'AverageTemperature': float})
df

In [None]:
# DWD data in the common column layout, ordered by country, year and month
# (ascending, which is the `sort_values` default).
df_dwd = (
    df[['Country', 'Year', 'Month', 'AverageTemperature']]
    .sort_values(by=['Country', 'Year', 'Month'])
)

In [None]:
# Outer-join both sources on (Country, Year, Month) so rows present in only one
# of them are kept. The Berkeley value keeps the name `AverageTemperature`;
# the DWD value arrives as `AverageTemperature_temp`.
df_final = pd.merge(
    df_berkeley,
    df_dwd,
    how='outer',
    on=['Country', 'Year', 'Month'],
    suffixes=('', '_temp'),
)
df_final = df_final.sort_values(by=['Country', 'Year', 'Month'])
df_final

In [None]:
# Wherever a DWD value exists it takes precedence over the Berkeley value;
# rows without a DWD value keep their current `AverageTemperature`.
# NOTE(review): the helper column `AverageTemperature_temp` stays in the frame
# and is therefore written to the output as well - confirm this is intended.
df_final['AverageTemperature'] = df_final['AverageTemperature_temp'].fillna(
    df_final['AverageTemperature']
)

## Write preprocessed data

In [None]:
# Persist the combined table to the processed-data folder.
df_final.to_pickle(
    os.path.join(dir_output, "regional_average_temperatures.pkl")
)