### CRU: Attributes z-score standardization

In [1]:
import numpy as np
import os
import pandas as pd

from pprint import pprint

#### Set input/output folder

In [2]:
# Input folder: one sub-folder per period, each holding one CSV per attribute.
data_in         = '../data/CRU_2a_iso_standardization'
# Output folder for the standardized tables, one CSV per attribute.
data_attr_out   = '../data/CRU_2c_z_score_attr'
# Output folder for the standardized tables regrouped by time column.
data_period_out = '../data/CRU_2c_z_score_year'

periods = ['annually', 'monthly', 'quarterly']

# Create <output root>/<period> for both output roots. os.makedirs also
# creates the missing parent folders, and exist_ok=True makes re-running the
# cell harmless without a separate (racy) os.path.exists check.
for out_root in (data_attr_out, data_period_out):
    for period in periods:
        os.makedirs(f'{out_root}/{period}', exist_ok=True)

#### Read data

In [3]:
for period in periods:
    # ------------------------------------------------------------------
    # 1. Read every attribute table of this period.
    #    Keys of `all_attrs` are the input file names without extension.
    # ------------------------------------------------------------------
    period_dir = f'{data_in}/{period}'
    all_attrs = {}

    for attr_file in os.listdir(period_dir):
        attr_path = f'{period_dir}/{attr_file}'

        # Skip hidden entries and sub-directories (e.g. `.ipynb_checkpoints`,
        # `.DS_Store`): they are not attribute tables and would either crash
        # `read_csv` or register a bogus attribute.
        if attr_file.startswith('.') or not os.path.isfile(attr_path):
            continue

        # splitext strips only the final extension, so attribute names that
        # themselves contain a dot are kept intact.
        attr = os.path.splitext(attr_file)[0]

        table = pd.read_csv(attr_path, index_col='Country')
        # Treat +/-inf as missing so they do not distort the statistics below.
        all_attrs[attr] = table.replace([np.inf, -np.inf], np.nan)

    # ------------------------------------------------------------------
    # 2. z-score standardization: x' = (x - mean) / std.
    #    Mean and variance are computed over ALL non-NaN cells of the
    #    attribute table (every row and every column pooled together).
    # ------------------------------------------------------------------
    mean_var_df = pd.DataFrame(columns=['Mean', 'Variance'])

    for attr in sorted(all_attrs):
        values = all_attrs[attr].values
        values = values[~np.isnan(values)]  # flatten and drop missing cells
        mu = values.mean()
        variance = values.var()             # population variance (ddof=0)
        mean_var_df.loc[attr] = [mu, variance]

        all_attrs[attr] = (all_attrs[attr] - mu) / np.sqrt(variance)
        all_attrs[attr].to_csv(f'{data_attr_out}/{period}/{attr}.csv',
                               index_label='Country')

    # Keep the statistics so the transformation can be inverted later.
    mean_var_df.to_csv(f'../data/CRU_{period}_mean_var_values.csv',
                       index_label='Attribute')

    # ------------------------------------------------------------------
    # 3. Regroup by time column: one table per column label of the input
    #    tables, with one column per attribute.
    # ------------------------------------------------------------------
    df_dict = {}

    for attr in sorted(all_attrs):
        df = all_attrs[attr]
        for p in df.columns:
            if p not in df_dict:
                df_dict[p] = pd.DataFrame()
            # Column assignment aligns on the index of the first attribute
            # stored for this time column.
            df_dict[p][attr] = df[p]

    for p, period_df in df_dict.items():
        period_df.to_csv(f'{data_period_out}/{period}/{p}.csv')

#### Attributes z-score standardization & output to csv format

$x' = \cfrac{x - \mu}{\sigma}$
- $\mu$: mean (expectation)
- $\sigma$: standard deviation

#### List of all attributes

In [4]:
# Attribute names currently held in `all_attrs`, in alphabetical order.
attrs = sorted(all_attrs)
pprint(attrs)

['Aridity Index (AI)',
 'Cloud Cover (CLD) [percentage]',
 'Diurnal Temperature Range (DTR) [degrees Celsius]',
 'Ground Frost Frequency (FRS) [days]',
 'Maximum Temperature (TMX) [degrees Celsius]',
 'Mean Temperature (TMP) [degrees Celsius]',
 'Minimum Temperature (TMN) [degrees Celsius]',
 'Potential Evapotranspiration (PET) [mm_day]',
 'Potential Evapotranspiration (PET) [mm_month]',
 'Precipitation (PRE) [mm_month]',
 'Rain Days (WET) [days]',
 'Vapour Pressure (VAP) [hPa]']
