In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
from collections import defaultdict
from itertools import chain

In [2]:
train = pd.read_csv('data/TrainingSet.csv',index_col=0)
submission = pd.read_csv('data/SubmissionRows.csv',index_col=0)

# Remove [YR****] and input '_' for last 3 cols
train.columns = list(map(lambda x: re.findall(r'\d+',x)[0],train.columns[:36])) + list(map(lambda x: '_'.join(x.split()),train.columns[36:]))

In [9]:
# ===== Information on prediction data ====
sub_index = submission.index

pred_indicators = train.loc[sub_index,:]


# ===== Mapping (Index -> Series Name) ====
mapping = {k:v for k,v in zip(train.index,train.Series_Name)}

In [25]:
# ==== GET TOP CORRELATION OF TARGET INDICATORS WITH LAGGED OBSERVATIONS ===
def get_corr_lag_indicators(frame,country,submission_index):
    assert isinstance(country,str), 'Country_must be a string'
    if country not in frame.Country_Name.unique():
        raise ValueError('No such country in Country_Name index')
    frame_data = frame[frame['Country_Name']==country]
    assert isinstance(submission_index,list), 'submission_index must be a list of integers'
    frame_values = frame_data.loc[:,:'2007'].values
    n_indicators = frame_values.shape[0]
    submission_index_country = list(set(submission_index).intersection(frame_data.index))
    # -1 as default
    lag_corr_mat = np.ones([n_indicators, n_indicators], dtype=np.float64)*-1
    for i in range(n_indicators):
        for j in range(n_indicators):
            if i<j:
                original = frame_values[i,1:]
                shifted = frame_values[j,:-1] 
                non_nan_mask = (~np.isnan(original)&~np.isnan(shifted))
                if non_nan_mask.sum()>=2:
                    lag_corr_mat[i,j] = np.correlate(original[non_nan_mask],shifted[non_nan_mask])
    pred_best_corr_indx = defaultdict(list)
    pred_best_corr_names = defaultdict(list)
    for pred in submission_index_country:
        i = np.where(frame_data.index.values == pred)[0][0]
        pred_name = mapping.get(pred)
        j_max = np.argmax(lag_corr_mat[i,:])
        corr_name =mapping.get(j_max)
        pred_best_corr_indx[pred].append(j_max)   
        pred_best_corr_names[pred_name].append(corr_name)
    return pred_best_corr_indx,pred_best_corr_names

In [28]:
# Corr Names:
corr_list = []
for country in pred_indicators.Country_Name.unique():
    pred_corr_indx,pred_corr_names = get_corr_lag_indicators(train,country,list(sub_index))
    corr_list.append((country,pred_corr_names))

In [33]:
# === SHORT SNIPPET OF LAGGED CORRELATIONS BY COUNTRY === 
corr_list[:5]

[('Afghanistan',
  defaultdict(list,
              {'Reduce child mortality': ['Health expenditure, total (% of GDP)'],
               'Develop a global partnership for development: Internet Use': ['Health expenditure, public (% of government expenditure)'],
               'Combat HIV/AIDS': ['Net domestic credit (current LCU)'],
               'Ensure environmental sustainability': ['Health expenditure, public (% of government expenditure)']})),
 ('Albania',
  defaultdict(list,
              {'Ensure environmental sustainability': [None],
               'Reduce child mortality': ['Progression to secondary school, female (%)'],
               'Develop a global partnership for development: Internet Use': [None]})),
 ('Algeria',
  defaultdict(list,
              {'Ensure environmental sustainability': ['Gross intake ratio in first grade of primary education, total (% of relevant age group)'],
               'Achieve universal primary education': [None],
               'Reduce child morta

In [53]:
dataframes_to_concat = []

for country_info in corr_list:
    country=country_info[0]
    unique_ind = list(set(list(country_info[1].keys()) + list(chain.from_iterable(list(country_info[1].values())))))
    train_country = train[train['Country_Name']==country]
    train_country = train_country[train_country['Series_Name'].isin(unique_ind)]
    dataframes_to_concat.append(train_country)

In [59]:
data_final = pd.concat(dataframes_to_concat)

In [71]:
# Assume 0 implies data wasnt recorded as it doesnt make much sense: Check row 285744 (Zimbabwae) the zeros are surrounded by extremely large values
data_final = data_final.replace({0:np.nan})

In [77]:
# Limit interpolation in both directions up to 3 years:
data_final_numeric = data_final.loc[:,:'2007'].interpolate(limit=3,limit_direction='both',axis=1)
data_final_numeric.head()

Unnamed: 0,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,...,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007
511,,,,,,,,,,,...,,6.705402,6.705402,6.705402,6.705402,1.378486,1.85403,1.123186,1.472365,1.375928
513,,,,,,,,,,,...,,8.941534,8.941534,8.941534,8.941534,9.961406,9.786432,9.211441,9.139361,10.021151
559,,,,,,,,,,,...,0.152,0.187,0.221,0.256,0.291,0.325,0.36,0.395,0.43,0.465
618,,,,,,,,,,,...,4.7e-05,4.7e-05,4.7e-05,4.7e-05,4.6e-05,0.000879,0.001058,0.012241,0.021071,0.019
753,0.296,0.2909,0.2852,0.2798,0.2742,0.2683,0.2624,0.2565,0.2503,0.2439,...,0.1391,0.1366,0.1339,0.131,0.1277,0.1244,0.121,0.1177,0.1145,0.1115


In [78]:
# Export:
data_final_numeric.to_csv('data/Training_Cleaned_Corr.csv')

In [7]:
train.loc[285744]

1972                                                          0
1973                                                          0
1974                                                          0
1975                                                          0
1976                                                          0
1977                                                          0
1978                                                 1.0597e+07
1979                                                  4.992e+06
1980                                                 -1.557e+07
1981                                                  3.773e+07
1982                                                1.42861e+08
1983                                                1.54708e+08
1984                                                 5.1633e+07
1985                                                 3.6714e+07
1986                                                  6.766e+06
1987                                    