In [None]:
from pprint import pprint
from typing import Sequence

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, max_error, mutual_info_score
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder, MultiLabelBinarizer, PolynomialFeatures, StandardScaler
from sklearn.svm import SVR, LinearSVR
from sklearn.tree import DecisionTreeRegressor

In [None]:
def reshape(df: pd.DataFrame, scale: bool = False, start=1960, stop=2016) -> pd.DataFrame:
    """Pivot a wide country table into a year-indexed frame with one column per country.

    Args:
        df: Table with a "Country Name" column and one column per year whose
            name is the year written as a string (e.g. "1960").
        scale: When True, standardise every country column
            (subtract its mean, divide by its standard deviation).
        start: First year to include.
        stop: Last year to include (inclusive).

    Returns:
        DataFrame indexed by the integer years ``start..stop`` whose columns
        are the values of ``df["Country Name"]``.
    """
    years = range(start, stop + 1)
    year_columns = [str(year) for year in years]

    # Rows of the input are countries; transpose so that rows become years.
    by_year = pd.DataFrame(
        data=df[year_columns].to_numpy().T,
        columns=df["Country Name"],
        index=years,
    )
    return (by_year - by_year.mean()) / by_year.std() if scale else by_year

In [None]:
def filter_(df, country):
    """Keep only the rows of ``df`` whose "Country Name" is among ``country``.

    Args:
        df: DataFrame with a "Country Name" column.
        country: A collection of country names, or a single name given as a
            string (treated as a one-element collection).

    Returns:
        The matching rows of ``df``, in their original order.
    """
    # `Series.isin` rejects a bare string, so wrap a single name in a list.
    if isinstance(country, str):
        country = [country]
    return df[df["Country Name"].isin(country)]

In [None]:
# Load the country population table and preview its first rows.
pop = pd.read_csv("data/country_population.csv")
pop.head()

In [None]:
# Load the fertility-rate table and preview its first rows.
fert_rate = pd.read_csv("data/fertility_rate.csv")
fert_rate.head()

In [None]:
# Load the life-expectancy table and preview its first rows.
life_expect = pd.read_csv("data/life_expectancy.csv")
life_expect.head()

In [None]:
# Draw 10 country names from the population table; the fixed random_state
# makes the sample reproducible. Print them with their list positions.
countries = list(pop["Country Name"].sample(n=10, random_state=66))
pprint(list(enumerate(countries)))

In [None]:
# Country to model: the sampled name at position 5 of `countries`.
label = countries[5]
# Dataset to analyse; the loaded alternatives are: pop, life_expect, fert_rate
data = pop

In [None]:
# Plot the yearly series of the sampled countries, one line per country.
# NOTE(review): the y-axis label "___" looks like a placeholder — set it to
# whatever quantity the chosen `data` holds.
reshape(filter_(data, countries)).plot(xlabel="Year", ylabel="___")

In [None]:
# First five years of the selected country's series.
reshape(data)[label].iloc[:5]

In [None]:
def to_matrix(X: pd.Series, lags=5):
    """Build a lag table for a series.

    Args:
        X: Series to lag.
        lags: Number of lagged copies to create.

    Returns:
        DataFrame with columns ``-1 .. -lags`` (the series shifted by that
        many steps) followed by the series itself, restricted to the rows
        where every lag is available.
    """
    shifted = {}
    for step in range(1, lags + 1):
        shifted[-step] = X.shift(step)

    # Rows that lack one or more lagged values are discarded.
    lag_frame = pd.DataFrame(data=shifted).dropna()
    target = X.loc[lag_frame.index]
    return pd.concat((lag_frame, target), axis=1)

In [None]:
# Correlation of the selected series with each of its first 10 lags
# (the final entry is the series against itself).
to_matrix(reshape(data)[label], lags=10).corr()[label]

In [None]:
class ForecastModel:
    """Autoregressive forecaster over a year-indexed series.

    The wrapped estimator is trained to map the ``lag`` previous values of the
    series onto the current value. Years that are not present in the series
    are forecast recursively: every missing lag is itself predicted, and that
    prediction is stored in ``self.raw`` so later look-ups reuse it.

    Attributes set by ``fit``:
        X: Lag matrix (columns ``-1 .. -lag``), indexed by year.
        y: Target values aligned with ``X``.
        raw: Private copy of the fitted series, extended with forecasts.
        min_year / max_year: First and last index label of ``X``.
    """

    def __init__(self,
                 estimator=None,
                 lag: int = 5):
        """
        Args:
            estimator: Object exposing ``fit(X, y)`` and ``predict(X)``.
                Defaults to a fresh ``LinearRegression``.
            lag: Number of previous years used as features.
        """
        # A default built in the signature is evaluated once and would be
        # shared (and refitted) by every instance; create it per instance.
        self._estimator = LinearRegression() if estimator is None else estimator
        self.lag = lag

    def fit(self, X: Sequence, min_train=None, max_train=None):
        """Fit the estimator on the lagged view of ``X``.

        Args:
            X: pandas Series indexed by year.
            min_train: First year used for training (default: earliest
                year that has all ``lag`` predecessors).
            max_train: Last year used for training (default: latest year).

        Returns:
            self

        Raises:
            ValueError: If ``X`` is too short to build a single lagged row.
        """
        # Work on a copy: get_year() appends forecasts to self.raw and must
        # not modify the caller's series.
        self.X, self.y, self.raw = self.to_matrix(X.copy())
        if self.X.empty:
            raise ValueError(
                f"series is too short to build lag features with lag={self.lag}"
            )
        self.max_year = self.X.index.max()
        self.min_year = self.X.index.min()
        if min_train is None:
            min_train = self.min_year
        if max_train is None:
            max_train = self.max_year
        self._estimator.fit(self.X.loc[min_train:max_train], self.y.loc[min_train:max_train])
        return self

    def predict(self, years: Sequence) -> Sequence:
        """Predict the series value for each year in ``years``.

        Every year must be at least ``self.min_year``, so that all of its
        lags are either known or can be forecast.
        """
        rows = []
        for year in years:
            assert year >= self.min_year, (
                f"cannot predict {year}: earlier than first usable year {self.min_year}"
            )
            rows.append(self.get_lags(year))

        return self._estimator.predict(rows)

    def get_lags(self, year: int) -> Sequence:
        """Return the ``lag`` values preceding ``year``, most recent first."""
        return [self.get_year(year - i) for i in range(1, self.lag + 1)]

    def get_year(self, year: int) -> float:
        """Return the value for ``year``, forecasting and caching it if absent."""
        try:
            return self.raw.loc[year]
        except KeyError:
            # Year not in the series yet: forecast it and remember the result
            # so recursive forecasts do not recompute it.
            forecast = self.predict([year])[0]
            self.raw.loc[year] = forecast
            return self.raw.loc[year]

    def to_matrix(self, X: pd.Series):
        """Split ``X`` into a lag matrix, aligned targets and the series itself.

        Returns:
            ``(lag_frame, y, X)`` where ``lag_frame`` has columns
            ``-1 .. -lag`` and only the rows where every lag is available.
        """
        lag_frame = pd.DataFrame(
            data={-i: X.shift(i) for i in range(1, self.lag + 1)}
        ).dropna()
        y = X.loc[lag_frame.index]

        return lag_frame, y, X

In [None]:
# Fit a 5-lag forecaster (default estimator) on the selected country's series.
model = ForecastModel(lag=5).fit(reshape(data)[label])

# Predicting on the **Training** Set for a Series of Years

In [None]:
# Sanity check on a single year that is part of the fitted series.
train_pred = model.predict([2016])
real = model.raw.loc[2016]
# RMSE computed as sqrt(MSE): the `squared=False` shortcut was deprecated in
# scikit-learn 1.4 and removed in 1.6, while this form works on every version.
err = np.sqrt(mean_squared_error([real], train_pred))
print(f"""Predicted value is: {train_pred[0]:.3f};
True value is: {real:.3f};

Root Mean Squared Error is: {err:.3f}
Error in relation to mean is: {err/model.raw.mean():.3f}""")

In [None]:
# Inclusive range of years used for the in-sample comparison below.
min_year, max_year = 2000, 2016

In [None]:
# Predict every year from min_year to max_year (inclusive).
preds = model.predict( list(range(min_year, max_year+1)))

In [None]:
def plot_sub(ax, x, y, label="", xlabel="", ylabel="", title=""):
    """Draw ``y`` against ``x`` as a line on ``ax`` and set its labels and title.

    A legend is added only when ``label`` is not the empty string.
    """
    ax.plot(x, y, label=label)
    ax.set(xlabel=xlabel, ylabel=ylabel, title=title)
    if label == "":
        return
    ax.legend()

def scat_sub(ax, x, y, label="", xlabel="", ylabel="", title=""):
    """Draw ``y`` against ``x`` as a scatter on ``ax`` and set its labels and title.

    A legend is added only when ``label`` is not the empty string.
    """
    ax.scatter(x, y, label=label)
    ax.set(xlabel=xlabel, ylabel=ylabel, title=title)
    if label == "":
        return
    ax.legend()
    
# Left panel: true vs. predicted values over min_year..max_year.
f, ax = plt.subplots(1,2, figsize=(16,6))
plot_sub(ax[0], range(min_year,max_year+1), reshape(data)[label].loc[min_year:max_year], label="True Values")
plot_sub(ax[0], range(min_year,max_year+1), preds, label="Predicted Values", xlabel="Year")

# Right panel: predicted against true values; the red line is y = x,
# i.e. where a perfect prediction would fall.
scat_sub(ax[1], preds, reshape(data)[label].loc[min_year:max_year], xlabel="Predicted", ylabel="True")
xy=np.linspace(preds.min(), preds.max())
ax[1].plot(xy,xy, "r")

# Predicting on the **Test** Set for a Series of Years

In [None]:
# Held-out values for the selected country, read from the test CSV
# (its first column is used as the index).
# NOTE(review): the original note described this as real data for Portugal,
# 2017 to 2021 — `label` is simply countries[5], so confirm both the country
# and the year range against the file.
y_test = pd.read_csv("data/country_population_test.csv", index_col=0)[label]
y_test

In [None]:
# Forecast every year listed in the test index.
preds = model.predict(y_test.index)
preds

In [None]:
# RMSE computed as sqrt(MSE): the `squared=False` shortcut was deprecated in
# scikit-learn 1.4 and removed in 1.6, while this form works on every version.
print(f"rmse = {np.sqrt(mean_squared_error(y_test, preds))}")

In [None]:
# Pearson correlation between true and predicted values
# (off-diagonal entry of the 2x2 correlation matrix).
print(f"Pearson Corr. Coef. = {np.corrcoef(y_test, preds)[0,1]}")

In [None]:
# Left panel: true vs. predicted values for each year in the test index.
f, ax = plt.subplots(1,2, figsize=(16,6))
plot_sub(ax[0], y_test.index, y_test, label="True Values")
plot_sub(ax[0], y_test.index, preds, label="Predicted Values", xlabel="Year")

# Right panel: predicted against true values; the red line is y = x,
# i.e. where a perfect prediction would fall.
scat_sub(ax[1], preds, y_test, xlabel="Predicted", ylabel="True")
xy=np.linspace(min(preds), max(preds))
ax[1].plot(xy,xy, "r")

# Predicting 2017 on the **Test** Set for several countries

In [None]:
# True 2017 values, restricted to the sampled countries and put in the same
# order as `countries`, so they line up element-wise with the per-country
# predictions built from that list.
y_test = pd.read_csv("data/country_population_test.csv", index_col=0).loc[2017, countries]
y_test

In [None]:
# Fit one default forecaster per sampled country and predict its 2017 value.
mvts = reshape(data)
preds = [
    ForecastModel().fit(mvts[country]).predict([2017])[0]
    for country in countries
]
preds

In [None]:
# RMSE computed as sqrt(MSE): the `squared=False` shortcut was deprecated in
# scikit-learn 1.4 and removed in 1.6, while this form works on every version.
print(f"rmse = {np.sqrt(mean_squared_error(y_test, preds))}")

In [None]:
# Pearson correlation between true and predicted values
# (off-diagonal entry of the 2x2 correlation matrix).
print(f"Pearson Corr. Coef. = {np.corrcoef(y_test, preds)[0,1]}")

In [None]:
# Countries are placed at integer positions on the x-axis and then labelled by name.
x_ = range(len(countries))
f, ax = plt.subplots(1,2, figsize=(16,6))
# Left panel: true vs. predicted value per country.
plot_sub(ax[0], x_, y_test, label="True Values")
plot_sub(ax[0], x_, preds, label="Predicted Values", xlabel="Country")
ax[0].set_xticks(x_, countries, rotation=45)

# Right panel: predicted against true values; the red line is y = x,
# i.e. where a perfect prediction would fall.
scat_sub(ax[1], preds, y_test, xlabel="Predicted", ylabel="True")
xy=np.linspace(min(preds), max(preds))
ax[1].plot(xy,xy, "r")