In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint

from sklearn.preprocessing import LabelBinarizer, OneHotEncoder, MultiLabelBinarizer
from sklearn.metrics import mean_squared_error, max_error
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.preprocessing import StandardScaler

from typing import Sequence

In [None]:
"""
Parameters:
    df: 'dataframe' structured like in csv files
    scale: 'bool', request to standardize data, default=False
    start: 'int', representing the year where to start the timeseries
    stop: 'int', representing the year where to stop the timeseries

Output:
    DataFrame with the following structure:
        -> data   = values
        -> columns= country_name
        -> index  = year

Description:
    Basically creates a multivariate time-series
"""
def reshape(df: pd.DataFrame, scale:bool =False, start=1960, stop=2016) -> pd.DataFrame:
    """
    Turn a table with one row per country into a year-indexed table with one column per country.

    Parameters:
        df: frame with a "Country Name" column and one column per year, named by the year as a string
        scale: when True, every country column is standardized (mean subtracted, divided by its std)
        start: first year to keep (inclusive)
        stop: last year to keep (inclusive)

    Returns:
        DataFrame whose index is the years start..stop, whose columns are the country names
        and whose cells are the values read from the matching year column.
    """
    years = range(start, stop+1)
    year_labels = list(map(str, years))
    # source rows are countries, so transposing yields one row per year
    values_by_year = df[year_labels].to_numpy().T

    wide = pd.DataFrame(data=values_by_year, columns=df["Country Name"], index=years)
    if not scale:
        return wide
    return (wide - wide.mean()) / wide.std()

In [None]:
def filter_(df, country):
    """Return the rows of df whose "Country Name" value is contained in `country`."""
    wanted = df["Country Name"].isin(country)
    return df[wanted]

In [None]:
# Load the raw population table; reshape() later reads its "Country Name" column
# and its per-year columns.
pop = pd.read_csv("data/country_population.csv")
pop.head()

In [None]:
# Preview the year-by-country layout produced by reshape() with its default year range.
reshape(pop).head()

In [None]:
"""
Parameters:
    df: 'dataframe' structured like "reshape" function output, multivariate timeseries
    label: 'str', name for the column of the data

Description:
    Agglomerates all the different timeseries for each country in a single column.
    Each row has the country it refers to, one-hot encoded.
    The data for each country is still ordered by year.
"""
def to_mvts(df: pd.DataFrame, label:str):
    """
    Stack the per-country series of a year-indexed table into one long table.

    Parameters:
        df: frame shaped like the output of `reshape` (index = year, one column per country)
        label: name given to the column that holds the stacked values

    Returns:
        DataFrame with one row per (country, year). It holds the value column `label`
        plus one indicator column per country, set to 1 on the rows of that country.
        Rows keep the year as index and stay ordered by year inside each country;
        columns come back sorted (pd.concat(..., sort=True)).
    """
    encoder = LabelBinarizer().fit(df.columns)

    per_country = []
    for col in df.columns:
        # rename() returns a new Series, so the column held by the caller's frame keeps its name
        tseries = df[col].rename(label)
        # one hot encodes country, repeated for every year of the series
        one_hot_row = encoder.transform([col])[0]
        encoded_country = pd.DataFrame(data=[one_hot_row] * len(tseries),
                                       columns=encoder.classes_,
                                       index=tseries.index)
        # merges country timeseries with respective one hot encoding
        per_country.append(pd.concat((tseries, encoded_country), axis=1))

    # A single concat replaces growing the frame inside the loop (quadratic copying) and
    # avoids seeding the result with an empty object-dtype frame.
    return pd.concat(per_country, sort=True)

In [None]:
# The line that rebuilds the long table with to_mvts() is left disabled; instead a saved
# copy is loaded from disk, using the first CSV column as the index.
#multivariate_timeseries = to_mvts(reshape(pop), "Population")
multivariate_timeseries = pd.read_csv("data/mvts.csv", index_col=0)

multivariate_timeseries.head()

In [None]:
# Persist the long table. NOTE(review): the previous cell loads the frame from this same
# path, so with the rebuild line disabled this only rewrites the file it was read from.
multivariate_timeseries.to_csv("data/mvts.csv")

In [None]:
"""
Class for the prediction of future values of a time-series;
Initialization Parameters:
    -> estimator: ML model to use in the forecast regression. !Must be a model from 'Sklearn' package!
    -> lags: 'int', how many previous years to take in consideration to make the prediction.

Non-initial Instances:
    -> raw: Default data that entered the model (output of 'reshape()' format)
    -> X: Independent variables used to train the model. Matrix that for each row/year has the *lags*
            previous years, as well as the one hot encoded country it belongs to.
    -> y: Dependent variable used to train the model. Time-series with the expected output
            for each element in X.
    -> min_year: The minimum year for which the model can make a prediction. Corresponds to 
                {year of the first entrance} + {lags}
    -> encoder: Model to one hot encode the country labels. Corresponds to sklearn 'LabelBinarizer' model

"""
class ForecastModel:
    """
    Forecasts future values of per-country time-series with a lag-based regression.

    Initialization Parameters:
        -> estimator: regressor exposing sklearn-style 'fit(X, y)' and 'predict(X)'.
                      When omitted, a new 'LinearRegression' is created for this instance.
        -> lags: 'int', how many previous years are used as features for each prediction.

    Attributes set by 'fit':
        -> raw: private copy of the table given to 'fit' ('reshape()' format). Values predicted
                for missing cells are memoized here.
        -> X: training features. One row per (country, year) holding the {lags} previous values
              (columns "-1" .. "-{lags}") and the one hot encoded country.
        -> y: training target, the value of each row of X.
        -> min_year: first year for which a prediction can be made, i.e.
                     {first year of the data} + {lags}.
        -> encoder: fitted sklearn 'LabelBinarizer'; its 'classes_' give the country columns.
    """

    def __init__(self,
                 estimator=None,
                 lags=5):
        # A default estimator built in the signature would be one object shared, and re-fitted,
        # by every ForecastModel created without arguments; build a fresh one per instance.
        self.estimator = LinearRegression() if estimator is None else estimator
        self.lags = lags

    def fit(self, X):
        """
        Parameters:
            -> X: MVTS to train the model. Should be in format of "reshape" function output

        Description:
            Builds the lagged training matrix 'X' and target 'y' from the input, records the
            first predictable year ('min_year') and fits the estimator on them.

        Returns:
            self, so calls can be chained.
        """
        # Work on a copy: get_year() writes predictions into 'raw', which must not leak into
        # the frame owned by the caller.
        self.raw = X.copy()
        mvts = self.__to_lagged_mvts(self.raw)
        self.X = mvts.drop(columns="y")
        self.y = mvts["y"]

        self.min_year = self.X.index.min()
        self.estimator = self.estimator.fit(self.X, self.y)
        return self

    def predict(self, years: Sequence, country: str):
        """
        Parameters:
            -> years: List of years for which to make the prediction. Must be an iterable
            -> country: 'str', Country for which to make the prediction

        Description:
            For each year, gathers the {lags} previous values of the country (predicting and
            memoizing the ones that are missing) and feeds them to the estimator.

        Raises:
            AssertionError when a requested year is earlier than 'min_year'.
        """
        encoded_country = list(self.__one_hot(country))
        lags = range(1, self.lags+1)

        rows = []
        for year in years:
            assert year >= self.min_year, (
                f"cannot predict {year}: earliest supported year is {self.min_year}"
            )
            rows.append(
                [self.get_year(year - i, country) for i in lags] + encoded_country
            )

        # Label the values in the order they were built, then reorder BY NAME to the training
        # layout. The training columns are sorted as strings ("-1", "-10", "-2", ...), so
        # assigning them positionally would mislabel the lags as soon as lags >= 10.
        built_order = [f"-{i}" for i in lags] + list(self.encoder.classes_)
        X = pd.DataFrame(data=rows, columns=built_order)[list(self.X.columns)]

        return self.estimator.predict(X)

    def get_year(self, year: int, country : str) -> float:
        """
        Parameters:
            -> year: 'int', year from which to get the data
            -> country: 'str', country from which to get the data

        Description:
            Returns the value stored in {raw} at ['year', 'country'].
            When the cell is absent or holds NaN, the value is predicted (recursively, through
            'predict') and memoized in {raw} so the recursion is not repeated.
        """
        try:
            data = self.raw.loc[year, country]
        except KeyError:
            # year (or country) not in the table yet
            data = np.nan
        # NaN never compares equal to itself, so missing cells must be detected with isna().
        # Cells turn NaN e.g. when memoizing one country's prediction adds a new year row.
        if pd.isna(data):
            data = self.predict([year], country)[0]
            self.raw.loc[year, country] = data
        return data

    def get_country(self, country:str):
        """
        Parameters:
            -> country: 'str' country label from which to get the data

        Description:
            Returns the part of the training target 'y' that belongs to 'country'.
        """
        return self.y[self.X[country]==1]

    def __one_hot(self, country):
        """
        One hot vector of 'country' over 'encoder.classes_' (all zeros for an unseen label).
        Computed directly rather than with 'encoder.transform', which yields a single column
        when there are exactly two classes and would not match the country columns.
        """
        return (np.asarray(self.encoder.classes_) == country).astype(int)

    def __to_lagged_mvts(self, df: pd.DataFrame, sort=False):
        """
        Parameters:
            -> df: MVTS to train the model. Should be in format of "reshape" function output
            -> sort: 'bool', tag to sort data based on index, default = 'False'

        Description:
            Builds a table with, for every country and year: the value in column "y", the
            {lags} previous values in columns "-1" .. "-{lags}" and the one hot encoded country.
            Rows with any missing value (including those lacking {lags} previous years) are
            dropped. Also fits 'self.encoder' on the country labels.

        Raises:
            ValueError when no country has a row with all its lags available.
        """
        self.encoder = LabelBinarizer().fit(df.columns)

        frames = []
        for col in df.columns:
            # rename() returns a new Series; the input frame's column keeps its own name
            tseries = df[col].rename("y")
            encoded_country = pd.DataFrame(data=np.tile(self.__one_hot(col), (len(tseries), 1)),
                                           columns=self.encoder.classes_,
                                           index=tseries.index)

            lagged = pd.concat((tseries,
                                self.__get_lags(tseries),
                                encoded_country),
                               axis=1).dropna()
            if not lagged.empty:
                frames.append(lagged)

        if not frames:
            raise ValueError("no country has enough consecutive observations for the requested lags")

        # One concat at the end instead of growing the frame inside the loop; sort=True keeps
        # the columns in sorted order.
        new_df = pd.concat(frames, sort=True)
        if sort:
            return new_df.sort_index()
        return new_df

    def __get_lags(self, X: pd.Series):
        """
        Parameters:
            -> X: Time-Series
        Description:
            Returns a frame whose column "-i" holds the series shifted by i positions,
            for i = 1 .. {lags}.
        """
        return pd.DataFrame(
            data={ f"-{i}": X.shift(i) for i in range(1,self.lags+1) }
        )

In [None]:
# Fit the forecaster with its default arguments on the reshaped population table.
model = ForecastModel().fit(reshape(pop))

In [None]:
# Draw 10 country names with a fixed random_state and print them next to their position
# in the list.
countries = list(pop["Country Name"].sample(n=10, random_state=66))
pprint(list(enumerate(countries)))

In [None]:
# Country used by the single-country cells below: the entry at position 5 of the sample.
label = countries[5]

# Predicting on the **Training** Set for a Series of Years

In [None]:
# Compare the prediction for 2016 with the value stored in the model's table for the
# selected country.
train_pred = model.predict([2016], label)
real = model.raw.loc[2016,label]
# RMSE taken as an explicit square root of the MSE: the `squared=False` keyword of
# mean_squared_error is deprecated and rejected by recent scikit-learn releases.
err = np.sqrt(mean_squared_error([real], train_pred))
print(f"""Predicted value is: {train_pred[0]:.3f};
True value is: {real:.3f};

Root Mean Squared Error is: {err:.3f}
Error in relation to mean is: {err/model.get_country(label).mean():.3f}""")

In [None]:
# Inclusive range of years used for the training-set predictions and plots.
min_year, max_year = 2000, 2016

In [None]:
# Predictions for every year from min_year to max_year (inclusive) for the selected country.
preds = model.predict(list(range(min_year, max_year+1)), label)

In [None]:
def plot_sub(ax, x, y, label="", xlabel="", ylabel="", title=""):
    """
    Draw y against x as a line on `ax`, then set the x label, y label and title.
    The legend is shown whenever `label` differs from the empty string.
    """
    ax.plot(x, y, label=label)
    for apply_text, text in ((ax.set_xlabel, xlabel),
                             (ax.set_ylabel, ylabel),
                             (ax.set_title, title)):
        apply_text(text)
    if label == "":
        return
    ax.legend()

def scat_sub(ax, x, y, label="", xlabel="", ylabel="", title=""):
    """
    Draw y against x as a scatter on `ax`, then set the x label, y label and title.
    The legend is shown whenever `label` differs from the empty string.
    """
    ax.scatter(x, y, label=label)
    for apply_text, text in ((ax.set_xlabel, xlabel),
                             (ax.set_ylabel, ylabel),
                             (ax.set_title, title)):
        apply_text(text)
    if label == "":
        return
    ax.legend()

# Left panel: stored and predicted series over min_year..max_year for the selected country.
f, ax = plt.subplots(1,2, figsize=(16,6))
plot_sub(ax[0], range(min_year,max_year+1), model.get_country(label).loc[min_year:max_year], label="True Values")
plot_sub(ax[0], range(min_year,max_year+1), preds, label="Predicted Values", xlabel="Year")

# Right panel: predicted-vs-true scatter with a red identity line over the prediction range.
scat_sub(ax[1], preds, model.get_country(label).loc[min_year:max_year], xlabel="Predicted", ylabel="True")
xy=np.linspace(preds.min(), preds.max())
ax[1].plot(xy,xy, "r")

# Predicting on the **Test** Set for a Series of Years

In [None]:
# Held-out series of the selected country. The first CSV column becomes the index
# (presumably the year, since it is passed to model.predict as `years` below; confirm against the file).
y_test=pd.read_csv("data/country_population_test.csv", index_col=0)[label]
y_test

In [None]:
# Forecast every index value of y_test for the selected country. Lag values that are not
# stored in the model's table are themselves predicted by ForecastModel.get_year.
preds = model.predict(y_test.index, label)
preds

In [None]:
# RMSE as an explicit square root of the MSE: the `squared=False` keyword of
# mean_squared_error is deprecated and rejected by recent scikit-learn releases.
print(f"rmse = {np.sqrt(mean_squared_error( y_test, preds))}")

In [None]:
# Pearson correlation between held-out values and predictions: the off-diagonal entry
# of the matrix returned by np.corrcoef.
print(f"Pearson Corr. Coef. = {np.corrcoef(y_test, preds)[0,1]}")

In [None]:
# Left panel: held-out and predicted series plotted against the index of y_test.
f, ax = plt.subplots(1,2, figsize=(16,6))
plot_sub(ax[0], y_test.index, y_test, label="True Values")
plot_sub(ax[0], y_test.index, preds, label="Predicted Values", xlabel="Year")

# Right panel: predicted-vs-true scatter with a red identity line over the prediction range.
scat_sub(ax[1], preds, y_test, xlabel="Predicted", ylabel="True")
xy=np.linspace(min(preds), max(preds))
ax[1].plot(xy,xy, "r")

# Predicting 2017 on the **Test** Set for several countries

In [None]:
# Held-out 2017 values, restricted to the sampled countries and taken in the order of
# `countries`. The predictions below are built by iterating over that same list, so both
# sequences line up element by element whatever the column order of the CSV is.
y_test=pd.read_csv("data/country_population_test.csv", index_col=0).loc[2017, countries]
y_test

In [None]:
# Forecast 2017 for each sampled country, in the order of `countries`.
preds = [model.predict([2017], country)[0] for country in countries]
preds

In [None]:
# RMSE as an explicit square root of the MSE: the `squared=False` keyword of
# mean_squared_error is deprecated and rejected by recent scikit-learn releases.
print(f"rmse = {np.sqrt(mean_squared_error( y_test, preds))}")

In [None]:
# Pearson correlation between held-out values and predictions: the off-diagonal entry
# of the matrix returned by np.corrcoef.
print(f"Pearson Corr. Coef. = {np.corrcoef(y_test, preds)[0,1]}")

In [None]:
# Countries are placed on the x axis by their position in `countries`.
x_ = range(len(countries))
f, ax = plt.subplots(1,2, figsize=(16,6))
# Left panel: held-out and predicted values per country, ticks labelled with the names.
plot_sub(ax[0], x_, y_test, label="True Values")
plot_sub(ax[0], x_, preds, label="Predicted Values", xlabel="Country")
ax[0].set_xticks(x_, countries, rotation=45)

# Right panel: predicted-vs-true scatter with a red identity line over the prediction range.
scat_sub(ax[1], preds, y_test, xlabel="Predicted", ylabel="True")
xy=np.linspace(min(preds), max(preds))
ax[1].plot(xy,xy, "r")