# Code Appendix: Stage 2-3

## Part 0. Loading and preparing data

In [1]:
import pandas as pd
import glob
import numpy as np

### Load dictionary of predictor labels and descriptions

In [2]:
path = ("C:/Users/Ong Jia Yi/Desktop/STUDY/4b. Fall 2020 - Winter 2021/Winter "
      "- ECO421H (Macroeconomic Finance)/Assignments/Project/Data/"
      "dictionary.csv")

pred_dict = pd.read_csv(path)

In [3]:
pred_dict.tail()

Unnamed: 0,Key,Value
58,H5,"Balance of Payments, Supplementary Items, Curr..."
59,M1,Mean of this month's stock price
60,M2,Variance of this month's stock price
61,M3,Skewness of this month's stock price
62,M4,Kurtosis of this month's stock price


### Load and prepare datasets of predictors

In [4]:
path = ("C:/Users/Ong Jia Yi/Desktop/STUDY/4b. Fall 2020 - Winter 2021/Winter "
      "- ECO421H (Macroeconomic Finance)/Assignments/Project/Data/"
      "Classification Datasets")

filenames = glob.glob(path + "/*.csv")

data_predictors_load = []
for filename in filenames:
    data_predictors_load.append(pd.read_csv(filename))

In [5]:
path = ("C:/Users/Ong Jia Yi/Desktop/STUDY/4b. Fall 2020 - Winter 2021/Winter "
      "- ECO421H (Macroeconomic Finance)/Assignments/Project/Data/"
      "Stock Datasets")

filenames = glob.glob(path + "/*ex1*.csv")

data_predictors_load2 = []
for filename in filenames:
    data_predictors_load2.append(pd.read_csv(filename))

In [6]:
data_predictors = []
for i in range(len(data_predictors_load)):
    a = data_predictors_load[i]
    
    # full variables: ["mon_y", "mean", "var", "skew", "kurt", "mon"]
    b = data_predictors_load2[i].loc[:, ["mon"]]
    # full variables: ["M1", "M2", "M3", "M4", "mon"]
    b.set_axis(["mon"], axis=1, inplace=True)
    
    # choose whether to bind additional data
    data_predictors.append(pd.concat([a, b], axis=1))

In [7]:
# displaying indices
for i in range(len(data_predictors)):
    d = data_predictors[i]
    length = d.shape
    label = np.unique(d.Country)[0]
    print(i, "{:<6}".format(label),
          "dim: {}".format(length),
         "from: {} to: {}".format(d.Date.iloc[0],
                                  d.Date.iloc[-1]))

0 CAN    dim: (84, 21) from: 2014M01 to: 2020M12
1 GER    dim: (84, 31) from: 2014M01 to: 2020M12
2 US     dim: (84, 27) from: 2014M01 to: 2020M12
3 ARG    dim: (84, 28) from: 2014M01 to: 2020M12
4 POR    dim: (84, 30) from: 2014M01 to: 2020M12
5 RUS    dim: (84, 25) from: 2014M01 to: 2020M12
6 CHN    dim: (84, 20) from: 2014M01 to: 2020M12
7 SAFR   dim: (84, 28) from: 2014M01 to: 2020M12
8 QAT    dim: (84, 22) from: 2014M01 to: 2020M12


### Load datasets of targets

In [8]:
path = ("C:/Users/Ong Jia Yi/Desktop/STUDY/4b. Fall 2020 - Winter 2021/Winter "
      "- ECO421H (Macroeconomic Finance)/Assignments/Project/Data/"
      "Stock Datasets")

filenames = glob.glob(path + "/*ex2*.csv")

data_targets_load = []
for filename in filenames:
    data_targets_load.append(pd.read_csv(filename))

In [9]:
# displaying indices
for i in range(len(data_targets_load)):
    d = data_targets_load[i]
    length = d.shape[0]
    label = np.unique(d.Index)[0]
    print(i, "{:<6}".format(label),
          "nrow: {}".format(length),
         "from: {} to: {}".format(d.mon_y.iloc[0],
                                  d.mon_y.iloc[-1]))

0 SPTSE  nrow: 84 from: Y2014M2 to: Y2021M1
1 DAXI   nrow: 84 from: Y2014M2 to: Y2021M1
2 SPX    nrow: 84 from: Y2014M2 to: Y2021M1
3 MERVAL nrow: 84 from: Y2014M2 to: Y2021M1
4 PSI    nrow: 84 from: Y2014M2 to: Y2021M1
5 RTSI   nrow: 84 from: Y2014M2 to: Y2021M1
6 SSE    nrow: 84 from: Y2014M2 to: Y2021M1
7 JSE    nrow: 84 from: Y2014M2 to: Y2021M1
8 QE     nrow: 84 from: Y2014M2 to: Y2021M1


In [10]:
np.unique([data_targets_load[i].columns 
          for i in range(len(data_targets_load))])

array(['1-ahead', '2-ahead', 'Index', 'current', 'mon', 'mon_y'],
      dtype=object)

Countries and their corresponding indexes are arranged in the same order in the lists. 

Target indexes start one month later and end two months later (for up to two-months ahead prediction).

#### Rename desired column to 't' for algorithms to select as target column

In [11]:
data_targets = [data_targets_load[i].rename(columns={"current":"t"})
                for i in range(len(data_targets_load))]

In [12]:
data_targets[0].head(1)

Unnamed: 0,t,1-ahead,2-ahead,Index,mon_y,mon
0,1,1,1,SPTSE,Y2014M2,2


## Part 1. Variable Selection with Logistic Classifier

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

#### Define function to fit LASSO and select important features

In [14]:
def top_variables(data, targets, country):
    """Selects variables by the LASSO algorithm.
    
    Args:
        data: a pd dataframe containing predictor data
        targets: a pd dataframe containing stock (target) data
        country: a string of the country name
    Returns:
        a tuple where element 0 is the country
        name and element 1 is a list of most important variables
        in descending order, as determined by LASSO.
    """

    data_X = data.drop(["Country", "Date"], axis=1)
    data_t = targets.t
    
    # STEP 1: fit LASSO
    lasso_model = LogisticRegression(max_iter = 10000, solver='liblinear', 
                                     penalty='l1')
    lasso_fit = lasso_model.fit(data_X, data_t)
    
    # STEP 2: extract features in order of importance
    importance = lasso_fit.coef_[0]
    ind = np.argsort(-abs(importance))
    # exclude variables 'shrunk' to zero
    top_list = [data_X.columns[i] for i in ind if importance[i] != 0]
    out = (country, top_list)
    
    return out

#### Iteratively apply function for each dataset

In [15]:
dct_topvars = dict()

for i in range(len(data_predictors)):
    data = data_predictors[i]
    targets = data_targets[i]
    country = np.unique(data.Country)[0]
    
    top = top_variables(data, targets, country)
    dct_topvars[top[0]] = top[1]

In [16]:
# display top 10 for each country and change in variable count
# from shrinkage
indexes, col1, col2 = [], [], []
j=0
for i in dct_topvars.keys():
    n = data_predictors[j].drop(["Country", "Date"], axis=1).shape[1]
    indexes.append(i)
    n = data_predictors[j].drop(["Country", "Date"], axis=1).shape[1]
    col1.append("{} -> {}".format(n, len(dct_topvars[i])))
    col2.append(', '.join(dct_topvars[i][:10]))
    j+=1

In [38]:
pd.DataFrame({"Shrinkage": col1, "Top 10": col2}, index=indexes)

Unnamed: 0,Shrinkage,Top 10
CAN,19 -> 17,"C2, mon, H2, F3, F2, E1, E3, G4, G3, H5"
GER,29 -> 18,"A6, C2, C1, F2, E3, E1, F4, F3, H2, F1"
US,25 -> 18,"C1, mon, A6, F2, H2, F4, E1, E3, F3, F1"
ARG,26 -> 24,"B5, B7, mon, B9, H2, B10, C2, B3, A2, B1"
POR,28 -> 21,"B19, B17, mon, A6, C2, C1, H2, F2, F3, E1"
RUS,23 -> 20,"mon, A6, C2, B3, C1, H2, F3, F2, E3, H4"
CHN,18 -> 15,"A6, C2, mon, H2, F2, F3, E2, E1, H4, H3"
SAFR,26 -> 18,"B11, C2, A6, mon, F2, F3, E1, E3, H1, H4"
QAT,20 -> 19,"B9, B5, B7, B1, B3, mon, F2, F3, H2, E1"


## Part 2. Fitting Classifiers and Evaluating Predictability of Returns

In [18]:
from sklearn.metrics import roc_auc_score

#### Define function for Walk-Forward Validation

In [19]:
def walk_forward_cv(data, target, min_start, model):
    """Implements the walk-foward cross-validation algorithm.
    
    Args:
        data: a pd dataframe of predictor data
        target: a pd series of the binary target variable
        min_start: (int) parameter for the starting training sample size
        model: a sklearn model object
    
    Precondition: (at least as many rows as columns, less than maximum rows)
        data.shape[1] <= min_start <= len(target) - 1
        
    Returns:
        a tuple. first element is a list of training misclassification rates 
        (total misclassified cases divided by total training cases). second
        element is a float of 1-step-ahead validation misclassification rate 
        (total misclassified cases divided by total validation cases).
    """
    miss_train, miss_test = [], []
    n = len(target)
    
    i = min_start
    while i <= n - 1:
        X_train = data[:i]
        y_train = target[:i]
        X_test = data[i:i + 1]
        y_test = target[i:i + 1]
        
        model_fit = model.fit(X_train, y_train)
        pred_train = model_fit.predict(X_train)
        pred_test = model_fit.predict(X_test)
        
        miss_train.append(sum(pred_train != y_train)/len(y_train))
        miss_test.append(sum(pred_test != y_test))
        
        i += 1
        
    err_v = sum(miss_test)/(n - min_start)
    
    return (miss_train, err_v)

#### Define function to evaluate predictability with model on all datasets

In [20]:
def eval_model(model, min_start, countries, data_predictors,
               data_targets, dct_topvars, top_n):
    """Evaluates predictability with <model> based on
    walk-forward cross-validation error rates and AUC score.
    
    Args:
        model: a sklearn model object
        min_start: (int) parameter for the starting training sample size
                    for the walk-forward cv algorithm
        countries: a list of country labels ordered by the 
                   sequence of appearance in <data_predictors>
        data_predictors: a list of datasets of predictors
        data_targets: a list of datasets of the target variable
        dct_topvars: a dictionary mapping country labels to their
                     most important features
    Returns:
        a pd dataframe where the columns are the walk-forward
        cross-validation training and validation error and the
        AUC score. Each row is the scores for each country.
    """
    err_t, err_v, auc = [], [], []
    
    for i in range(len(data_predictors)):
        data = data_predictors[i].drop(["Country", "Date"], axis=1)
        
        # use only top_n most important features
        if top_n == "all":
            select = dct_topvars[countries[i]]
        else:
            select = dct_topvars[countries[i]][:top_n]
        data = data.loc[:,select]
        target = data_targets[i].t
        
        # runs walk_forward cv algorithm
        cv = walk_forward_cv(data, target, min_start, model)
        err_t.append(np.mean(cv[0]))
        err_v.append(cv[1])
        
        # computes AUC score using 70-30 train-test split
        X_train, X_test, y_train, y_test = \
        train_test_split(data, target, test_size = 0.3, random_state = 0)
        model_fit = model.fit(X_train, y_train)
        pred = model_fit.predict(X_test)
        score = roc_auc_score(y_test, pred)
        auc.append(score)
        
    out = pd.DataFrame({"Training Error (mean)": err_t,
                        "Validation Error": err_v,
                        "AUC Scores": auc})
    out.set_axis(countries, axis=0, inplace=True)
        
    return out

In [21]:
countries = list(dct_topvars.keys())
countries

['CAN', 'GER', 'US', 'ARG', 'POR', 'RUS', 'CHN', 'SAFR', 'QAT']

### 1. Evaluate predictability with standard Logistic Classifier

In [22]:
model1 = LogisticRegression(max_iter=10000)

In [23]:
%%time
e1 = eval_model(model1, 59, countries, data_predictors,
                data_targets, dct_topvars, 10)

Wall time: 8.83 s


In [24]:
round(e1, 4)

Unnamed: 0,Training Error (mean),Validation Error,AUC Scores
CAN,0.3099,0.4,0.5655
GER,0.3769,0.52,0.7188
US,0.3194,0.28,0.5242
ARG,0.3616,0.48,0.3273
POR,0.4153,0.64,0.5694
RUS,0.302,0.52,0.5686
CHN,0.3824,0.36,0.55
SAFR,0.3455,0.48,0.5
QAT,0.3701,0.48,0.6091


### 2. Evaluate predictability with Random Forest

In [25]:
from sklearn.ensemble import RandomForestClassifier

In [26]:
model2 = RandomForestClassifier(random_state=0, max_depth=10,
                                min_samples_leaf=2)

In [27]:
%%time
e2 = eval_model(model2, 59, countries, data_predictors,
                data_targets, dct_topvars, 10)

Wall time: 30.5 s


In [28]:
round(e2, 5)

Unnamed: 0,Training Error (mean),Validation Error,AUC Scores
CAN,0.04792,0.44,0.55952
GER,0.01727,0.56,0.56875
US,0.03551,0.32,0.52424
ARG,0.0056,0.52,0.51818
POR,0.01717,0.48,0.65972
RUS,0.03011,0.52,0.62745
CHN,0.00988,0.72,0.45625
SAFR,0.01833,0.48,0.53846
QAT,0.01625,0.56,0.50606


### 3. Evaluate predictability with KNN(2-9)

In [29]:
from sklearn.neighbors import KNeighborsClassifier

In [30]:
model3_1 = KNeighborsClassifier(n_neighbors = 2)
model3_2 = KNeighborsClassifier(n_neighbors = 3)
model3_3 = KNeighborsClassifier(n_neighbors = 4)
model3_4 = KNeighborsClassifier(n_neighbors = 5)
model3_5 = KNeighborsClassifier(n_neighbors = 6)
model3_6 = KNeighborsClassifier(n_neighbors = 7)
model3_7 = KNeighborsClassifier(n_neighbors = 8)
model3_8 = KNeighborsClassifier(n_neighbors = 9)

In [31]:
%%time
n_start = 59
n = 10 # specify number of top variables to use, or "all"
e3_1 = eval_model(model3_1, n_start, countries, data_predictors,
                  data_targets, dct_topvars, n)
e3_2 = eval_model(model3_2, n_start, countries, data_predictors,
                  data_targets, dct_topvars, n)
e3_3 = eval_model(model3_3, n_start, countries, data_predictors,
                  data_targets, dct_topvars, n)
e3_4 = eval_model(model3_4, n_start, countries, data_predictors,
                  data_targets, dct_topvars, n)
e3_5 = eval_model(model3_5, n_start, countries, data_predictors,
                  data_targets, dct_topvars, n)
e3_6 = eval_model(model3_6, n_start, countries, data_predictors,
                  data_targets, dct_topvars, n)
e3_7 = eval_model(model3_7, n_start, countries, data_predictors,
                  data_targets, dct_topvars, n)
e3_8 = eval_model(model3_8, n_start, countries, data_predictors,
                  data_targets, dct_topvars, n)

Wall time: 13.7 s


In [32]:
table3 = pd.concat([e3_1, e3_2, e3_3, e3_4, e3_5, e3_6, e3_7, e3_8], axis=1)
table3.columns = [' | '.join([i, j]) 
                 for i in ["k=2", "k=3", "k=4", "k=5", "k=6", "k=7", "k=8", "k=9"] 
                 for j in ["err_t", "err_v", "auc"]]

In [33]:
# validation errors
table3.loc[:, table3.columns.str.contains("/*err_v")]

Unnamed: 0,k=2 | err_v,k=3 | err_v,k=4 | err_v,k=5 | err_v,k=6 | err_v,k=7 | err_v,k=8 | err_v,k=9 | err_v
CAN,0.48,0.6,0.64,0.6,0.6,0.48,0.48,0.48
GER,0.56,0.6,0.6,0.52,0.52,0.4,0.56,0.44
US,0.36,0.48,0.32,0.36,0.36,0.36,0.28,0.36
ARG,0.52,0.56,0.52,0.6,0.6,0.56,0.68,0.68
POR,0.44,0.4,0.36,0.4,0.48,0.48,0.48,0.32
RUS,0.52,0.68,0.56,0.56,0.48,0.6,0.48,0.64
CHN,0.56,0.68,0.6,0.6,0.56,0.56,0.52,0.52
SAFR,0.48,0.44,0.4,0.44,0.36,0.4,0.36,0.52
QAT,0.48,0.52,0.4,0.4,0.28,0.24,0.32,0.28


### 4. Evaluating predictability with Naive Bayes Classifier

In [34]:
from sklearn.naive_bayes import GaussianNB

In [39]:
model4 = GaussianNB()

In [40]:
%%time
e4 = eval_model(model4, 59, countries, data_predictors,
                data_targets, dct_topvars, 10)

Wall time: 1.11 s


In [41]:
e4

Unnamed: 0,Training Error (mean),Validation Error,AUC Scores
CAN,0.380772,0.52,0.452381
GER,0.366569,0.68,0.6
US,0.385313,0.32,0.515152
ARG,0.417207,0.56,0.406061
POR,0.40222,0.52,0.513889
RUS,0.47866,0.48,0.558824
CHN,0.39559,0.52,0.575
SAFR,0.378957,0.44,0.461538
QAT,0.423812,0.36,0.627273


### 5. Evaluating predictability with Multilayer Perceptron Classifier

In [42]:
from sklearn.neural_network import MLPClassifier

In [43]:
model5_1 = MLPClassifier(hidden_layer_sizes=(100,)*1,
                         activation="relu", max_iter=1000)
model5_2 = MLPClassifier(hidden_layer_sizes=(100,)*2,
                         activation="relu", max_iter=1000)
model5_3 = MLPClassifier(hidden_layer_sizes=(100,)*3,
                         activation="relu", max_iter=1000)
model5_4 = MLPClassifier(hidden_layer_sizes=(100,)*4,
                         activation="relu", max_iter=1000)
model5_5 = MLPClassifier(hidden_layer_sizes=(100,)*5,
                         activation="relu", max_iter=1000)
model5_6 = MLPClassifier(hidden_layer_sizes=(100,)*6,
                         activation="relu", max_iter=1000)
model5_7 = MLPClassifier(hidden_layer_sizes=(100,)*7,
                         activation="relu", max_iter=1000)

In [44]:
%%time
n_start = 59
n = 10
e5_1 = eval_model(model5_1, n_start, countries, data_predictors,
                  data_targets, dct_topvars, "all")
e5_2 = eval_model(model5_2, n_start, countries, data_predictors,
                  data_targets, dct_topvars, "all")
e5_3 = eval_model(model5_3, n_start, countries, data_predictors,
                  data_targets, dct_topvars, "all")
e5_4 = eval_model(model5_4, n_start, countries, data_predictors,
                  data_targets, dct_topvars, "all")
e5_5 = eval_model(model5_5, n_start, countries, data_predictors,
                  data_targets, dct_topvars, "all")
e5_6 = eval_model(model5_6, n_start, countries, data_predictors,
                  data_targets, dct_topvars, "all")
e5_7 = eval_model(model5_6, n_start, countries, data_predictors,
                  data_targets, dct_topvars, "all")

Wall time: 2min 44s


In [45]:
table5 = pd.concat([e5_1, e5_2, e5_3, e5_4, e5_5, e5_6, e5_7], axis=1)
table5.columns = [' | '.join([i, j]) 
                 for i in ["d=1", "d=2", "d=3", "d=4", "d=5", "d=6", "d=7"] 
                 for j in ["err_t", "err_v", "auc"]]

In [46]:
table5.loc[:,table5.columns.str.contains("/*err_v")]

Unnamed: 0,d=1 | err_v,d=2 | err_v,d=3 | err_v,d=4 | err_v,d=5 | err_v,d=6 | err_v,d=7 | err_v
CAN,0.48,0.44,0.44,0.48,0.68,0.48,0.52
GER,0.6,0.56,0.48,0.56,0.56,0.36,0.6
US,0.56,0.56,0.32,0.56,0.44,0.4,0.36
ARG,0.4,0.52,0.56,0.6,0.28,0.48,0.44
POR,0.4,0.48,0.36,0.6,0.48,0.48,0.6
RUS,0.6,0.64,0.52,0.64,0.52,0.52,0.64
CHN,0.56,0.56,0.56,0.32,0.4,0.48,0.48
SAFR,0.56,0.32,0.36,0.36,0.76,0.6,0.56
QAT,0.52,0.36,0.44,0.68,0.76,0.56,0.56


## Displaying model performance for one country

In [153]:
country = "CAN"

In [154]:
e1[e1.index == country]

Unnamed: 0,Training Error (mean),Validation Error,AUC Scores
CAN,0.309915,0.4,0.565476


In [155]:
e2[e2.index == country]

Unnamed: 0,Training Error (mean),Validation Error,AUC Scores
CAN,0.04792,0.44,0.559524


In [156]:
table3.loc[table3.index == country, table3.columns.str.contains("/*k=[1-5]")]

Unnamed: 0,k=2 | err_t,k=2 | err_v,k=2 | auc,k=3 | err_t,k=3 | err_v,k=3 | auc,k=4 | err_t,k=4 | err_v,k=4 | auc,k=5 | err_t,k=5 | err_v,k=5 | auc
CAN,0.16423,0.48,0.64881,0.186119,0.6,0.583333,0.190328,0.64,0.529762,0.278924,0.6,0.595238


In [157]:
table3.loc[table3.index == country, table3.columns.str.contains("/*k=[6-9]")]

Unnamed: 0,k=6 | err_t,k=6 | err_v,k=6 | auc,k=7 | err_t,k=7 | err_v,k=7 | auc,k=8 | err_t,k=8 | err_v,k=8 | auc,k=9 | err_t,k=9 | err_v,k=9 | auc
CAN,0.340473,0.6,0.488095,0.332264,0.48,0.511905,0.392488,0.48,0.482143,0.341355,0.48,0.440476


In [158]:
e4[e4.index == country]

Unnamed: 0,Training Error (mean),Validation Error,AUC Scores
CAN,0.380772,0.52,0.452381


In [159]:
table5.loc[table5.index == country, table5.columns.str.contains("/*d=[1-3]")]

Unnamed: 0,d=1 | err_t,d=1 | err_v,d=1 | auc,d=2 | err_t,d=2 | err_v,d=2 | auc,d=3 | err_t,d=3 | err_v,d=3 | auc
CAN,0.466445,0.48,0.470238,0.473343,0.44,0.630952,0.468649,0.44,0.494048


In [160]:
table5.loc[table5.index == country, table5.columns.str.contains("/*d=[4-7]")]

Unnamed: 0,d=4 | err_t,d=4 | err_v,d=4 | auc,d=5 | err_t,d=5 | err_v,d=5 | auc,d=6 | err_t,d=6 | err_v,d=6 | auc,d=7 | err_t,d=7 | err_v,d=7 | auc
CAN,0.478317,0.48,0.5,0.505157,0.68,0.553571,0.485966,0.48,0.5,0.48731,0.52,0.5
