In [2]:
import pandas as pd 
import numpy as np

In [119]:
from utils.utils import *
from utils.clarkWestTest import clarkWestTest

In [199]:
def readFile(architecture, dataset, variable, hidden = None):
    """
    Load the stored prediction frame of one model configuration.

    The parquet file name is assembled as
        output/<architecture>_<dataset>[_<hidden>][_<variable>].gzip
    where
      * <hidden> is str(hidden) with the brackets removed and ', ' turned
        into '_' (the part is left out when hidden is None);
      * <variable> is str(variable) with spaces and '%' signs removed
        (the part is left out when variable equals 'ALL').

    Returns whatever pd.read_parquet yields for that path.
    """
    nameParts = [str(architecture), str(dataset)]

    if hidden is not None:
        # e.g. [32, 16] -> '32_16', 32 -> '32'
        hiddenTag = str(hidden)
        for old, new in (('[', ''), (']', ''), (', ', '_')):
            hiddenTag = hiddenTag.replace(old, new)
        nameParts.append(hiddenTag)

    if not (variable == 'ALL'):
        # e.g. 'TBL (ann %)' -> 'TBL(ann)'
        nameParts.append(str(variable).replace(' ', '').replace('%', ''))

    return pd.read_parquet('output/' + '_'.join(nameParts) + '.gzip')

# Amalgamation
The main idea is to build an amalgamation per variable: take the predictions of all architectures at time point t and average them. This raises the question of how to combine them, since, for example, LSTM1/CNN1 does not have the same structure as FNN1.

Amalgamation of all the models should be doable; things get more complicated once we start looking at all the variables.

* Amalgamation MEV
* Amalgamation TA
* Amalgamation ALL
* Amalgamation PCA MEV
* Amalgamation PCA TA
* Amalgamation PCA ALL
* Amalgamation per MEV variable
* Amalgamation per TA variable 

In [243]:
def _hiddenSizesFor(architecture):
    """Return the hidden-unit settings to load for an architecture; [None] means 'no hidden suffix in the file name'."""
    if architecture in ('CNN', 'RF'):
        return [None]
    if architecture in ('MLP', 'LSTM'):
        return [32, 16, 8, 4, 2]
    return [[32], [32, 16], [32, 16, 8], [32, 16, 8, 4], [32, 16, 8, 4, 2]]


def amalgamateResults(results, architectures, dataset, PCA = False):
    """
    Collect, per variable, the prediction vectors of every model configuration.

    For each architecture, each variable and each hidden-unit setting of that
    architecture the matching prediction file is loaded through readFile and
    its Pred column is stored. The returned dictionary maps a variable name to
    a DataFrame in which every column is the Pred vector of one model
    configuration, so a row-wise mean of that frame is the amalgamated
    prediction for the variable.

    Parameters
    ----------
    results : frame with a Dataset column. Its unique values, with the
        '<dataset>: ' prefix removed, are used as the variable names.
    architectures : iterable of architecture names. 'CNN' and 'RF' are read
        without a hidden-unit suffix, 'MLP' and 'LSTM' with the single sizes
        32, 16, 8, 4, 2 and every other name with the nested layer lists
        [32] ... [32, 16, 8, 4, 2].
    dataset : dataset name used in the file names (and as label prefix).
    PCA : when truthy every entry is read under the fixed variable name 'PCA',
        because the PCA files use a different naming convention.

    Returns
    -------
    dict : {variable name: DataFrame of Pred columns}. Empty when nothing was read.

    Notes
    -----
    * The `results` argument is no longer overwritten by the frames read inside
      the loop.
    * The previous bare `except:` around pd.concat replaced everything collected
      so far by the latest series whenever concat failed; such errors now
      propagate instead of silently dropping models.
    * Variable names are de-duplicated, so with PCA = True the same files are
      not read once per Dataset entry. The row-wise mean is unaffected.
    """
    prefix = dataset + ': '
    variableNames = ['PCA' if PCA else name.replace(prefix, '') for name in results.Dataset.unique()]
    # Keep first-seen order while dropping repeated names.
    variableNames = list(dict.fromkeys(variableNames))

    collected = dict()
    for architecture in architectures:
        hiddenSizes = _hiddenSizesFor(architecture)
        for variable in variableNames:
            for hidden in hiddenSizes:
                # readFile already knows the naming rules (with/without hidden part, 'ALL' special case).
                predictionFrame = readFile(architecture, dataset, variable, hidden)
                collected.setdefault(variable, []).append(predictionFrame.Pred)

    # Concatenate once per variable instead of growing a frame inside the loop.
    return {variable: pd.concat(columns, axis = 1) for variable, columns in collected.items()}
            
   

In [270]:
def getAmalgamationResults(results, aggregatedDict, dataset, PCA = False):
    """
    Evaluate the amalgamated (row-wise averaged) prediction of every variable.

    Parameters
    ----------
    results : frame with a Dataset column. Its unique values, with the
        '<dataset>: ' prefix removed, are the variable names to evaluate.
    aggregatedDict : {variable name: DataFrame of prediction columns}, as
        produced by amalgamateResults.
    dataset : dataset name, used to locate the reference file and as label prefix.
    PCA : when truthy every entry is evaluated under the fixed name 'PCA'.

    Returns
    -------
    The frame handed back by analyzeResults after all variables were processed.
    It starts as an empty frame with the columns
    Method, Dataset, R2, CW, DA, DA HA, MSFE, MSFE HA.

    Raises
    ------
    KeyError : when aggregatedDict holds no predictions for a variable.

    Notes
    -----
    The label is '<dataset>: <variable>', except for the variable 'ALL', which
    is labelled plain 'ALL'. The earlier `variable is not 'ALL'` compared
    object identity instead of value, so that exception never applied.
    """
    resultsDF = pd.DataFrame(columns=['Method', 'Dataset', 'R2', 'CW', 'DA', 'DA HA', 'MSFE', 'MSFE HA'])
    prefix = dataset + ': '

    for rawName in results.Dataset.unique():
        # PCA files use a fixed name instead of the variable name.
        variable = 'PCA' if PCA else rawName.replace(prefix, '')

        predictions = aggregatedDict.get(variable)
        if predictions is None:
            raise KeyError('No aggregated predictions available for variable ' + repr(variable))

        # Amalgamated prediction: row-wise average over all model predictions.
        amalgamated = predictions.mean(axis=1)

        # The original notebook comment states that the actual and HA values are identical in every file,
        # so one fixed file (MLP, hidden size 32) is used as the carrier frame.
        evaluationFrame = readFile('MLP', dataset, str(variable), [32])

        # Replace the single-model predictions by the amalgamated ones.
        evaluationFrame['Pred'] = amalgamated

        if variable == 'ALL':
            label = str(variable)
        else:
            label = dataset + ': ' + str(variable)
        resultsDF = analyzeResults(evaluationFrame, resultsDF, 'Amalgamation', label)

    return resultsDF


### Amalgamation per variable MEV

In [249]:
# Variables setup:
dataset = 'MEV'
architectures = ['CNN', 'MLP', 'FNN']
# Hand the path to pandas instead of an open() handle that was never closed.
results = pd.read_excel('output/ALL.xlsx', sheet_name='MEV Variables', engine='openpyxl', index_col=0)

# Get amalgamated predictions
aggregatedDict = amalgamateResults(results, architectures, dataset)

# Get results based on amalgamation (reuse `dataset` so both calls always agree)
resultsMEV = getAmalgamationResults(results, aggregatedDict, dataset = dataset)
resultsMEV

Unnamed: 0,Method,Dataset,R2,CW,DA,DA HA,MSFE,MSFE HA
0,Amalgamation,MEV: DP,-48.738,-0.56,48.3,49.69,1.62,1.09
1,Amalgamation,MEV: DY,-1.256,-0.66,45.52,49.69,1.11,1.09
2,Amalgamation,MEV: EP,-0.922,0.57,47.53,49.69,1.1,1.09
3,Amalgamation,MEV: DE,-15.519,-0.83,48.61,49.69,1.26,1.09
4,Amalgamation,MEV: RVOL,-0.537,0.44,48.61,49.69,1.1,1.09
5,Amalgamation,MEV: BM,-3.147,0.76,51.39,49.69,1.13,1.09
6,Amalgamation,MEV: NTIS,-0.046,1.02,50.15,49.69,1.09,1.09
7,Amalgamation,MEV: TBL (ann %),-0.507,0.3,45.52,49.69,1.1,1.09
8,Amalgamation,MEV: LTY (ann %),-1.643,0.34,50.0,49.69,1.11,1.09
9,Amalgamation,MEV: LTR (%),-1.022,-0.05,46.91,49.69,1.1,1.09


### Amalgamation per variable TA

In [250]:
# Variables setup:
dataset = 'TA'
architectures = ['CNN', 'MLP', 'FNN']
# Hand the path to pandas instead of an open() handle that was never closed.
results = pd.read_excel('output/ALL.xlsx', sheet_name='TA Variables', engine='openpyxl', index_col=0)

# Get amalgamated predictions
aggregatedDict = amalgamateResults(results, architectures, dataset)

# Get results based on amalgamation (reuse `dataset` so both calls always agree)
resultsTA = getAmalgamationResults(results, aggregatedDict, dataset = dataset)
resultsTA

Unnamed: 0,Method,Dataset,R2,CW,DA,DA HA,MSFE,MSFE HA
0,Amalgamation,"TA: MA(1,9)",-1.926,-1.29,47.38,49.69,1.11,1.09
1,Amalgamation,"TA: MA(1,12)",-0.755,0.06,48.77,49.69,1.1,1.09
2,Amalgamation,"TA: MA(2,9)",-2.012,-1.25,48.92,49.69,1.11,1.09
3,Amalgamation,"TA: MA(2,12)",-0.295,0.83,46.91,49.69,1.1,1.09
4,Amalgamation,"TA: MA(3,9)",0.028,1.35*,52.16,49.69,1.09,1.09
5,Amalgamation,"TA: MA(3,12)",-1.167,-0.84,47.69,49.69,1.1,1.09
6,Amalgamation,TA: MOM(9),-0.601,-0.05,50.77,49.69,1.1,1.09
7,Amalgamation,TA: MOM(12),-2.162,-1.21,46.76,49.69,1.12,1.09
8,Amalgamation,"TA: VOL(1,9)",-1.756,-0.94,49.07,49.69,1.11,1.09
9,Amalgamation,"TA: VOL(1,12)",-0.621,0.13,51.7,49.69,1.1,1.09


### Amalgamation for ALL model (MEV + TA)

In [251]:
# Variables setup:
dataset = 'ALL'
architectures = ['CNN', 'MLP', 'FNN']
# Hand the path to pandas instead of an open() handle that was never closed.
results = pd.read_excel('output/ALL.xlsx', sheet_name='Accuracy All', engine='openpyxl', index_col=0)

# Get amalgamated predictions
aggregatedDict = amalgamateResults(results, architectures, dataset)

# Get results based on amalgamation (reuse `dataset` so both calls always agree)
resultsALL = getAmalgamationResults(results, aggregatedDict, dataset = dataset)
resultsALL

Unnamed: 0,Method,Dataset,R2,CW,DA,DA HA,MSFE,MSFE HA
0,Amalgamation,ALL: ALL,-2.379,1.6*,50.31,49.69,1.12,1.09


### Amalgamation for PCA model (MEV, TA, MEV + TA)

In [271]:
architectures = ['CNN', 'MLP', 'FNN']

# Read the PCA accuracy sheet once; the path is handed to pandas so no open() handle is left unclosed.
accuracyPCA = pd.read_excel('output/ALL.xlsx', sheet_name='Accuracy PCA', engine='openpyxl', index_col=0)

# Run the same analysis for the MEV-only, TA-only and MEV + TA PCA models.
resultsPerDataset = []
for dataset in ['MEV', 'TA', 'ALL']:
    results = accuracyPCA[accuracyPCA.Dataset == dataset]

    # Get amalgamated predictions
    aggregatedDict = amalgamateResults(results, architectures, dataset, PCA = True)

    # Get results based on amalgamation
    resultsPerDataset.append(getAmalgamationResults(results, aggregatedDict, dataset = dataset, PCA = True))

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the same way (row index kept as is).
resultsPCA = pd.concat(resultsPerDataset)

resultsPCA

Unnamed: 0,Method,Dataset,R2,CW,DA,DA HA,MSFE,MSFE HA
0,Amalgamation,MEV: PCA,-0.722,1.99**,50.15,49.69,1.1,1.09
0,Amalgamation,TA: PCA,-1.817,0.51,49.38,49.69,1.11,1.09
0,Amalgamation,ALL: PCA,-7.657,-0.96,46.6,49.69,1.18,1.09
