In [1]:
import pandas as pd
import os
import numpy as np
import plotly.express as px
import itertools
import matplotlib.pyplot as plt
import math
import plotly.graph_objects as go
import plotly.colors
from plotly.subplots import make_subplots        
from PIL import ImageColor
import pickle
import time
from scipy.spatial import distance
import dcor

from sklearn.pipeline import Pipeline, TransformerMixin
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.neighbors import LocalOutlierFactor


# Framework for automated evaluation of column tuples

The aim is to provide a routine that gets fed a dataframe / csv-file, runs data analysis routines and returns metrics and plots à la

`metrics, plot = predictability(df, num_input_columns, num_target_columns, list_of_considered_columns, "analysis method")`

The metrics we use are 

| r2  |  RMSE | RMSE/std | MAPE  | RAE  | Distance correlation  |
| :-: | :-: | :-: | :-: | :-: | :-: |
| $$1-\frac{\sum (\hat{y}-y)^2}{\sum (\bar{y}-y)^2}$$ | $$\sqrt{\frac{1}{N} \sum (\hat{y}-y)^2 }$$ | $$\frac{1}{\sigma_y}\sqrt{\frac{1}{N} \sum (\hat{y}-y)^2 }$$ | $$\frac{1}{N}\sum \frac{\lvert \hat{y}-y \rvert}{\lvert y\rvert}$$ | $$\frac{\sum \lvert \hat{y}-y\rvert}{\sum \lvert \bar{y}-y\rvert }$$ |  [cf. documentation](https://dcor.readthedocs.io/en/latest/theory.html)<br/> [cf. paper](https://projecteuclid.org/journals/annals-of-statistics/volume-35/issue-6/Measuring-and-testing-dependence-by-correlation-of-distances/10.1214/009053607000000505.full)<br/> [cf. wiki](https://en.wikipedia.org/wiki/Distance_correlation) |

with

$y$: observed data<br/>
$\hat{y}$: predicted counterpart of $y$<br/>
$N$: number of observations, i.e. of pairs $(y, \hat{y})$<br/>
$\bar{y}$: average of $y$<br/>
$\sigma_y$: standard deviation of $y$


# Open questions

* Should primary keys be considered or do we assume the data(frame) is handed over properly also with respect to fixing possible primary keys?\
In the example of the Country Indicators data, Year and Country are not ordinary data columns. Whether we drop them or fix one of them depends on what we want to investigate.
* Should input be possible as dataframe only, or also as csv- / txt-file?
* We should also go through all relevant permutations of the data tuples.\
If we have, e.g., four data columns [A, B, C, D] and want to analyse 2-2 connections, it does not suffice to only consider input=[A, B], output=[C, D]. There may well be no causal connection between any of A,B and any of C,D, but instead between C and D. So we need to consider all $\frac{N!}{I!\cdot O!\cdot (N-I-O)!}$ many combinations (choose the $I$ inputs first, then the $O$ outputs from the remaining columns), given $N$ data columns, $I$-many inputs and $O$-many outputs.
* Currently, RMSE/std relies on the standard deviation of the test values $y$. One could of course also use the overall available target values – the combined train and test values.
* Decision on whether outliers are extracted or not is currently based on *test* score.

### load example data

In [2]:
df = pd.read_csv("processed_country_indicators.csv").drop(columns=["Unnamed: 0"])

In [3]:
df

Unnamed: 0,Country Name,Year,"Agriculture, value added (% of GDP)",CO2 emissions (metric tons per capita),Domestic credit provided by financial sector (% of GDP),Electric power consumption (kWh per capita),Energy use (kg of oil equivalent per capita),Exports of goods and services (% of GDP),"Fertility rate, total (births per woman)",GDP growth (annual %),Imports of goods and services (% of GDP),"Industry, value added (% of GDP)","Inflation, GDP deflator (annual %)","Life expectancy at birth, total (years)",Population density (people per sq. km of land area),"Services, etc., value added (% of GDP)"
0,Afghanistan,1962,,0.073781,21.276422,,,4.878051,7.450,,9.349593,,,33.219902,14.312061,
1,Afghanistan,1967,,0.123782,9.917662,,,6.772908,7.450,,14.209827,,,35.389415,15.881812,
2,Afghanistan,1972,,0.130820,18.880833,,,14.763231,7.450,,18.105850,,,37.610146,17.947027,
3,Afghanistan,1977,,0.183118,13.836822,,,11.662904,7.449,,14.823175,,,40.110146,19.998926,
4,Afghanistan,1982,,0.165879,,,,,7.450,,,,,43.230732,19.402324,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2635,Zimbabwe,1987,14.407528,1.598217,74.161607,878.072691,896.673611,24.015710,5.784,1.150737,21.274886,32.451242,7.189361,61.753805,24.649495,53.141236
2636,Zimbabwe,1992,7.413793,1.533724,43.120518,778.695133,923.493407,27.227263,4.840,-9.015570,36.485231,40.862069,-14.129659,56.491976,28.485762,51.724135
2637,Zimbabwe,1997,18.934082,1.194678,63.058320,870.988697,804.508892,37.595273,4.237,2.680594,44.609791,25.554678,-2.879048,46.065902,31.174507,55.511236
2638,Zimbabwe,2002,14.029007,0.942795,164.559047,827.329873,772.676619,31.834799,4.018,-8.894023,34.972553,,2.712950,40.679146,32.807111,


In [4]:
# to work with 2007 data only
df2007 = df.loc[df["Year"]==2007]
df2007

Unnamed: 0,Country Name,Year,"Agriculture, value added (% of GDP)",CO2 emissions (metric tons per capita),Domestic credit provided by financial sector (% of GDP),Electric power consumption (kWh per capita),Energy use (kg of oil equivalent per capita),Exports of goods and services (% of GDP),"Fertility rate, total (births per woman)",GDP growth (annual %),Imports of goods and services (% of GDP),"Industry, value added (% of GDP)","Inflation, GDP deflator (annual %)","Life expectancy at birth, total (years)",Population density (people per sq. km of land area),"Services, etc., value added (% of GDP)"
9,Afghanistan,2007,30.622854,0.087858,0.535181,,,17.823714,6.437000,13.740205,58.350047,27.344703,22.382016,57.833829,39.637202,42.032443
19,Albania,2007,19.874798,1.322335,62.076755,1213.124369,679.861765,28.084222,1.635000,5.900000,54.788201,25.334355,3.576195,76.470293,108.394781,54.790848
29,Algeria,2007,7.688453,3.195865,-3.636038,891.805086,1075.194127,47.068163,2.661000,3.400000,24.869963,58.571640,6.395344,72.898366,14.385269,33.739908
39,American Samoa,2007,,,,,,,,,,,,,289.520000,
49,Andorra,2007,0.375713,6.350868,,,,,1.180000,0.078039,,15.871050,3.869917,,180.591489,75.213876
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2599,West Bank and Gaza,2007,7.441636,0.665297,6.261647,,,19.366849,4.627000,-1.727004,77.810672,23.234322,5.198904,71.747049,580.481063,69.324043
2609,World,2007,4.051775,4.662414,158.374008,2816.334646,1803.918178,30.254030,2.542742,4.320227,29.126501,30.142415,5.632652,69.641854,51.452805,65.829479
2619,"Yemen, Rep.",2007,9.855417,0.966383,10.299008,207.132310,318.012608,35.903431,5.180000,3.338428,43.219993,48.730554,10.888282,61.954805,41.102913,32.218702
2629,Zambia,2007,13.234505,0.139614,13.458356,651.873083,588.018959,33.591193,5.911000,8.352436,32.180265,34.885422,12.970211,52.477146,17.135926,51.880073


#### primary keys

The above dataset contains the two primary keys "Country Name" and "Year" which are not supposed to be part of the analysis.

Primary keys can be used to "zoom in" on more detailed analyses. In this case, e.g., check how numbers for Afghanistan evolved over time. Or only take data from year 2007 and find a connection between two columns, then compare it to the same data but for 2002 etc.

Therefore, the primary keys have to be treated differently from the remaining *data columns*.

In [5]:
prim_keys = ["Country Name", "Year"]


# Helper routines

### relative absolute error

In [6]:

def rae(true, predicted):
    """Relative absolute error of a prediction.

    Computes ``sum(|predicted - true|) / sum(|mean(true) - true|)``, i.e. the
    total absolute error of the prediction relative to the total absolute
    error of always predicting the mean of ``true``.

    Parameters
    ----------
    true : array-like
        Observed values.
    predicted : array-like
        Predicted values; must have the same shape as ``true``.

    Returns
    -------
    float
        The relative absolute error. If ``true`` is constant the denominator
        is zero and numpy's division semantics apply (``inf`` or ``nan``).
    """
    # Coerce to plain float arrays so that lists/tuples are accepted and
    # pandas objects are compared by position rather than aligned by index.
    true = np.asarray(true, dtype=float)
    predicted = np.asarray(predicted, dtype=float)

    numerator = np.sum(np.abs(predicted - true))
    denominator = np.sum(np.abs(np.mean(true) - true))
    return numerator / denominator


### 2d plotting

In [7]:
def plot_2d_result(data_tuple, metrics, datas, show=False):
    """Build the result figure for one (input column, target column) pair.

    The figure has two rows: a scatter plot of training values, test values,
    MLP test predictions and the detected outliers (top), and a histogram of
    the prediction errors ``y_test_pred - y_test`` (bottom). All metrics stored
    for the tuple are printed as an annotation to the right of the scatter plot.

    Parameters
    ----------
    data_tuple : tuple
        ``(input_column, target_column)``; key into ``metrics`` and ``datas``.
    metrics : dict
        Maps ``data_tuple`` to a dict holding the "MLP ..." / "linear ..."
        entries for r2, RMSE, RMSE/std, MAPE, rae and dcor.
    datas : dict
        Maps ``data_tuple`` to a dict with "X_train", "X_test", "y_train",
        "y_test", "y_test_pred", "outliers" (indexable by column name) and
        "outlier_info" ("included" or "excluded").
    show : bool, optional
        If True the figure is displayed and nothing is returned; otherwise the
        figure is returned.

    Returns
    -------
    plotly.graph_objects.Figure or None
    """
    entry = datas[data_tuple]
    tuple_metrics = metrics[data_tuple]
    outlier_y = entry["outliers"][data_tuple[1]]

    # get max and min plotting values; the outlier column is only taken into
    # account if outliers were found, since min()/max() raise on an empty sequence
    y_groups = [entry["y_train"], entry["y_test"], entry["y_test_pred"]]
    if len(outlier_y) > 0:
        y_groups.append(outlier_y)
    y_min = min(0, *(min(values) for values in y_groups))
    y_max = max(0, *(max(values) for values in y_groups))

    fig = make_subplots(
        rows=2, cols=1,
        row_heights=[0.8, 0.2],
        specs=[[{"type": "scatter"}],
               [{"type": "histogram"}]],
        vertical_spacing=.2
    )

    # train / predicted / test scatter traces (legend order as listed here)
    scatter_specs = [
        ("y_train", entry["X_train"], entry["y_train"],
         dict(marker_color="Maroon", marker_size=3, opacity=.6)),
        ("y_pred", entry["X_test"], entry["y_test_pred"],
         dict(marker_color="LightSeaGreen")),
        ("y_test", entry["X_test"], entry["y_test"],
         dict(marker_color="LightSalmon")),
    ]
    for trace_name, x_values, y_values, marker_style in scatter_specs:
        fig.add_trace(
            go.Scatter(x=x_values.reshape(len(x_values),),  # column vector -> flat array
                       y=y_values,
                       xaxis="x",
                       yaxis="y",
                       name=trace_name,
                       mode="markers",
                       **marker_style),
            row=1, col=1
        )

    # plot outliers; both cases only differ in the legend label
    outlier_info = entry["outlier_info"]
    if outlier_info in ("included", "excluded"):
        fig.add_trace(
            go.Scatter(x=entry["outliers"][data_tuple[0]],
                       y=outlier_y,
                       xaxis="x",
                       yaxis="y",
                       name="outliers (" + outlier_info + ")",
                       mode="markers", marker_color="DarkSlateBlue ",
                       ),
            row=1, col=1
        )

    # add metrics
    def fmt(metric_key):
        # every metric is shown rounded to two decimals
        return str(round(tuple_metrics[metric_key], 2))

    annotation_text = (
        '<b>r2 MLP:   </b>' + fmt("MLP r2")
        + ' <i><br>r2 lin. reg.:   </i>' + fmt("linear r2")
        + ' <b><br>RMSE MLP:   </b>' + fmt("MLP RMSE")
        + ' <i><br>RMSE lin. reg.:   </i>' + fmt("linear RMSE")
        + ' <b><br>RMSE/std MLP:   </b>' + fmt("MLP RMSE/std")
        + ' <i><br>RMSE/std lin. reg.:   </i>' + fmt("linear RMSE/std")
        + ' <b><br>MAPE MLP:   </b>' + fmt("MLP MAPE")
        + ' <i><br>MAPE lin. reg.:   </i>' + fmt("linear MAPE")
        + ' <b><br>rae MLP:   </b>' + fmt("MLP rae")
        + ' <i><br>rae lin. reg.:   </i>' + fmt("linear rae")
        + ' <b><br>dcor MLP:   </b>' + fmt("MLP dcor")
        + ' <i><br>dcor lin. reg.:   </i>' + fmt("linear dcor")
    )
    fig.add_annotation(text=annotation_text,
                       align='right',
                       showarrow=False,
                       xref='paper',
                       yref='paper',
                       x=1.223,
                       y=.73,
                       bgcolor="white",
                       )

    # add line as separator between scatter plot and histogram
    fig.add_shape(type='line',
                  x0=-.05,
                  y0=.2,
                  x1=1.05,
                  y1=.2,
                  line=dict(color='white',),
                  xref='paper',
                  yref='paper'
                  )

    # histogram of errors
    fig.add_trace(
        go.Histogram(x=entry["y_test_pred"]-entry["y_test"],
                     xaxis="x2",
                     yaxis="y2",
                     name="prediction error",
                     nbinsx=100,
                     marker_color='Tomato'),
        row=2, col=1
    )

    fig.update_layout(
        title=data_tuple[1]+'  vs.  '+data_tuple[0],
        xaxis=dict(
            title=data_tuple[0],
            gridcolor='white',
            gridwidth=2,
        ),
        yaxis=dict(
            title=data_tuple[1],
            gridcolor='white',
            gridwidth=2,
        ),
        # y_min <= 0 <= y_max by construction, so scaling widens the range on both sides
        yaxis_range=[y_min*1.01, y_max*1.01],
        xaxis2=dict(title=r"$\text{error } y_{pred}-y$"),
        yaxis2=dict(title="frequency"),
        legend=dict(bgcolor="white"),
        paper_bgcolor='rgb(243, 243, 243)',
        plot_bgcolor='rgb(243, 243, 243)',
        width=920,
        height=620
    )

    if show==True:
        fig.show()
    else:
        return fig

### outlier extraction

#### via class, if potentially inside pipeline

not in use!

In [8]:
# NOTE: the class below is deliberately disabled by wrapping it in a string
# literal ("not in use"). Nothing is defined by this cell; it merely evaluates
# to (and echoes) the string.
'''
class OutlierExtractor(TransformerMixin):
    def __init__(self, **kwargs):
        """
        Create a transformer to remove outliers. A threshold is set for selection
        criteria, and further arguments are passed to the LocalOutlierFactor class

        Keyword Args:
            neg_conf_val (float): The threshold for excluding samples with a lower
               negative outlier factor.

        Returns:
            object: to be used as a transformer method as part of Pipeline()
        """

        self.threshold = kwargs.pop('neg_conf_val', -10.0)

        self.kwargs = kwargs

    def transform(self, X, y):
        """
        Uses LocalOutlierFactor class to subselect data based on some threshold

        Returns:
            ndarray: subsampled data

        Notes:
            X should be of shape (n_samples, n_features)
        """
        X = np.asarray(X)
        y = np.asarray(y)
        lcf = LocalOutlierFactor(**self.kwargs)
        lcf.fit(X)
        return (X[lcf.negative_outlier_factor_ > self.threshold, :],
                y[lcf.negative_outlier_factor_ > self.threshold])

    def fit(self, *args, **kwargs):
        return self
'''

'\nclass OutlierExtractor(TransformerMixin):\n    def __init__(self, **kwargs):\n        """\n        Create a transformer to remove outliers. A threshold is set for selection\n        criteria, and further arguments are passed to the LocalOutlierFactor class\n\n        Keyword Args:\n            neg_conf_val (float): The threshold for excluding samples with a lower\n               negative outlier factor.\n\n        Returns:\n            object: to be used as a transformer method as part of Pipeline()\n        """\n\n        self.threshold = kwargs.pop(\'neg_conf_val\', -10.0)\n\n        self.kwargs = kwargs\n\n    def transform(self, X, y):\n        """\n        Uses LocalOutlierFactor class to subselect data based on some threshold\n\n        Returns:\n            ndarray: subsampled data\n\n        Notes:\n            X should be of shape (n_samples, n_features)\n        """\n        X = np.asarray(X)\n        y = np.asarray(y)\n        lcf = LocalOutlierFactor(**self.kwargs)\n    

#### the standard way

In [9]:
def extract_outliers(data, n_neighbors=20):
    """Split a DataFrame into inliers and outliers via LocalOutlierFactor.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric data without missing values; rows are samples.
    n_neighbors : int, optional
        Number of neighbours used by the LocalOutlierFactor (default 20,
        the value that was previously hard-coded).

    Returns
    -------
    data_extr : pandas.DataFrame
        Rows labelled as inliers (label 1).
    data_scores : numpy.ndarray
        ``negative_outlier_factor_`` of the fitted extractor, one entry per
        row of ``data``.
    outliers : pandas.DataFrame
        Rows labelled as outliers (label -1); may be empty.
    """
    extractor = LocalOutlierFactor(n_neighbors=n_neighbors)

    # fit_predict labels every row with 1 (inlier) or -1 (outlier)
    labels = extractor.fit_predict(data)

    # positional selection, so the original index labels of `data` are kept
    outliers = data.iloc[np.flatnonzero(labels == -1)]
    data_extr = data.iloc[np.flatnonzero(labels == 1)]

    data_scores = extractor.negative_outlier_factor_

    return data_extr, data_scores, outliers

### data preparation, train-test-split

In [10]:
def data_prep_split(data, inputs, outputs, test_size=.3, random_state=1):
    """Extract input/target arrays from ``data`` and split them into train and test.

    Parameters
    ----------
    data : pandas.DataFrame
        Data containing the ``inputs`` and ``outputs`` columns, without NAs.
    inputs : str or list of str
        Name(s) of the input column(s).
    outputs : str or list of str
        Name(s) of the target column(s).
    test_size : float, optional
        Fraction of the samples used for testing (default .3).
    random_state : int, optional
        Seed of the shuffled split (default 1).

    Returns
    -------
    X_train, X_test : numpy.ndarray
        2d arrays of shape (n_samples, n_inputs).
    y_train, y_test : numpy.ndarray
        1d arrays for a single target column, 2d arrays of shape
        (n_samples, n_outputs) otherwise.
    """
    # get x and y value(s)
    curr_x = np.array(data[inputs])
    if curr_x.ndim == 1:
        # single column selected by name -> turn into a column vector.
        # A 2d selection must NOT be reshaped to (-1, 1): that would flatten
        # several input columns into one and break the sample count.
        curr_x = curr_x.reshape(-1, 1)
    curr_y = np.array(data[outputs])

    # train test split
    curr_X_train, curr_X_test, curr_y_train, curr_y_test = train_test_split(curr_x, curr_y,
                                                                            random_state=random_state,
                                                                            test_size=test_size,
                                                                            shuffle=True)

    # estimators expect a flat target for single-output regression; several
    # target columns stay 2d so samples and targets do not get mixed up
    if curr_y.ndim == 1 or curr_y.shape[1] == 1:
        curr_y_train = curr_y_train.ravel()
        curr_y_test = curr_y_test.ravel()

    return curr_X_train, curr_X_test, curr_y_train, curr_y_test

### scoring mapping

In [11]:
# Lookup table translating short metric aliases ("r2", "MAPE", "RMSE", "MAE")
# into scorer names as used by scikit-learn's `scoring` parameter. The full
# scorer names are accepted as well and simply map to themselves.
scoring_dict = {
    "r2": "r2",
    "MAPE": "neg_mean_absolute_percentage_error",
    "neg_mean_absolute_percentage_error": "neg_mean_absolute_percentage_error",
    "RMSE": "neg_root_mean_squared_error",
    "neg_root_mean_squared_error": "neg_root_mean_squared_error",
    "MAE": "neg_mean_absolute_error",
    "neg_mean_absolute_error": "neg_mean_absolute_error"
}

# The main routine

In [12]:
def _root_mean_squared_error(y_true, y_pred):
    """RMSE that does not rely on the deprecated ``squared=False`` keyword."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


def _evaluate_models(curr_data, curr_inputs, curr_outputs, pipe, pipe_params, scoring):
    """Split one data set, fit the linear baseline and grid-search the MLP pipeline."""
    X_train, X_test, y_train, y_test = data_prep_split(curr_data, curr_inputs, curr_outputs)

    # linear regression baseline and its test metrics
    lin_reg = LinearRegression().fit(X_train, y_train)
    y_test_pred_lin = lin_reg.predict(X_test)
    linear_scores = {
        "r2": r2_score(y_test, y_test_pred_lin),
        "RMSE": _root_mean_squared_error(y_test, y_test_pred_lin),
        "MAPE": mean_absolute_percentage_error(y_test, y_test_pred_lin),
        "rae": rae(y_test, y_test_pred_lin),
        "dcor": dcor.distance_correlation(y_test, y_test_pred_lin),
    }

    # MLP regression via grid search (GridSearchCV clones `pipe`, so the same
    # pipeline object can safely be reused for several searches)
    search = GridSearchCV(pipe,
                          param_grid=pipe_params,
                          cv=3,
                          scoring=scoring,
                          return_train_score=True,
                          verbose=1
                          )
    search.fit(X_train, y_train)
    y_test_pred = search.predict(X_test)

    return {"X_train": X_train, "X_test": X_test,
            "y_train": y_train, "y_test": y_test, "y_test_pred": y_test_pred,
            "linear": linear_scores,
            "mlp_r2": r2_score(y_test, y_test_pred),
            "search": search}


def _summarise_run(run):
    """Assemble the per-tuple metrics dict from the result of ``_evaluate_models``."""
    y_test, y_test_pred = run["y_test"], run["y_test_pred"]
    linear_scores = run["linear"]

    # standard deviation of the test targets, used to scale the RMSE
    y_test_std = np.std(y_test)
    mlp_rmse = _root_mean_squared_error(y_test, y_test_pred)

    return {"MLP r2": run["mlp_r2"], "linear r2": linear_scores["r2"],
            "MLP RMSE": mlp_rmse, "linear RMSE": linear_scores["RMSE"],
            "MLP RMSE/std": mlp_rmse/y_test_std, "linear RMSE/std": linear_scores["RMSE"]/y_test_std,
            "MLP MAPE": mean_absolute_percentage_error(y_test, y_test_pred), "linear MAPE": linear_scores["MAPE"],
            "MLP rae": rae(y_test, y_test_pred), "linear rae": linear_scores["rae"],
            "MLP dcor": dcor.distance_correlation(y_test, y_test_pred), "linear dcor": linear_scores["dcor"],
            }


def predictability(data, input_cols=1, output_cols=1, col_set=None, primkey_cols=[], method="MLP", scoring="r2"):
    """Evaluate how well target columns can be predicted from input columns.

    For every considered column tuple a linear regression and a grid-searched
    MLP regressor are fitted twice: once on all rows and once after removing
    the outliers found by ``extract_outliers``. The variant whose MLP reaches
    the higher r2 on its test split is kept.

    Parameters
    ----------
    data : pandas.DataFrame
        Input data.
    input_cols, output_cols : int, optional
        Number of input / target columns per tuple (both default to 1).
    col_set : iterable of str, optional
        Columns to consider. If given it replaces the automatically derived
        column list; duplicates are dropped and the given order is kept.
    primkey_cols : list of str, optional
        Primary-key columns that are excluded from the analysis.
    method : str, optional
        Currently unused.
    scoring : str, optional
        Key of the module-level ``scoring_dict``; selects the scorer used by
        the grid search. Unknown keys raise ``KeyError``.

    Returns
    -------
    metric_dict : dict
        Metrics per tuple.
    data_dict : dict
        Train/test data, predictions, outliers and grid-search results per tuple.
    plots_dict : dict
        Figure returned by ``plot_2d_result`` per tuple.
    """
    # TODO: map scoring to possible options
    scoring = scoring_dict[scoring]

    # if we want to measure the time
    start = time.time()

    # initialise the dictionary that is going to save the metrics per tuple
    metric_dict = {}

    # dict to save x-/y-train/-test and predicted values for subsequent plotting
    data_dict = {}

    # dict to save the plots
    plots_dict = {}

    # if primary keys are fed in, data columns should not contain these
    data_cols = [col for col in data.columns.to_list() if col not in primkey_cols]

    # if set of columns that should be considered is fed in, use this
    # (dict.fromkeys removes duplicates but, unlike set(), keeps a reproducible order)
    if col_set is not None:
        data_cols = list(dict.fromkeys(col_set))

    # get the list of tuples of input and output columns
    data_tuples = list(itertools.combinations(data_cols, input_cols+output_cols))

    # NOTE(review): testing subset only -- one hand-picked tuple plus the first
    # five combinations. Remove this section to go through all tuples.
    # The hand-picked tuple is only used if it fits the requested tuple size and
    # both columns exist, so other data sets no longer fail with a KeyError.
    test_tuple = ("Electric power consumption (kWh per capita)", "Life expectancy at birth, total (years)")
    data_tuples = data_tuples[:5]
    if len(test_tuple) == input_cols+output_cols and all(col in data.columns for col in test_tuple):
        data_tuples = [test_tuple]+data_tuples

    # list of hidden layer sizes for GridSearch
    hidden_layers = [(12,),
                     (50,),
                     (70,5,),
                     (40,18,3,)
                     ]
    # list of alpha values for GridSearch
    alphas = [0.001, 0.0001, 0.00001]

    # via pipeline (with and without scaler)
    pipe = Pipeline([
                    ('scaler', StandardScaler()),
                    ('mlp', MLPRegressor(max_iter=5000))
                    ])
    pipe_params = [{'scaler': ['passthrough'],
                    'mlp__hidden_layer_sizes': hidden_layers,
                    'mlp__alpha': alphas},
                   {'mlp__hidden_layer_sizes': hidden_layers,
                    'mlp__alpha': alphas}]

    # go through all tuples; the counter is only used for printing the progress
    for counter_tuples, curr_tuple in enumerate(data_tuples, start=1):

        print("Analysing "+str(curr_tuple)+" now.")

        # TODO: implement going through all permutations

        # get current inputs and outputs
        curr_inputs = list(curr_tuple[:input_cols])
        curr_outputs = list(curr_tuple[input_cols:])

        # reduce data to current columns and drop NAs
        curr_data = data[curr_inputs+curr_outputs].dropna()

        # fit and score on all rows ...
        run_with_outliers = _evaluate_models(curr_data, curr_inputs, curr_outputs,
                                             pipe, pipe_params, scoring)

        # ... and now do the same but with previous outlier extraction
        extr_curr_data, extr_curr_data_score, curr_outliers = extract_outliers(curr_data)
        run_without_outliers = _evaluate_models(extr_curr_data, curr_inputs, curr_outputs,
                                                pipe, pipe_params, scoring)

        print("r2 with outliers: "+str(run_with_outliers["mlp_r2"]))
        print("r2 without outliers: "+str(run_without_outliers["mlp_r2"]))

        #
        # TODO: adapt it to scoring-input, i.e. compare r2 only if scoring="r2" etc.
        #
        if run_with_outliers["mlp_r2"]>=run_without_outliers["mlp_r2"]:
            outlier_info = "included"
            chosen_run = run_with_outliers
        else:
            outlier_info = "excluded"
            chosen_run = run_without_outliers

        # save metrics into dict
        metric_dict[curr_tuple] = _summarise_run(chosen_run)

        # save values into dict
        data_dict[curr_tuple] = {"X_train": chosen_run["X_train"], "X_test": chosen_run["X_test"],
                                 "y_train": chosen_run["y_train"], "y_test": chosen_run["y_test"],
                                 "y_test_pred": chosen_run["y_test_pred"],
                                 "outliers": curr_outliers, "outlier_info": outlier_info,
                                 "GridSearchParams": chosen_run["search"].best_params_,
                                 "scores": chosen_run["search"].cv_results_
                                 }

        # save plot into dict
        plots_dict[curr_tuple] = plot_2d_result(curr_tuple, metric_dict, data_dict)

        # for printing the progress of the analysis
        print("-----"+str(counter_tuples)+"/"+str(len(data_tuples))+"-----")

    print("This took "+str(round(time.time()-start,2))+"s.")

    return metric_dict, data_dict, plots_dict

In [None]:
metrics, datas, plots = predictability(data=df2007,
                                primkey_cols = prim_keys,
                                scoring="r2"
                               )

In [15]:
metrics_df = pd.DataFrame.from_dict(metrics).transpose()
metrics_df

Unnamed: 0,Unnamed: 1,MLP r2,linear r2,MLP RMSE,linear RMSE,MLP RMSE/std,linear RMSE/std,MLP MAPE,linear MAPE,MLP rae,linear rae,MLP dcor,linear dcor
Electric power consumption (kWh per capita),"Life expectancy at birth, total (years)",0.584477,0.283351,5.359802,7.038897,0.644611,0.846552,0.062189,0.08853,0.583697,0.84346,0.799535,0.691547
"Agriculture, value added (% of GDP)",CO2 emissions (metric tons per capita),0.62026,0.522675,2.145106,2.404986,0.616231,0.690887,1.924978,2.783326,0.535534,0.646421,0.797036,0.76453
"Agriculture, value added (% of GDP)",Domestic credit provided by financial sector (% of GDP),0.376935,0.237257,52.245057,57.805316,0.789345,0.873352,1.591525,2.143466,0.674773,0.753938,0.649071,0.637954
"Agriculture, value added (% of GDP)",Electric power consumption (kWh per capita),0.233249,0.168928,5430.373169,5653.554914,0.875643,0.911631,0.826794,5.96387,0.615025,0.827924,0.684129,0.629361
"Agriculture, value added (% of GDP)",Energy use (kg of oil equivalent per capita),0.483398,0.31373,1105.570343,1274.252621,0.718751,0.828414,0.394668,0.68023,0.575195,0.729278,0.739615,0.670562
"Agriculture, value added (% of GDP)",Exports of goods and services (% of GDP),0.123053,0.168831,14.031018,13.659894,0.936454,0.911685,0.329588,0.321846,0.954032,0.923157,0.38558,0.42891


In [16]:
datas[('Electric power consumption (kWh per capita)', 'Life expectancy at birth, total (years)')]

{'X_train': array([[8.61449128e+03],
        [1.76584955e+03],
        [3.68525449e+04],
        [1.30797583e+02],
        [1.68605341e+04],
        [1.15998866e+03],
        [1.54810467e+03],
        [8.47437911e+03],
        [2.13776499e+03],
        [4.94956315e+02],
        [2.55669391e+03],
        [6.56464380e+03],
        [7.74033185e+01],
        [2.25987421e+03],
        [2.48550005e+04],
        [5.56022186e+01],
        [1.44883806e+02],
        [6.66839027e+03],
        [4.15913723e+03],
        [1.17543801e+04],
        [9.29474697e+03],
        [6.21876806e+03],
        [2.07132310e+02],
        [1.66402412e+03],
        [5.46335479e+02],
        [2.49133824e+02],
        [4.89056019e+03],
        [1.45225892e+02],
        [4.96530893e+02],
        [7.38471951e+02],
        [1.09728917e+04],
        [2.79677077e+03],
        [1.21312437e+03],
        [1.53305258e+04],
        [7.67955066e+02],
        [6.95007399e+02],
        [7.51906208e+03],
        [1.63523218e+04],
 

In [19]:
plots[list(plots.keys())[-1]]

In [18]:
# compare with a backup run, loaded at the end of the notebook
plots_backup[list(plots_backup.keys())[2]]

### Run the plotting routine alone

In [None]:
plot_2d_result(list(datas.keys())[0], metrics, datas, show=True)

### save dicts

In [None]:
# Persist the three result dicts to pickle files in the current working
# directory; existing files with these names are overwritten.
with open('metrics.pkl', 'wb') as f:
    pickle.dump(metrics, f)
with open('datas.pkl', 'wb') as f:
    pickle.dump(datas, f)
with open('plots.pkl', 'wb') as f:
    pickle.dump(plots, f)

### load dicts

In [13]:


# Restore the result dicts from the pickle files in the current working
# directory. This rebinds `metrics`, `datas` and `plots`.
# NOTE(review): pickle.load can execute arbitrary code -- only load files
# that were written by this notebook / come from a trusted source.
with open('metrics.pkl', 'rb') as f:
    metrics = pickle.load(f)
with open('datas.pkl', 'rb') as f:
    datas = pickle.load(f)
with open('plots.pkl', 'rb') as f:
    plots = pickle.load(f)


#### or backups

In [14]:


# Load the results of an earlier (backup) run into separate *_backup names.
# NOTE(review): `plots_backup` is used by a comparison cell that appears
# earlier in the notebook, so this cell has to be executed before that one.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('metrics_backup.pkl', 'rb') as f:
    metrics_backup = pickle.load(f)
with open('datas_backup.pkl', 'rb') as f:
    datas_backup = pickle.load(f)
with open('plots_backup.pkl', 'rb') as f:
    plots_backup = pickle.load(f)
