In [1]:
import datetime

import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score

from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor

import warnings
warnings.filterwarnings("ignore")

In [2]:
def split(df: pd.DataFrame, n: int = 2) -> list:
    """
    Cut ``df`` into ``n`` consecutive row chunks of (almost) equal size.

    Chunk sizes follow ``np.array_split``: the first ``len(df) % n`` chunks
    get one extra row, and chunks may be empty when ``n > len(df)``.
    The slicing is done with ``iloc`` instead of calling ``np.array_split`` on
    the frame, because that call is routed through ``DataFrame.swapaxes``,
    which pandas has deprecated.

    :param df: frame to split; row order and index labels are preserved
    :param n: number of chunks, must be a positive integer
    :return: list of ``n`` independent DataFrames (copies of the slices)
    :raises ValueError: if ``n`` is not positive
    """
    n = int(n)
    if n <= 0:
        raise ValueError('number sections must be larger than 0.')

    base_size, extra = divmod(len(df), n)
    chunks = []
    start = 0
    for k in range(n):
        # the first `extra` chunks absorb the remainder, one row each
        stop = start + base_size + (1 if k < extra else 0)
        chunks.append(df.iloc[start:stop].copy())
        start = stop
    return chunks


def blend_and_split(df: pd.DataFrame, n: int = 2, frac: float = 0.5, seed: int = 42) -> list:
    """
    Draw ``n`` bootstrap samples (sampling rows with replacement) from ``df``.

    Each sample gets its own random state (``seed``, ``seed + 1``, ...).
    Reusing the same seed for every draw would make all ``n`` samples
    identical, so the first sample is still the one produced by ``seed``
    itself while the following ones differ but stay reproducible.

    :param df: frame to sample rows from
    :param n: number of samples to draw
    :param frac: fraction of ``len(df)`` rows in every sample
    :param seed: base random seed; ``None`` leaves every draw unseeded
    :return: list of ``n`` sampled DataFrames
    """
    return [
        df.sample(frac=frac,
                  replace=True,
                  random_state=None if seed is None else seed + i)
        for i in range(n)
    ]


def init_svm():
    """Build an unfitted support-vector regressor configured with the RBF kernel."""
    regressor = SVR(kernel="rbf")
    return regressor


def init_sgd():
    """
    Build an unfitted SGD regressor with squared loss and ``alpha=0.0001``.

    The loss is deliberately left at the estimator's default instead of being
    spelled out: the squared loss is the default, but its name changed from
    ``'squared_loss'`` to ``'squared_error'`` between scikit-learn releases,
    so passing either literal breaks on some versions.

    :return: unfitted ``SGDRegressor``
    """
    return SGDRegressor(alpha=0.0001)


def init_dnn():
    """
    Placeholder for the neural-network model factory.

    :raises NotImplementedError: always, until the model is implemented
    """
    # TODO: implement
    raise NotImplementedError('Not implemented')


def init_model(model_type):
    """
    Create an unfitted model of the requested type.

    :param model_type: 'svm', 'sgd' or 'dnn' ('dnn' is dispatched to
        init_dnn, which is still a TODO stub)
    :return: model instance
    :raises ValueError: if model_type is not one of the names above
    """
    # select the model type and initialise the model
    if model_type == 'svm':
        return init_svm()
    if model_type == 'sgd':
        return init_sgd()
    if model_type == 'dnn':
        return init_dnn()

    raise ValueError(f'Unknown model type: {model_type}')


def init_metrics():
    """
    Create the empty metrics container used by the experiments.

    :return: dict with the keys 'train' and 'test'; each maps every metric
        name to its own empty list, to be appended to once per fitted model
    """
    metric_names = (
        'explained_variance_score',
        'max_error',
        'mean_absolute_error',
        'mean_squared_error',
        'root_mean_squared_error',
        'mean_absolute_percentage_error',
        'median_absolute_error',
        'r2_score',
    )
    # a fresh list per (phase, metric) pair so that no two entries share state
    return {
        phase: {metric: [] for metric in metric_names}
        for phase in ('train', 'test')
    }

def _append_regression_metrics(bucket, y_true, y_pred):
    """
    Append one value per regression metric to the lists stored in ``bucket``.

    :param bucket: dict mapping metric name -> list (one half of the
        structure returned by init_metrics)
    :param y_true: ground-truth target values
    :param y_pred: predicted target values
    """
    mse = mean_squared_error(y_true, y_pred)

    bucket['explained_variance_score'].append(explained_variance_score(y_true, y_pred))
    bucket['max_error'].append(max_error(y_true, y_pred))
    bucket['mean_absolute_error'].append(mean_absolute_error(y_true, y_pred))
    bucket['mean_squared_error'].append(mse)
    # RMSE is taken as sqrt(MSE) rather than via the `squared=False` keyword,
    # which newer scikit-learn releases no longer accept.
    # NOTE(review): equal to the keyword form for a single target column,
    # which is what the visible callers pass (`iloc[:, -1]`).
    bucket['root_mean_squared_error'].append(np.sqrt(mse))
    # stored as a percentage, not a fraction
    bucket['mean_absolute_percentage_error'].append(mean_absolute_percentage_error(y_true, y_pred) * 100)
    bucket['median_absolute_error'].append(median_absolute_error(y_true, y_pred))
    bucket['r2_score'].append(r2_score(y_true, y_pred))


def calc_train_metrics(metrics, train_y, train_pred):
    """
    Evaluate predictions on the training data and append the results to
    ``metrics['train']``.

    :param metrics: container created by init_metrics (modified in place)
    :param train_y: true training targets
    :param train_pred: model predictions for the training samples
    """
    _append_regression_metrics(metrics['train'], train_y, train_pred)


def calc_test_metrics(metrics, test_y, test_pred):
    """
    Evaluate predictions on the test data and append the results to
    ``metrics['test']``.

    :param metrics: container created by init_metrics (modified in place)
    :param test_y: true test targets
    :param test_pred: model predictions for the test samples
    """
    _append_regression_metrics(metrics['test'], test_y, test_pred)
    
def scale(scaler, train_X, test_X):
    """
    Fit ``scaler`` on the training features and apply it to both feature sets.

    :param scaler: object exposing ``fit`` and ``transform``
    :param train_X: training features, used for fitting
    :param test_X: test features, only transformed
    :return: (train, test) as new DataFrames built from the transformed
        values; the original index and column labels are not carried over
    """
    scaler.fit(train_X)

    scaled_train, scaled_test = (
        pd.DataFrame(scaler.transform(features))
        for features in (train_X, test_X)
    )
    return scaled_train, scaled_test

def add_poly_features(poly, train_X, test_X):
    """
    Expand both feature sets with the transformer ``poly``.

    The transformer is fitted on the training features only; the test
    features are merely transformed, so nothing is ever learned from the
    test set.

    :param poly: object exposing ``fit_transform`` and ``transform``
        (the notebook passes a PolynomialFeatures instance)
    :param train_X: training features, used for fitting
    :param test_X: test features, only transformed
    :return: (train, test) as new DataFrames built from the transformed
        values; the original index and column labels are not carried over
    """
    train_X = pd.DataFrame(poly.fit_transform(train_X))
    test_X = pd.DataFrame(poly.transform(test_X))

    return train_X, test_X

def print_pretty(d, indent=0):
    """
    Print a (possibly nested) dict as a tab-indented outline.

    Every key is printed on its own line at the current depth. A dict value
    is printed recursively one level deeper; any other value is printed on
    the following line, two tabs deeper than its key.

    :param d: dict to print
    :param indent: number of leading tabs for the keys of ``d``
    """
    key_prefix = '\t' * indent
    leaf_prefix = '\t' * (indent + 2)

    for name, payload in d.items():
        print(key_prefix + str(name))
        if not isinstance(payload, dict):
            print(leaf_prefix + str(payload))
            continue
        print_pretty(payload, indent + 1)

In [3]:
def _export_experiment_to_excel(metrics, test_preds):
    """Write per-cascade metrics and test predictions to a timestamped Excel workbook."""
    # colon-free timestamp so the file name is valid on every OS
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

    metrics_train_df = pd.DataFrame(data=metrics['train'])
    metrics_train_df.index.name = 'cascade'
    metrics_test_df = pd.DataFrame(data=metrics['test'])
    metrics_test_df.index.name = 'cascade'
    test_preds_df = pd.DataFrame(data=test_preds)

    with pd.ExcelWriter(f'experiment-run-{timestamp}.xlsx') as writer:
        metrics_train_df.to_excel(writer, sheet_name='train')
        metrics_test_df.to_excel(writer, sheet_name='test')
        # transposed: one column per cascade, one row per test sample
        test_preds_df.T.to_excel(writer, sheet_name='test-preds')


def run_experiment(model_type:      str,  # 'svm', 'sgd' or 'dnn'
                   train_datasets:  list,
                   test_dataset:    pd.DataFrame,
                   n_pfeatures:     int,  # 0 -> do not generate polynomial features
                   export_to_excel: bool=True,
                   keep_last:       bool=True):

    """
    Train a cascade of models, one per training data set, and collect metrics.

    Cascade ``k`` is trained on ``train_datasets[k]`` whose features are
    extended with predictions of the previously trained models, replayed in
    order on that data set. In every data set the last column is the target
    and all other columns are features.

    Note: number of cascades is set implicitly by the number of train data sets

    :param model_type: model name understood by init_model
    :param train_datasets: list of DataFrames, one per cascade
    :param test_dataset: DataFrame every cascade is evaluated on
    :param n_pfeatures: argument passed to PolynomialFeatures; 0 disables
        polynomial feature generation
    :param export_to_excel: if True, write the metrics and the test
        predictions to 'experiment-run-<timestamp>.xlsx' in the working directory
    :param keep_last: if True, only the most recent previous prediction is
        kept as a single 'y_pred' feature (overwritten on every replay step);
        otherwise every previous prediction gets its own 'y_pred_<i>' column
    :return: metrics dict as created by init_metrics, with one entry per cascade
    """
    poly = PolynomialFeatures(n_pfeatures) if n_pfeatures > 0 else None
    test_preds = []
    metrics = init_metrics()
    models = []

    for cascade, train_dataset in enumerate(train_datasets):
        print(f'cascade-{cascade}')
        # explicit copies: prediction columns are added below and must not
        # touch (or warn about) the caller's frames
        train_X = train_dataset.iloc[:, :-1].copy()
        train_y = train_dataset.iloc[:, -1]
        test_X = test_dataset.iloc[:, :-1].copy()
        test_y = test_dataset.iloc[:, -1]
        # The prediction columns added below have string names; make every
        # label a string so the frames never carry mixed-type column names,
        # which recent scikit-learn versions reject.
        train_X.columns = train_X.columns.astype(str)
        test_X.columns = test_X.columns.astype(str)
        model = init_model(model_type)

        # generate y_pred_i with the earlier models and add it to the features
        # NOTE(review): every replay step fits a fresh MinMaxScaler on the
        # current data set, not the scaler model i was trained with - confirm
        # that this is intended.
        for i in range(cascade):
            # add_poly_features / scale return new frames, so train_X and
            # test_X can be passed directly
            train_X_alt, test_X_alt = train_X, test_X

            if n_pfeatures > 0:
                train_X_alt, test_X_alt = add_poly_features(poly, train_X_alt, test_X_alt)
            train_X_alt, test_X_alt = scale(MinMaxScaler(), train_X_alt, test_X_alt)

            pred_column = 'y_pred' if keep_last else f'y_pred_{i+1}'
            train_X[pred_column] = models[i].predict(train_X_alt)
            test_X[pred_column] = models[i].predict(test_X_alt)

        if n_pfeatures > 0:
            train_X, test_X = add_poly_features(poly, train_X, test_X)
        train_X, test_X = scale(MinMaxScaler(), train_X, test_X)

        model.fit(train_X, train_y)
        models.append(model)

        # compute and store the metrics
        test_pred = model.predict(test_X)
        calc_train_metrics(metrics, train_y, model.predict(train_X))
        calc_test_metrics(metrics, test_y, test_pred)
        test_preds.append(test_pred)

    if export_to_excel:
        _export_experiment_to_excel(metrics, test_preds)

    return metrics

# Experiment 1

In [4]:
# Directory that holds the dataset files; change this one constant to point the
# notebook at a different copy of the data.
DATA_DIR = 'c:/Users/ivani/Documents/sgtm/Datasets'

# Alternative dataset pair, read from the working directory:
# df_train = pd.read_csv('trainCO.txt', header=None)
# df_test = pd.read_csv('testCO.txt', header=None)

# Both files are read without a header row (header=None), so columns get
# integer labels.
df_train = pd.read_csv(f'{DATA_DIR}/heart_train.txt', header=None)
df_test = pd.read_csv(f'{DATA_DIR}/heart_test.txt', header=None)

In [5]:
# Baseline: a single SGD model trained on the whole training frame, with
# degree-2 polynomial features followed by min-max scaling.
metrics = init_metrics()

# last column is the target, everything before it is a feature
train_X, train_y = df_train.iloc[:, :-1], df_train.iloc[:, -1]
test_X, test_y = df_test.iloc[:, :-1], df_test.iloc[:, -1]

poly = PolynomialFeatures(2)
train_X, test_X = add_poly_features(poly, train_X, test_X)
train_X, test_X = scale(MinMaxScaler(), train_X, test_X)

model = init_model('sgd')
model.fit(train_X, train_y)

for record_metrics, features, target in (
    (calc_train_metrics, train_X, train_y),
    (calc_test_metrics, test_X, test_y),
):
    record_metrics(metrics, target, model.predict(features))

print_pretty(metrics)

train
	explained_variance_score
			[0.9931599660082169]
	max_error
			[6.521206145553137]
	mean_absolute_error
			[0.5671231378695447]
	mean_squared_error
			[0.7382032007190803]
	root_mean_squared_error
			[0.8591875236053421]
	mean_absolute_percentage_error
			[0.7734857774470688]
	median_absolute_error
			[0.3855299530129237]
	r2_score
			[0.9930667444613798]
test
	explained_variance_score
			[0.9931894413043391]
	max_error
			[6.495123015513073]
	mean_absolute_error
			[0.5680881770731935]
	mean_squared_error
			[0.7435188245203438]
	root_mean_squared_error
			[0.862275376269289]
	mean_absolute_percentage_error
			[0.7723408811818248]
	median_absolute_error
			[0.3852667542737862]
	r2_score
			[0.9931005912498252]


## Plain Split

In [6]:
%%time

# Plain split: cut the training frame into 5 chunks with split();
# run_experiment trains one cascade per chunk.
train_split_plain = split(df_train, n=5)

print('len train_split_plain:', len(train_split_plain))

# n_pfeatures=2 is passed to PolynomialFeatures inside run_experiment.
# export_to_excel is left at its default (True), so this run also writes an
# Excel workbook to the working directory.
metrics = run_experiment(model_type = 'sgd', # TODO: 'dnn'
                         train_datasets = train_split_plain,
                         test_dataset = df_test,
                         n_pfeatures = 2)
print_pretty(metrics)

len train_split_plain: 5
cascade-0
cascade-1
cascade-2
cascade-3
cascade-4
train
	explained_variance_score
			[0.9899877027859189, 0.997485900993333, 0.9977109710641106, 0.9976870245047024, 0.9976625684748894]
	max_error
			[7.68661200273813, 8.14573647422418, 6.494146300992114, 5.920829597661566, 6.138576572441124]
	mean_absolute_error
			[0.6788192825829829, 0.3436198443255379, 0.32731204087839294, 0.33383917587611195, 0.3353535282405324]
	mean_squared_error
			[1.0555388758557678, 0.2685679667771005, 0.24484040681309396, 0.24749095371526764, 0.24917448319644647]
	root_mean_squared_error
			[1.0273942163822842, 0.5182354356632712, 0.4948135071045393, 0.4974846266119865, 0.499173800590983]
	mean_absolute_percentage_error
			[0.925742797646738, 0.45965831361157605, 0.4374548228094446, 0.4480740140737604, 0.449326533272543]
	median_absolute_error
			[0.46464022035887353, 0.2414481765620664, 0.22580480553735427, 0.22902049307607797, 0.23074238510374556]
	r2_score
			[0.989987669435303, 0

## Blended Split

# Experiment 2

## Baseline

## Plain Split