# Prerrequisitos

In [1]:
from joblib import Parallel, delayed
from sktime.datasets import load_from_tsfile_to_dataframe
from sktime.datatypes._panel._convert import from_nested_to_long, from_nested_to_multi_index
from collections import defaultdict
import numpy as np
import pandas as pd
import pickle

# NOTE(review): presumably a random seed for reproducibility, but nothing in
# the visible cells reads it — confirm it is actually used.
SEED = 1
# Values of the SMTS ``j_ins`` parameter tried by every grid search below.
J_INS_GRID = [20, 40, 60, 80, 100]
# Values of the SMTS ``n_symbols`` parameter tried by every grid search below.
N_SYMBOLS_GRID = [20, 40, 60, 80, 100]
# Directory prefix under which each dataset's grid-search summary is pickled.
PKL_DIR = 'pkl/baydogan_examples/'

def validate(
        clf,
        X_train,
        X_test,
        y_train,
        y_test,
        n_times=10,
        id_col_name='id',
        n_jobs=-2):
    """Fit and score ``n_times`` independent clones of ``clf`` through joblib.

    Each run receives its own ``clf.clone()`` together with the same
    train/test data and is executed by ``__parallel_validate``.

    Args:
        clf: Classifier exposing ``clone()``; its clones must support the
            ``fit``/``score``/``oob_score_`` usage of ``__parallel_validate``.
        X_train, X_test: Feature frames forwarded unchanged to every run.
        y_train, y_test: Labels forwarded unchanged to every run.
        n_times: Number of repetitions.
        id_col_name: Name of the series-id column, forwarded to every run.
        n_jobs: Forwarded to ``joblib.Parallel``.

    Returns:
        Whatever ``joblib.Parallel`` collects: one ``__parallel_validate``
        result per repetition.
    """
    run_once = delayed(__parallel_validate)
    # Kept as a generator so each clone is only created when joblib asks
    # for the corresponding task.
    pending_runs = (
        run_once(
            clf.clone(),
            X_train,
            X_test,
            y_train,
            y_test,
            id_col_name=id_col_name,
        )
        for _ in range(n_times)
    )
    executor = Parallel(n_jobs=n_jobs)
    return executor(pending_runs)

def __parallel_validate(
        clf,
        X_train,
        X_test,
        y_train,
        y_test,
        id_col_name='id'):
    """Fit ``clf`` once and report its test score and ``oob_score_``.

    Args:
        clf: Classifier whose ``fit`` and ``score`` accept an
            ``id_col_name`` keyword and which exposes ``oob_score_``.
        X_train, y_train: Data passed to ``clf.fit``.
        X_test, y_test: Data passed to ``clf.score``.
        id_col_name: Name of the series-id column, forwarded to both calls.

    Returns:
        dict with keys ``'score'`` (result of ``clf.score``) and
        ``'oob_score_'`` (the attribute read after scoring).
    """
    clf.fit(X_train, y_train, id_col_name=id_col_name)
    test_score = clf.score(X_test, y_test, id_col_name=id_col_name)

    return {'score': test_score, 'oob_score_': clf.oob_score_}

def get_best_score(on_dicted_results):
    """Return the highest ``'score_mean'`` and the key that achieved it.

    Args:
        on_dicted_results: Mapping from a model key to a dict holding at
            least ``'score_mean'``.

    Returns:
        ``(best_score_mean, best_model)``. Ties keep the first key seen;
        an empty mapping yields ``(-inf, None)``.
    """
    winner = None
    top_mean = -np.inf

    for candidate, stats in on_dicted_results.items():
        candidate_mean = stats['score_mean']
        # Strict comparison: an equal (or NaN) mean never replaces the
        # current winner.
        if not candidate_mean > top_mean:
            continue
        top_mean, winner = candidate_mean, candidate

    return top_mean, winner


def trend_fn(
        df,
        id_col_name='id',
        time_col_name='time'):
    """Add a first-difference column for every feature of one series.

    Every row of ``df`` except the first produces one output row with the
    row's id and time values and, for each remaining column, the value
    itself plus its difference from the previous row (stored under
    ``<column>diff``).

    Args:
        df: Rows of a single series, in the order the differences should
            be taken.
        id_col_name: Name of the id column in ``df``.
        time_col_name: Name of the time column in ``df``.

    Returns:
        A new DataFrame with one row fewer than ``df``; it has no columns
        at all when ``df`` has fewer than two rows.
    """
    out_cols = __trend_setup_cols(
        df,
        id_col_name=id_col_name,
        time_col_name=time_col_name
    )
    id_out, time_out = out_cols[0], out_cols[1]
    # After the id/time pair the names alternate: feature, feature + 'diff'.
    feature_pairs = list(zip(out_cols[2::2], out_cols[3::2]))

    collected = defaultdict(list)
    earlier = None
    for position in range(len(df)):
        current = df.iloc[position]

        # The first row only serves as the baseline for the second one.
        if earlier is not None:
            collected[id_out].append(current[id_col_name])
            collected[time_out].append(current[time_col_name])

            for value_col, diff_col in feature_pairs:
                collected[value_col].append(current[value_col])
                collected[diff_col].append(
                    current[value_col] - earlier[value_col])

        earlier = current

    return pd.DataFrame(collected)

def __trend_setup_cols(
        df,
        id_col_name='id',
        time_col_name='time'):
    """Build the output column order used by ``trend_fn``.

    Args:
        df: Frame whose columns include ``id_col_name`` and
            ``time_col_name``.
        id_col_name: Name of the id column.
        time_col_name: Name of the time column.

    Returns:
        list: ``[id, time, c1, c1 + 'diff', c2, c2 + 'diff', ...]`` where
        ``c1, c2, ...`` are the remaining columns in their original order.

    Raises:
        ValueError: If either named column is missing from ``df``.
    """
    feature_names = list(df.columns)
    feature_names.remove(id_col_name)
    feature_names.remove(time_col_name)

    ordered = [id_col_name, time_col_name]
    for name in feature_names:
        # Each feature is immediately followed by its difference column.
        ordered.extend((name, name + 'diff'))

    return ordered

def parse_into_train_test(
        train_path,
        test_path,
        id_col_name='index',
        time_col_name='time_index',
        multivariate=False):
    """Load a train/test pair of ``.ts`` files as flat frames with trend features.

    Both files are read with sktime, flattened to one row per
    (series, time stamp), and passed through ``apply_trend_Fn``.

    Args:
        train_path: Path of the training ``.ts`` file.
        test_path: Path of the test ``.ts`` file.
        id_col_name: Name given to the series-id column of the result.
        time_col_name: Name given to the time column of the result.
        multivariate: When true the multi-index conversion is used (one
            column per dimension); otherwise the long-format conversion
            with its ``'column'`` column dropped.

    Returns:
        ``(X_train, X_test, y_train, y_test)``. The label arrays hold one
        entry per ROW of the corresponding frame (the label of the series
        the row belongs to), not one per series.
    """
    X_train, tmp_y_train =\
        load_from_tsfile_to_dataframe(train_path)
    X_test, tmp_y_test =\
        load_from_tsfile_to_dataframe(test_path)

    X_train = _nested_to_flat(
        X_train, id_col_name, time_col_name, multivariate)
    X_test = _nested_to_flat(
        X_test, id_col_name, time_col_name, multivariate)

    # Forward the column names: relying on apply_trend_Fn's defaults broke
    # every call that used a non-default id/time column name.
    X_train, X_test = apply_trend_Fn(
        X_train,
        X_test,
        id_col_name=id_col_name,
        time_col_name=time_col_name
    )

    # Ids are cast with int() before being used as positions into the
    # per-series label array.
    y_train = np.asarray([tmp_y_train[int(index)]
                          for index in X_train[id_col_name]])
    y_test = np.asarray([tmp_y_test[int(index)]
                         for index in X_test[id_col_name]])

    return X_train, X_test, y_train, y_test


def _nested_to_flat(X_nested, id_col_name, time_col_name, multivariate):
    """Flatten one sktime nested frame into one row per (series, time stamp)."""
    if multivariate:
        # The index levels are named by the caller and then turned into
        # ordinary columns.
        return from_nested_to_multi_index(
            X_nested,
            instance_index=id_col_name,
            time_index=time_col_name
        ).reset_index()

    flat = from_nested_to_long(X_nested).drop(['column'], axis=1)
    # The long format is consumed under its default 'index'/'time_index'
    # names; rename so the caller's names are honoured (a no-op for the
    # default arguments).
    return flat.rename(
        columns={'index': id_col_name, 'time_index': time_col_name})

def apply_trend_Fn(
        X_train,
        X_test,
        id_col_name='index',
        time_col_name='time_index'):
    """Apply ``trend_fn`` to every series of the train and test frames.

    Args:
        X_train: Flat training frame containing ``id_col_name`` and
            ``time_col_name``.
        X_test: Flat test frame with the same layout.
        id_col_name: Column identifying the series each row belongs to.
        time_col_name: Column holding the time stamp of each row.

    Returns:
        ``(new_X_train, new_X_test)``: the per-series ``trend_fn`` outputs
        stacked in order of first appearance of each id. Each piece keeps
        its own row index, so index values repeat across series. A frame
        without any series yields an empty DataFrame.
    """
    new_X_train = _trend_per_series(X_train, id_col_name, time_col_name)
    new_X_test = _trend_per_series(X_test, id_col_name, time_col_name)

    return new_X_train, new_X_test


def _trend_per_series(X, id_col_name, time_col_name):
    """Run ``trend_fn`` on each series of ``X`` and stack the results."""
    pieces = [
        trend_fn(
            X[X[id_col_name] == serie_id],
            id_col_name=id_col_name,
            time_col_name=time_col_name)
        for serie_id in pd.unique(X[id_col_name])
    ]

    # pd.concat refuses an empty list; keep the empty-frame result instead.
    if not pieces:
        return pd.DataFrame()

    # One concat instead of repeated DataFrame.append: append was removed
    # in pandas 2.0 and re-copied the accumulated frame on every call.
    return pd.concat(pieces)


# Objetivos
- Este cuaderno intenta reproducir los resultados obtenidos con el algoritmo SMTS en los distintos datasets en los que se evaluó (mencionados en las páginas 12-18 del artículo de Mustafa Baydogan).

## Adiac Dataset
- Error de Baydogan: 0.248

In [6]:
# Load the Adiac train/test splits as flat frames with the trend features.
X_train, X_test, y_train, y_test = parse_into_train_test(
    'data/Adiac/Adiac_TRAIN.ts', 'data/Adiac/Adiac_TEST.ts')

# Remove the time stamp column before the frames reach the classifier.
X_train = X_train.drop(columns='time_index')
X_test = X_test.drop(columns='time_index')

In [7]:
import os

from smts import SMTS

# Create the results directory up front: if it were missing, the pickle dump
# below would fail only after the whole grid search had already run.
os.makedirs(PKL_DIR, exist_ok=True)

# Grid search over (j_ins, n_symbols): every combination is fitted and scored
# 10 times and summarised by the mean/std of its scores.
all_clf_used = {}
for j_ins in J_INS_GRID:
    for n_symbols in N_SYMBOLS_GRID:
        clf_results = {}
        clf = SMTS(
            j_ins=j_ins,
            n_symbols=n_symbols
        )

        results = validate(
            clf,
            X_train,
            X_test,
            y_train,
            y_test,
            n_times=10,
            id_col_name='index',
            n_jobs=-2
        )

        # Test-set score of each repetition.
        clf_scores = [result['score'] for result in results]
        clf_results['score_mean'] = np.mean(clf_scores)
        clf_results['score_std'] = np.std(clf_scores)

        # ``oob_score_`` attribute of each fitted classifier.
        clf_oob_scores = [result['oob_score_'] for result in results]
        clf_results['oob_score_mean'] = np.mean(clf_oob_scores)
        clf_results['oob_score_std'] = np.std(clf_oob_scores)

        all_clf_used[(j_ins, n_symbols)] = clf_results
        # Progress marker: one line per finished combination.
        print((j_ins, n_symbols))

# Persist the summaries, keyed by (j_ins, n_symbols).
with open(PKL_DIR + 'adiac', 'wb') as file:
    pickle.dump(all_clf_used, file)


In [8]:
# Reload the Adiac grid-search summaries from disk.
# NOTE: unpickling can execute arbitrary code; only load files produced by
# this notebook.
with open(PKL_DIR + 'adiac', 'rb') as file:
    adiac_clfs = pickle.load(file)

In [9]:
# Pick the (j_ins, n_symbols) pair with the highest mean test score.
best_score_model = get_best_score(adiac_clfs)
# Displayed as (error = 1 - mean score, best parameters).
1-best_score_model[0], best_score_model[1]

(0.2053708439897698, (60, 80))

## Beef
- Error de Baydogan: 0.26
- La dimensión temporal aquí importa (TODO: completar la explicación; la frase original quedó sin terminar).

In [46]:
# Load the Beef train/test splits as flat frames with the trend features.
X_train, X_test, y_train, y_test = parse_into_train_test(
    'data/Beef/Beef_TRAIN.ts', 'data/Beef/Beef_TEST.ts')

# Remove the time stamp column before the frames reach the classifier.
X_train = X_train.drop(columns='time_index')
X_test = X_test.drop(columns='time_index')


In [41]:
# Sanity check: print a marker for every series whose time stamps differ from
# those of series 0.
if 'time_index' in X_train.columns:
    # Series 0 is the reference; build its time stamps once, not per series.
    reference_times = list(X_train[X_train['index'] == 0]['time_index'])
    for x in pd.unique(X_train['index']):
        if reference_times != list(X_train[X_train['index'] == x]['time_index']):
            print("LMOA")
else:
    # The loading cell drops 'time_index'; running this check afterwards used
    # to fail with a KeyError.
    print("'time_index' column not present; time-index check skipped")


In [47]:
import os

from smts import SMTS

# Create the results directory up front: if it were missing, the pickle dump
# below would fail only after the whole grid search had already run.
os.makedirs(PKL_DIR, exist_ok=True)

# Grid search over (j_ins, n_symbols): every combination is fitted and scored
# 10 times and summarised by the mean/std of its scores.
all_clf_used = {}
for j_ins in J_INS_GRID:
    for n_symbols in N_SYMBOLS_GRID:
        clf_results = {}
        clf = SMTS(
            j_ins=j_ins,
            n_symbols=n_symbols
        )

        results = validate(
            clf,
            X_train,
            X_test,
            y_train,
            y_test,
            n_times=10,
            id_col_name='index',
            n_jobs=-2
        )

        # Test-set score of each repetition.
        clf_scores = [result['score'] for result in results]
        clf_results['score_mean'] = np.mean(clf_scores)
        clf_results['score_std'] = np.std(clf_scores)

        # ``oob_score_`` attribute of each fitted classifier.
        clf_oob_scores = [result['oob_score_'] for result in results]
        clf_results['oob_score_mean'] = np.mean(clf_oob_scores)
        clf_results['oob_score_std'] = np.std(clf_oob_scores)

        all_clf_used[(j_ins, n_symbols)] = clf_results
        # Progress marker: one line per finished combination.
        print((j_ins, n_symbols))

# Persist the summaries, keyed by (j_ins, n_symbols).
with open(PKL_DIR + 'beef', 'wb') as file:
    pickle.dump(all_clf_used, file)


(20, 20)
(20, 40)
(20, 60)
(20, 80)
(20, 100)
(40, 20)
(40, 40)
(40, 60)
(40, 80)
(40, 100)
(60, 20)
(60, 40)
(60, 60)
(60, 80)
(60, 100)
(80, 20)
(80, 40)
(80, 60)
(80, 80)
(80, 100)
(100, 20)
(100, 40)
(100, 60)
(100, 80)
(100, 100)


In [48]:
# Reload the Beef grid-search summaries from disk.
# NOTE: unpickling can execute arbitrary code; only load files produced by
# this notebook.
with open(PKL_DIR + 'beef', 'rb') as file:
    beef_clfs = pickle.load(file)

In [None]:
# Pick the (j_ins, n_symbols) pair with the highest mean test score.
best_score_model = get_best_score(beef_clfs)
# Displayed as (error = 1 - mean score, best parameters).
1-best_score_model[0], best_score_model[1]

## OSU Leaf
- Error de Baydogan: 0.377

In [6]:
# Load the OSULeaf train/test splits as flat frames with the trend features.
X_train, X_test, y_train, y_test = parse_into_train_test(
    'data/OSULeaf/OSULeaf_TRAIN.ts', 'data/OSULeaf/OSULeaf_TEST.ts')

# Remove the time stamp column before the frames reach the classifier.
X_train = X_train.drop(columns='time_index')
X_test = X_test.drop(columns='time_index')


In [7]:
import os

from smts import SMTS

# Create the results directory up front: if it were missing, the pickle dump
# below would fail only after the whole grid search had already run.
os.makedirs(PKL_DIR, exist_ok=True)

# Grid search over (j_ins, n_symbols): every combination is fitted and scored
# 10 times and summarised by the mean/std of its scores.
all_clf_used = {}
for j_ins in J_INS_GRID:
    for n_symbols in N_SYMBOLS_GRID:
        clf_results = {}
        clf = SMTS(
            j_ins=j_ins,
            n_symbols=n_symbols
        )

        results = validate(
            clf,
            X_train,
            X_test,
            y_train,
            y_test,
            n_times=10,
            id_col_name='index',
            n_jobs=-2
        )

        # Test-set score of each repetition.
        clf_scores = [result['score'] for result in results]
        clf_results['score_mean'] = np.mean(clf_scores)
        clf_results['score_std'] = np.std(clf_scores)

        # ``oob_score_`` attribute of each fitted classifier.
        clf_oob_scores = [result['oob_score_'] for result in results]
        clf_results['oob_score_mean'] = np.mean(clf_oob_scores)
        clf_results['oob_score_std'] = np.std(clf_oob_scores)

        all_clf_used[(j_ins, n_symbols)] = clf_results
        # Progress marker: one line per finished combination.
        print((j_ins, n_symbols))

# Persist the summaries, keyed by (j_ins, n_symbols).
with open(PKL_DIR + 'osuleaf', 'wb') as file:
    pickle.dump(all_clf_used, file)


(20, 20)
(20, 40)
(20, 60)
(20, 80)
(20, 100)
(40, 20)
(40, 40)
(40, 60)
(40, 80)
(40, 100)
(60, 20)
(60, 40)
(60, 60)
(60, 80)
(60, 100)
(80, 20)
(80, 40)
(80, 60)
(80, 80)
(80, 100)
(100, 20)
(100, 40)
(100, 60)
(100, 80)
(100, 100)


In [8]:
# Reload the OSULeaf grid-search summaries from disk.
# NOTE: unpickling can execute arbitrary code; only load files produced by
# this notebook.
with open(PKL_DIR + 'osuleaf', 'rb') as file:
    osuleaf_clfs = pickle.load(file)


In [9]:
# Pick the (j_ins, n_symbols) pair with the highest mean test score.
best_score_model = get_best_score(osuleaf_clfs)
# Displayed as (error = 1 - mean score, best parameters).
1-best_score_model[0], best_score_model[1]


(0.32272727272727264, (80, 40))

## CBF
- Error de Baydogan: 0.020
- De nuevo el error ha aumentado mucho: aquí el índice temporal importa porque añade un desplazamiento (offset) a las series, por lo que puede influir el momento en que ocurre cada evento.

In [55]:
# Load the CBF train/test splits as flat frames with the trend features.
X_train, X_test, y_train, y_test = parse_into_train_test(
    'data/CBF/CBF_TRAIN.ts',
    'data/CBF/CBF_TEST.ts')

# Unlike the other datasets, the 'time_index' column is kept here: the drop
# is left disabled on purpose (see the CBF note about the time index).
# X_train = X_train.drop('time_index', axis=1)
# X_test = X_test.drop('time_index', axis=1)


In [56]:
# Sanity check: print a marker for every series whose time stamps differ from
# those of series 0.
if 'time_index' in X_train.columns:
    # Series 0 is the reference; build its time stamps once, not per series.
    reference_times = list(X_train[X_train['index'] == 0]['time_index'])
    for x in pd.unique(X_train['index']):
        if reference_times != list(X_train[X_train['index'] == x]['time_index']):
            print("LMOA")
else:
    # Without the column the comparison would fail with a KeyError.
    print("'time_index' column not present; time-index check skipped")


In [None]:
import os

from smts import SMTS

# Create the results directory up front: if it were missing, the pickle dump
# below would fail only after the whole grid search had already run.
os.makedirs(PKL_DIR, exist_ok=True)

# Grid search over (j_ins, n_symbols): every combination is fitted and scored
# 10 times and summarised by the mean/std of its scores.
all_clf_used = {}
for j_ins in J_INS_GRID:
    for n_symbols in N_SYMBOLS_GRID:
        clf_results = {}
        clf = SMTS(
            j_ins=j_ins,
            n_symbols=n_symbols
        )

        results = validate(
            clf,
            X_train,
            X_test,
            y_train,
            y_test,
            n_times=10,
            id_col_name='index',
            n_jobs=-2
        )

        # Test-set score of each repetition.
        clf_scores = [result['score'] for result in results]
        clf_results['score_mean'] = np.mean(clf_scores)
        clf_results['score_std'] = np.std(clf_scores)

        # ``oob_score_`` attribute of each fitted classifier.
        clf_oob_scores = [result['oob_score_'] for result in results]
        clf_results['oob_score_mean'] = np.mean(clf_oob_scores)
        clf_results['oob_score_std'] = np.std(clf_oob_scores)

        all_clf_used[(j_ins, n_symbols)] = clf_results
        # Progress marker: one line per finished combination.
        print((j_ins, n_symbols))

# Persist the summaries, keyed by (j_ins, n_symbols).
with open(PKL_DIR + 'cbf', 'wb') as file:
    pickle.dump(all_clf_used, file)


In [None]:
# Reload the CBF grid-search summaries from disk.
# NOTE: unpickling can execute arbitrary code; only load files produced by
# this notebook.
with open(PKL_DIR + 'cbf', 'rb') as file:
    cbf_clfs = pickle.load(file)


In [None]:
# Pick the (j_ins, n_symbols) pair with the highest mean test score.
best_score_model = get_best_score(cbf_clfs)
# Displayed as (error = 1 - mean score, best parameters).
1-best_score_model[0], best_score_model[1]


## Japanese Vowels
- Error de Baydogan: 0.031

In [14]:
# Load the multivariate JapaneseVowels splits as flat frames with the trend
# features.
X_train, X_test, y_train, y_test = parse_into_train_test(
    'data/JapaneseVowels/JapaneseVowels_TRAIN.ts',
    'data/JapaneseVowels/JapaneseVowels_TEST.ts',
    multivariate=True,
)

# Remove the time stamp column before the frames reach the classifier.
X_train = X_train.drop(columns='time_index')
X_test = X_test.drop(columns='time_index')


In [15]:
import os

from smts import SMTS

# Create the results directory up front: if it were missing, the pickle dump
# below would fail only after the whole grid search had already run.
os.makedirs(PKL_DIR, exist_ok=True)

# Grid search over (j_ins, n_symbols): every combination is fitted and scored
# 10 times and summarised by the mean/std of its scores.
all_clf_used = {}
for j_ins in J_INS_GRID:
    for n_symbols in N_SYMBOLS_GRID:
        clf_results = {}
        clf = SMTS(
            j_ins=j_ins,
            n_symbols=n_symbols
        )

        results = validate(
            clf,
            X_train,
            X_test,
            y_train,
            y_test,
            n_times=10,
            id_col_name='index',
            n_jobs=-2
        )

        # Test-set score of each repetition.
        clf_scores = [result['score'] for result in results]
        clf_results['score_mean'] = np.mean(clf_scores)
        clf_results['score_std'] = np.std(clf_scores)

        # ``oob_score_`` attribute of each fitted classifier.
        clf_oob_scores = [result['oob_score_'] for result in results]
        clf_results['oob_score_mean'] = np.mean(clf_oob_scores)
        clf_results['oob_score_std'] = np.std(clf_oob_scores)

        all_clf_used[(j_ins, n_symbols)] = clf_results
        # Progress marker: one line per finished combination.
        print((j_ins, n_symbols))

# Persist the summaries, keyed by (j_ins, n_symbols).
with open(PKL_DIR + 'japanese_vowels', 'wb') as file:
    pickle.dump(all_clf_used, file)


(20, 20)
(20, 40)
(20, 60)
(20, 80)
(20, 100)
(40, 20)
(40, 40)
(40, 60)
(40, 80)
(40, 100)
(60, 20)
(60, 40)
(60, 60)
(60, 80)
(60, 100)
(80, 20)
(80, 40)
(80, 60)
(80, 80)
(80, 100)
(100, 20)
(100, 40)
(100, 60)
(100, 80)
(100, 100)


In [16]:
# Reload the JapaneseVowels grid-search summaries from disk.
# NOTE: unpickling can execute arbitrary code; only load files produced by
# this notebook.
with open(PKL_DIR + 'japanese_vowels', 'rb') as file:
    japanese_vowels_clfs = pickle.load(file)


In [17]:
# Pick the (j_ins, n_symbols) pair with the highest mean test score.
best_score_model = get_best_score(japanese_vowels_clfs)
# Displayed as (error = 1 - mean score, best parameters).
1-best_score_model[0], best_score_model[1]


(0.029189189189189113, (20, 80))

## Libras
- Error de Baydogan: 0.091

In [18]:
# Load the multivariate Libras splits as flat frames with the trend features.
X_train, X_test, y_train, y_test = parse_into_train_test(
    'data/Libras/Libras_TRAIN.ts',
    'data/Libras/Libras_TEST.ts',
    multivariate=True,
)

# Remove the time stamp column before the frames reach the classifier.
X_train = X_train.drop(columns='time_index')
X_test = X_test.drop(columns='time_index')


In [19]:
import os

from smts import SMTS

# Create the results directory up front: if it were missing, the pickle dump
# below would fail only after the whole grid search had already run.
os.makedirs(PKL_DIR, exist_ok=True)

# Grid search over (j_ins, n_symbols): every combination is fitted and scored
# 10 times and summarised by the mean/std of its scores.
all_clf_used = {}
for j_ins in J_INS_GRID:
    for n_symbols in N_SYMBOLS_GRID:
        clf_results = {}
        clf = SMTS(
            j_ins=j_ins,
            n_symbols=n_symbols
        )

        results = validate(
            clf,
            X_train,
            X_test,
            y_train,
            y_test,
            n_times=10,
            id_col_name='index',
            n_jobs=-2
        )

        # Test-set score of each repetition.
        clf_scores = [result['score'] for result in results]
        clf_results['score_mean'] = np.mean(clf_scores)
        clf_results['score_std'] = np.std(clf_scores)

        # ``oob_score_`` attribute of each fitted classifier.
        clf_oob_scores = [result['oob_score_'] for result in results]
        clf_results['oob_score_mean'] = np.mean(clf_oob_scores)
        clf_results['oob_score_std'] = np.std(clf_oob_scores)

        all_clf_used[(j_ins, n_symbols)] = clf_results
        # Progress marker: one line per finished combination.
        print((j_ins, n_symbols))

# Persist the summaries, keyed by (j_ins, n_symbols).
with open(PKL_DIR + 'libras', 'wb') as file:
    pickle.dump(all_clf_used, file)


(20, 20)
(20, 40)
(20, 60)
(20, 80)
(20, 100)
(40, 20)
(40, 40)
(40, 60)
(40, 80)
(40, 100)
(60, 20)
(60, 40)
(60, 60)
(60, 80)
(60, 100)
(80, 20)
(80, 40)
(80, 60)
(80, 80)
(80, 100)
(100, 20)
(100, 40)
(100, 60)
(100, 80)
(100, 100)


In [20]:
# Reload the Libras grid-search summaries from disk.
# NOTE: unpickling can execute arbitrary code; only load files produced by
# this notebook.
with open(PKL_DIR + 'libras', 'rb') as file:
    libras_clfs = pickle.load(file)


In [21]:
# Pick the (j_ins, n_symbols) pair with the highest mean test score.
best_score_model = get_best_score(libras_clfs)
# Displayed as (error = 1 - mean score, best parameters).
1-best_score_model[0], best_score_model[1]


(0.10055555555555562, (100, 40))