## <a id = '0'> Índice </a>

* [**Entorno**](#1)  
   * [Librerías](#1d1)  
   * [Funciones](#1d2)  
   * [Constantes](#1d3)

* [**Lectura de datos**](#2)


## <a id = '1'> Entorno </a>
[índice](#0)

### <a id = '1d1'> Librerías </a>

In [1]:
import os
import pandas as pd
import numpy as np

from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf


from statsmodels.tsa.stattools import adfuller
from statsmodels.stats.diagnostic import acorr_ljungbox

from statsmodels.tsa.arima.model import ARIMA

from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.statespace.sarimax import SARIMAX

import matplotlib.pyplot as plt

import math
import itertools
# from config import data_folder

from scipy.stats import kurtosis, trim_mean


In [2]:
# Move the working directory one level up; the "output/..." paths used by the
# cells below are relative to it.
# NOTE(review): presumably the notebook lives in a sub-folder of the project
# root — confirm. Because the path is relative, re-running this cell moves
# up one more level each time.
os.chdir("../")

### <a id = '1d2'> Funciones </a>

In [3]:
from src.utils import (lectura_carpetas_dict
                       , split_dataframe_into_chunks
                       , genera_df_acf_pacf_chunk
                       , get_dict_serie_summary_chunk
                       , patients_dict_ccf_chunk
                       , get_ccf_summary_chunk
                       )

### <a id = '1d3'> Constantes </a>

In [4]:
from config.cons import data_folder, list_signals

## <a id = '2'> Lectura de datos </a>
[índice](#0)

In [5]:
# Current working directory, used as the base of the data path.
proyect_path = os.getcwd()
# Plain string concatenation: no path separator is inserted here.
# NOTE(review): presumably `data_folder` already starts with a separator —
# confirm against config.cons.
data_path = proyect_path + data_folder

: 

In [7]:
# Every unordered pair of signals, self-pairs (a, a) included. Plain
# `itertools.combinations` would leave the self-pairs out.
all_combinaciones = [*itertools.combinations_with_replacement(list_signals, 2)]


In [7]:
# Split the signal pairs into self-pairs (stored as the bare signal name) and
# cross-pairs (stored as "<first>_<second>"), then index both lists by kind.
uni_combinacion = [
    first for first, second in all_combinaciones if first == second
]
bi_combinacion = [
    first + "_" + second
    for first, second in all_combinaciones
    if first != second
]
dict_combinaciones = dict(
    uni_combinacion=uni_combinacion,
    bi_combinacion=bi_combinacion,
)

In [8]:
# Load the data found under `data_path`. The result is indexed in the next
# cell by the class names "mi", "sttc_mi", "sttc" and "other".
df_all = lectura_carpetas_dict(data_path)

In [9]:
# One entry per class; each one is later iterated per patient.
df_mi, df_sttc_mi, df_sttc, df_other = (
    df_all[class_name] for class_name in ("mi", "sttc_mi", "sttc", "other")
)

In [10]:
# Chunk-level feature extraction for the "mi" class.
# Each patient's data is first cut with split_dataframe_into_chunks
# (second argument: 100).
dict_mi = {
    patient: split_dataframe_into_chunks(patient_data, 100)
    for patient, patient_data in df_mi.items()
}
# ACF/PACF table per signal (apply_diff=True).
df_mi_acf_pacf = genera_df_acf_pacf_chunk(
    dict_mi,
    list_signals,
    apply_diff=True,
)
# Series summary per signal.
df_mi_peak = get_dict_serie_summary_chunk(
    dict_mi,
    list_signals,
    period=10,
    n_std=1,
)
# Cross-correlations for every signal pair (max_lag=25), then their summary.
df_mi_ccf = patients_dict_ccf_chunk(dict_mi, all_combinaciones, max_lag=25)
mi_cff_features = get_ccf_summary_chunk(
    df_mi_ccf,
    dict_combinaciones,
    proportion_to_cut=0.05,
)

In [11]:
# Join the three feature tables of the "mi" class into a single frame.
#
# The tables are joined on every identifier column both sides share. Joining
# on ("patient_id", "label") alone while both sides also carry "chunk" is a
# many-to-many join: every chunk of a patient is paired with every other
# chunk, rows are multiplied and the extra "chunk" columns come back suffixed
# as "chunk_x"/"chunk_y". When a table has no "chunk" column the join falls
# back to the remaining identifiers, i.e. the previous behaviour.
id_cols = ["patient_id", "label", "chunk"]

df_mi_features = df_mi_acf_pacf
for df_part in (df_mi_peak, mi_cff_features):
    merge_keys = [
        col for col in id_cols
        if col in df_mi_features.columns and col in df_part.columns
    ]
    df_mi_features = df_mi_features.merge(df_part, on=merge_keys, how="inner")

# Identifier columns first, feature columns after them in their existing order.
cols = id_cols + [col for col in df_mi_features.columns if col not in id_cols]

df_mi_features = df_mi_features[cols]

In [26]:
# Persist the "mi" feature table; the row index is not written.
# The target directory must already exist: to_csv does not create it.
df_mi_features.to_csv("output/chunk_data/chunk_100/features/mi_features.csv", index=False)

In [13]:
# Drop the "mi" intermediates before the next class is processed.
# `df_mi_ccf` is released as well: it is only the input of
# get_ccf_summary_chunk and is not referenced again in the cells below.
del dict_mi, df_mi_acf_pacf, df_mi_peak, df_mi_ccf, mi_cff_features

In [14]:
# Chunk-level feature extraction for the "sttc_mi" class.
# Each patient's data is first cut with split_dataframe_into_chunks
# (second argument: 100).
dict_sttc_mi = {
    patient: split_dataframe_into_chunks(patient_data, 100)
    for patient, patient_data in df_sttc_mi.items()
}
# ACF/PACF table per signal (apply_diff=True).
df_sttc_mi_acf_pacf = genera_df_acf_pacf_chunk(
    dict_sttc_mi,
    list_signals,
    apply_diff=True,
)
# Series summary per signal.
df_sttc_mi_peak = get_dict_serie_summary_chunk(
    dict_sttc_mi,
    list_signals,
    period=10,
    n_std=1,
)
# Cross-correlations for every signal pair (max_lag=25), then their summary.
df_sttc_mi_ccf = patients_dict_ccf_chunk(
    dict_sttc_mi,
    all_combinaciones,
    max_lag=25,
)
sttc_mi_cff_features = get_ccf_summary_chunk(
    df_sttc_mi_ccf,
    dict_combinaciones,
    proportion_to_cut=0.05,
)

In [15]:
# Join the three feature tables of the "sttc_mi" class into a single frame.
#
# The tables are joined on every identifier column both sides share. Joining
# on ("patient_id", "label") alone while both sides also carry "chunk" is a
# many-to-many join: every chunk of a patient is paired with every other
# chunk, rows are multiplied and the extra "chunk" columns come back suffixed
# as "chunk_x"/"chunk_y". When a table has no "chunk" column the join falls
# back to the remaining identifiers, i.e. the previous behaviour.
id_cols = ["patient_id", "label", "chunk"]

df_sttc_mi_features = df_sttc_mi_acf_pacf
for df_part in (df_sttc_mi_peak, sttc_mi_cff_features):
    merge_keys = [
        col for col in id_cols
        if col in df_sttc_mi_features.columns and col in df_part.columns
    ]
    df_sttc_mi_features = df_sttc_mi_features.merge(
        df_part, on=merge_keys, how="inner"
    )

# Identifier columns first, feature columns after them in their existing order.
cols = id_cols + [
    col for col in df_sttc_mi_features.columns if col not in id_cols
]

df_sttc_mi_features = df_sttc_mi_features[cols]

In [16]:
# Drop the "sttc_mi" intermediates before the next class is processed.
# `sttc_mi_cff_features` is released as well: it has already been merged into
# `df_sttc_mi_features` and is not referenced again in the cells below.
del (
    dict_sttc_mi,
    df_sttc_mi_acf_pacf,
    df_sttc_mi_peak,
    df_sttc_mi_ccf,
    sttc_mi_cff_features,
)

In [27]:
# Persist the "sttc_mi" feature table; the row index is not written.
# The target directory must already exist: to_csv does not create it.
df_sttc_mi_features.to_csv("output/chunk_data/chunk_100/features/sttc_mi_features.csv", index=False)


In [18]:
# Chunk-level feature extraction for the "sttc" class.
# Each patient's data is first cut with split_dataframe_into_chunks
# (second argument: 100).
dict_sttc = {
    patient: split_dataframe_into_chunks(patient_data, 100)
    for patient, patient_data in df_sttc.items()
}
# ACF/PACF table per signal (apply_diff=True).
df_sttc_acf_pacf = genera_df_acf_pacf_chunk(
    dict_sttc,
    list_signals,
    apply_diff=True,
)
# Series summary per signal.
df_sttc_peak = get_dict_serie_summary_chunk(
    dict_sttc,
    list_signals,
    period=10,
    n_std=1,
)
# Cross-correlations for every signal pair (max_lag=25), then their summary.
df_sttc_ccf = patients_dict_ccf_chunk(dict_sttc, all_combinaciones, max_lag=25)
sttc_cff_features = get_ccf_summary_chunk(
    df_sttc_ccf,
    dict_combinaciones,
    proportion_to_cut=0.05,
)

In [19]:
# Join the three feature tables of the "sttc" class into a single frame.
#
# The tables are joined on every identifier column both sides share. Joining
# on ("patient_id", "label") alone while both sides also carry "chunk" is a
# many-to-many join: every chunk of a patient is paired with every other
# chunk, rows are multiplied and the extra "chunk" columns come back suffixed
# as "chunk_x"/"chunk_y". When a table has no "chunk" column the join falls
# back to the remaining identifiers, i.e. the previous behaviour.
id_cols = ["patient_id", "label", "chunk"]

df_sttc_features = df_sttc_acf_pacf
for df_part in (df_sttc_peak, sttc_cff_features):
    merge_keys = [
        col for col in id_cols
        if col in df_sttc_features.columns and col in df_part.columns
    ]
    df_sttc_features = df_sttc_features.merge(
        df_part, on=merge_keys, how="inner"
    )

# Identifier columns first, feature columns after them in their existing order.
cols = id_cols + [col for col in df_sttc_features.columns if col not in id_cols]

df_sttc_features = df_sttc_features[cols]

In [20]:
# Persist the "sttc" feature table; the row index is not written.
# The target directory must already exist: to_csv does not create it.
df_sttc_features.to_csv("output/chunk_data/chunk_100/features/sttc_features.csv", index=False)

In [21]:
# Drop the "sttc" intermediates before the next class is processed.
# `df_sttc_ccf` is released as well: it is only the input of
# get_ccf_summary_chunk and is not referenced again in the cells below.
del dict_sttc, df_sttc_acf_pacf, df_sttc_peak, df_sttc_ccf, sttc_cff_features

In [22]:
# Chunk-level feature extraction for the "other" class.
# Each patient's data is first cut with split_dataframe_into_chunks
# (second argument: 100).
dict_other = {
    patient: split_dataframe_into_chunks(patient_data, 100)
    for patient, patient_data in df_other.items()
}
# ACF/PACF table per signal (apply_diff=True).
df_other_acf_pacf = genera_df_acf_pacf_chunk(
    dict_other,
    list_signals,
    apply_diff=True,
)
# Series summary per signal.
df_other_peak = get_dict_serie_summary_chunk(
    dict_other,
    list_signals,
    period=10,
    n_std=1,
)
# Cross-correlations for every signal pair (max_lag=25), then their summary.
df_other_ccf = patients_dict_ccf_chunk(
    dict_other,
    all_combinaciones,
    max_lag=25,
)
other_cff_features = get_ccf_summary_chunk(
    df_other_ccf,
    dict_combinaciones,
    proportion_to_cut=0.05,
)

In [23]:
# Join the three feature tables of the "other" class into a single frame.
#
# The tables are joined on every identifier column both sides share. Joining
# on ("patient_id", "label") alone while both sides also carry "chunk" is a
# many-to-many join: every chunk of a patient is paired with every other
# chunk, rows are multiplied and the extra "chunk" columns come back suffixed
# as "chunk_x"/"chunk_y". When a table has no "chunk" column the join falls
# back to the remaining identifiers, i.e. the previous behaviour.
id_cols = ["patient_id", "label", "chunk"]

df_other_features = df_other_acf_pacf
for df_part in (df_other_peak, other_cff_features):
    merge_keys = [
        col for col in id_cols
        if col in df_other_features.columns and col in df_part.columns
    ]
    df_other_features = df_other_features.merge(
        df_part, on=merge_keys, how="inner"
    )

# Identifier columns first, feature columns after them in their existing order.
cols = id_cols + [
    col for col in df_other_features.columns if col not in id_cols
]

df_other_features = df_other_features[cols]

In [24]:
# Persist the "other" feature table; the row index is not written.
# The target directory must already exist: to_csv does not create it.
df_other_features.to_csv("output/chunk_data/chunk_100/features/other_features.csv", index=False)

In [25]:
# Drop the "other" intermediates once its feature table has been saved.
# `df_other_ccf` is released as well: it is only the input of
# get_ccf_summary_chunk and is not referenced again in the cells below.
del dict_other, df_other_acf_pacf, df_other_peak, df_other_ccf, other_cff_features

In [8]:
# Reload the per-class feature tables written by the cells above, so the
# feature extraction does not have to be re-run.
df_mi_features = pd.read_csv("output/chunk_data/chunk_100/features/mi_features.csv")
df_sttc_mi_features = pd.read_csv("output/chunk_data/chunk_100/features/sttc_mi_features.csv")
df_sttc_features = pd.read_csv("output/chunk_data/chunk_100/features/sttc_features.csv")
df_other_features = pd.read_csv("output/chunk_data/chunk_100/features/other_features.csv")

In [9]:
# Patient table with the split assignment. Its "patient" and "class" columns
# are renamed to the identifier names used by the feature tables.
patients = pd.read_csv("output/split_train_test/patients_90_05_05.csv")
patients = patients.rename(columns={"patient": "patient_id", "class": "label"})

In [10]:
# Split the patient table by the value of its "sample" column, keeping only
# the two identifier columns.
split_cols = ["patient_id", "label"]
patients_train = patients.loc[patients["sample"] == "train", split_cols]
patients_test = patients.loc[patients["sample"] == "test", split_cols]
patients_val = patients.loc[patients["sample"] == "validation", split_cols]

In [11]:
# Stack the four per-class feature tables row-wise. Row labels are kept as
# they are (no ignore_index), so index values can repeat across classes.
class_feature_frames = [
    df_mi_features,
    df_sttc_mi_features,
    df_sttc_features,
    df_other_features,
]
df_features = pd.concat(class_feature_frames, axis=0)

In [12]:
# Persist the combined feature table of all classes; the row index is not
# written.
df_features.to_csv("output/chunk_data/chunk_100/features/all_features.csv", index=False)

In [None]:
# Reload the combined feature table from disk.
df_features = pd.read_csv("output/chunk_data/chunk_100/features/all_features.csv")

In [13]:
# Keep only the feature rows whose (patient_id, label) pair belongs to the
# training split; the inner join drops every other row.
features_train = df_features.merge(patients_train, on=["patient_id", "label"], how="inner")

: 

In [None]:
# Persist the training subset under its own file name. Writing it to
# "all_features.csv" would overwrite the combined table of every split with
# training rows only.
features_train.to_csv("output/chunk_data/chunk_100/features/train_features.csv", index=False)

In [30]:
# Patient ids for every (split, class) pair.
# `patients_train`, `patients_test` and `patients_val` only hold the columns
# "patient_id" and "label" (the source columns "patient" and "class" were
# renamed when the patient table was loaded), so those are the names used
# here; selecting "class"/"patient" raises KeyError.
patients_train_mi = patients_train.loc[patients_train["label"] == "mi", "patient_id"]
patients_train_sttc_mi = patients_train.loc[patients_train["label"] == "sttc_mi", "patient_id"]
patients_train_sttc = patients_train.loc[patients_train["label"] == "sttc", "patient_id"]
patients_train_other = patients_train.loc[patients_train["label"] == "other", "patient_id"]

patients_test_mi = patients_test.loc[patients_test["label"] == "mi", "patient_id"]
patients_test_sttc_mi = patients_test.loc[patients_test["label"] == "sttc_mi", "patient_id"]
patients_test_sttc = patients_test.loc[patients_test["label"] == "sttc", "patient_id"]
patients_test_other = patients_test.loc[patients_test["label"] == "other", "patient_id"]

patients_val_mi = patients_val.loc[patients_val["label"] == "mi", "patient_id"]
patients_val_sttc_mi = patients_val.loc[patients_val["label"] == "sttc_mi", "patient_id"]
patients_val_sttc = patients_val.loc[patients_val["label"] == "sttc", "patient_id"]
patients_val_other = patients_val.loc[patients_val["label"] == "other", "patient_id"]



In [31]:
def _rows_for_patients(features, patient_ids):
    """Return the rows of *features* whose patient_id is in *patient_ids*."""
    keep = features["patient_id"].isin(patient_ids)
    return features[keep]


# Training rows per class.
df_mi_train = _rows_for_patients(df_mi_features, patients_train_mi)
df_sttc_mi_train = _rows_for_patients(df_sttc_mi_features, patients_train_sttc_mi)
df_sttc_train = _rows_for_patients(df_sttc_features, patients_train_sttc)
df_other_train = _rows_for_patients(df_other_features, patients_train_other)

# Test rows per class.
df_mi_test = _rows_for_patients(df_mi_features, patients_test_mi)
df_sttc_mi_test = _rows_for_patients(df_sttc_mi_features, patients_test_sttc_mi)
df_sttc_test = _rows_for_patients(df_sttc_features, patients_test_sttc)
df_other_test = _rows_for_patients(df_other_features, patients_test_other)

# Validation rows per class.
df_mi_val = _rows_for_patients(df_mi_features, patients_val_mi)
df_sttc_mi_val = _rows_for_patients(df_sttc_mi_features, patients_val_sttc_mi)
df_sttc_val = _rows_for_patients(df_sttc_features, patients_val_sttc)
df_other_val = _rows_for_patients(df_other_features, patients_val_other)

In [32]:
# Display the training rows of the "mi" class.
df_mi_train

Unnamed: 0,patient_id,label,chunk,acf_I_lag_1,acf_I_lag_2,acf_I_lag_3,acf_I_lag_4,acf_I_lag_5,pacf_I_lag_1,pacf_I_lag_2,...,n_cruces_cero_V5_V6,promedio_V5_V6,std_V5_V6,max_V5_V6,min_V5_V6,maxlag_V5_V6,minlag_V5_V6,kurtosis_V5_V6,trim_mean_V5_V6,norm_ccf
0,patient_2415,mi,chunk_1,0.331863,-0.415509,-0.444558,-0.117246,0.195589,0.335249,-0.604397,...,2,-0.023925,0.230732,0.963066,-0.184184,0,15,8.965145,-0.056371,0.585239
1,patient_2415,mi,chunk_2,0.331863,-0.415509,-0.444558,-0.117246,0.195589,0.335249,-0.604397,...,5,-0.056055,0.232134,0.955720,-0.233830,0,19,8.501799,-0.087853,0.756792
2,patient_2415,mi,chunk_3,0.331863,-0.415509,-0.444558,-0.117246,0.195589,0.335249,-0.604397,...,2,-0.075253,0.233178,0.871191,-0.257949,0,-13,7.329390,-0.105983,0.655799
3,patient_2415,mi,chunk_4,0.331863,-0.415509,-0.444558,-0.117246,0.195589,0.335249,-0.604397,...,4,-0.124307,0.145294,0.400326,-0.255581,0,-10,3.055412,-0.139548,1.034301
4,patient_2415,mi,chunk_5,0.331863,-0.415509,-0.444558,-0.117246,0.195589,0.335249,-0.604397,...,3,-0.042716,0.187405,0.712868,-0.227489,0,14,6.076936,-0.064704,0.708712
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599995,patient_62,mi,chunk_6,0.444429,-0.331044,-0.481277,-0.218304,0.030312,0.448964,-0.675623,...,4,0.023826,0.216208,0.971467,-0.125345,0,25,9.713665,-0.007560,1.039202
599996,patient_62,mi,chunk_7,0.444429,-0.331044,-0.481277,-0.218304,0.030312,0.448964,-0.675623,...,4,0.026536,0.217220,0.961697,-0.126706,0,25,9.393774,-0.004753,0.960889
599997,patient_62,mi,chunk_8,0.444429,-0.331044,-0.481277,-0.218304,0.030312,0.448964,-0.675623,...,4,0.027458,0.215832,0.954627,-0.114620,0,25,9.423916,-0.004101,0.999539
599998,patient_62,mi,chunk_9,0.444429,-0.331044,-0.481277,-0.218304,0.030312,0.448964,-0.675623,...,4,0.029441,0.215048,0.953271,-0.114024,0,21,9.433273,-0.001966,0.880451


In [14]:
# One table per class with a row per patient, the class name and an integer
# class id (0 = mi, 1 = sttc_mi, 2 = sttc, 3 = other).
class_sources = (
    ("mi", df_mi),
    ("sttc_mi", df_sttc_mi),
    ("sttc", df_sttc),
    ("other", df_other),
)
class_tables = {
    class_name: pd.DataFrame(
        {
            "patient_id": members.keys(),
            "class": class_name,
            "class_id": class_id,
        }
    )
    for class_id, (class_name, members) in enumerate(class_sources)
}
patients_mi = class_tables["mi"]
patients_sttc_mi = class_tables["sttc_mi"]
patients_sttc = class_tables["sttc"]
patients_other = class_tables["other"]

# All classes stacked row-wise, in the same order as the class ids.
patients_classes = pd.concat(list(class_tables.values()), axis=0)

## Split Train Test Val

In [45]:
# Write the validation-split peak tables, one CSV per class, without the row
# index.
# NOTE(review): the `df_*_peak_val` frames are not created anywhere in the
# visible part of this notebook — confirm they exist before running this cell.
df_mi_peak_val.to_csv("output/features/val/mi_peak.csv", index = False)
df_sttc_mi_peak_val.to_csv("output/features/val/sttc_mi_peak.csv", index = False)
df_sttc_peak_val.to_csv("output/features/val/sttc_peak.csv", index = False)
df_other_peak_val.to_csv("output/features/val/other_peak.csv", index = False)

#### Cross Correlation features

Correlaciones cruzadas
* Promedio de correlaciones por combinación.
* Desviación estándar de las correlaciones por combinación.
* Máxima y mínima correlación por combinación.
* Lag de la máxima correlación por combinación.
* Kurtosis.
* Promedio de todas las autocorrelaciones de la matriz.
* Norma de la matriz.
* Número de cruces por cero.

In [46]:
# Cross-correlations for each class of the validation split, over every
# signal pair in `all_combinaciones`.
# NOTE(review): `patients_dict_ccf` is not among the names imported from
# src.utils in this notebook (only `patients_dict_ccf_chunk` is) — confirm
# where it is expected to come from before running this cell.
df_mi_ccf_val = patients_dict_ccf(df_mi_val, all_combinaciones)
df_sttc_mi_ccf_val = patients_dict_ccf(df_sttc_mi_val, all_combinaciones)
df_sttc_ccf_val = patients_dict_ccf(df_sttc_val, all_combinaciones)
df_other_ccf_val = patients_dict_ccf(df_other_val, all_combinaciones)

In [None]:
# Summary statistics of the validation-split cross-correlations, per class.
# NOTE(review): `get_dict_ccf_summary` is not among the names imported from
# src.utils in this notebook (only `get_ccf_summary_chunk` is) — confirm
# where it is expected to come from before running this cell.
df_mi_ccf_stats_val = get_dict_ccf_summary(df_mi_ccf_val, dict_combinaciones,)
df_sttc_mi_ccf_stats_val = get_dict_ccf_summary(df_sttc_mi_ccf_val, dict_combinaciones,)
df_sttc_ccf_stats_val = get_dict_ccf_summary(df_sttc_ccf_val, dict_combinaciones,)
df_other_ccf_stats_val = get_dict_ccf_summary(df_other_ccf_val, dict_combinaciones,)

In [None]:
# Write the validation-split cross-correlation summaries, one CSV per class,
# without the row index.
df_mi_ccf_stats_val.to_csv("output/features/val/mi_ccf_stats.csv", index = False)
df_sttc_mi_ccf_stats_val.to_csv("output/features/val/sttc_mi_ccf_stats.csv", index = False)
df_sttc_ccf_stats_val.to_csv("output/features/val/sttc_ccf_stats.csv", index = False)
df_other_ccf_stats_val.to_csv("output/features/val/other_ccf_stats.csv", index = False)