## <a id = '0'> Índice </a>

* [**Entorno**](#1)  
   * [Librerías](#1d1)  
   * [Funciones](#1d2)  
   * [Constantes](#1d3)

* [**Lectura de datos**](#2)


## <a id = '1'> Entorno </a>
[índice](#0)

### <a id = '1d1'> Librerías </a>

In [1]:
import os
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

import math
import itertools
# from config import data_folder

In [2]:
# Move one level up from the notebook's start folder so that the relative paths
# ("output/...") and the project imports (src, config) used below resolve.
# The sentinel makes the cell idempotent: without it, re-running the cell in the
# same kernel would climb one more directory level every time.
if "_MOVED_TO_PARENT_DIR" not in globals():
    os.chdir("../")
    _MOVED_TO_PARENT_DIR = True

### <a id = '1d2'> Funciones </a>

In [3]:
from src.utils import *

### <a id = '1d3'> Constantes </a>

In [4]:
from config.cons import data_folder, window_jason, cara_lateral, cara_anterior, cara_interior, cara_sepal, list_signals

## <a id = '2'> Lectura de datos </a>
[índice](#0)

In [5]:
# Absolute path of the current working directory (the parent of the notebook's
# start folder once the os.chdir("../") cell above has run).
# NOTE(review): not referenced again in the visible part of the notebook.
proyect_path = os.getcwd()

In [6]:
# Load the train / test / validation feature tables, in that order.
df_train, df_test, df_val = (
    pd.read_csv(csv_path)
    for csv_path in (
        "output/features/pre_model/train_features.csv",
        "output/features/pre_model/test_features.csv",
        "output/features/pre_model/val_features.csv",
    )
)

In [7]:
# Identifier columns (patient, label) of every split ...
train_keys = df_train.loc[:, ["patient", "label"]]
test_keys = df_test.loc[:, ["patient", "label"]]
val_keys = df_val.loc[:, ["patient", "label"]]

# ... and everything else, which is what the PCA pipeline receives.
train_featues = df_train.drop(["patient", "label"], axis="columns")
test_featues = df_test.drop(["patient", "label"], axis="columns")
val_featues = df_val.drop(["patient", "label"], axis="columns")

In [8]:
# Missing values per column of the training table; display only the columns
# that actually contain some.
nulos_col = df_train.isna().sum()
nulos_col.loc[nulos_col.gt(0)]

mean_peaks_AVL    4
std_peaks_AVL     4
mean_peaks_V3     3
std_peaks_V3      3
mean_peaks_V1     7
std_peaks_V1      7
mean_peaks_V2     3
std_peaks_V2      3
mean_peaks_II     3
std_peaks_II      3
mean_peaks_V4     1
std_peaks_V4      1
mean_peaks_V5     4
std_peaks_V5      4
mean_peaks_V6     5
std_peaks_V6      5
mean_peaks_III    3
std_peaks_III     3
mean_peaks_AVR    2
std_peaks_AVR     2
mean_peaks_AVF    4
std_peaks_AVF     4
mean_peaks_I      1
std_peaks_I       1
dtype: int64

### PCA

In [9]:
# Three chained steps: mean imputation -> standardisation -> PCA.
# A fractional n_components asks PCA to keep as many components as are needed
# to reach that share (95 %) of explained variance.
pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
        ("pca", PCA(n_components=0.95)),
    ]
)

In [10]:
# Fit every pipeline step on the training features only; the test and
# validation splits are only passed through transform() further down.
pipeline.fit(train_featues)

In [11]:
# Grab the fitted PCA step by its pipeline name to inspect its attributes.
pca = pipeline.named_steps["pca"]

In [12]:
# Running total of the explained-variance ratio, component by component.
np.cumsum(pca.explained_variance_ratio_)

array([0.10598515, 0.16773018, 0.21069558, 0.2431722 , 0.27271318,
       0.29966333, 0.32149023, 0.34251467, 0.36217935, 0.38035898,
       0.39681505, 0.41169285, 0.42537798, 0.43863083, 0.45164088,
       0.46345842, 0.47450196, 0.48534606, 0.49574138, 0.50606249,
       0.51512489, 0.52356867, 0.53193557, 0.53939106, 0.54664301,
       0.55347551, 0.55978988, 0.5659286 , 0.57194729, 0.57772831,
       0.58346763, 0.58908704, 0.59442783, 0.59958111, 0.60453504,
       0.60929639, 0.61400979, 0.61856255, 0.62302382, 0.62740516,
       0.63166332, 0.63581144, 0.63989447, 0.64391433, 0.64791275,
       0.65181142, 0.65565119, 0.65939524, 0.66306164, 0.66665987,
       0.67018019, 0.67367133, 0.67711953, 0.6805154 , 0.68381953,
       0.68704484, 0.69023082, 0.6933677 , 0.69643566, 0.69947982,
       0.70244198, 0.70537531, 0.70830579, 0.7111456 , 0.71396014,
       0.7167084 , 0.71941461, 0.72209171, 0.72472457, 0.72727328,
       0.7298145 , 0.73234449, 0.73484594, 0.73728966, 0.73968

In [13]:
# Number of principal components the fitted PCA kept.
pca.n_components_

285

In [14]:
# Component matrix of the fitted PCA (one row per component).
# NOTE(review): this displays the raw array; `pca.components_.shape` would be a
# more compact check if only the dimensions are of interest.
pca.components_

array([[-0.02385167, -0.03072939, -0.00656754, ...,  0.05843133,
         0.00254016, -0.04014868],
       [-0.02469892, -0.02670318,  0.00311357, ..., -0.00916087,
        -0.05180924,  0.10339042],
       [ 0.09078885,  0.09676345, -0.0121031 , ...,  0.05100942,
        -0.03403788,  0.01732615],
       ...,
       [ 0.00920877,  0.0090221 ,  0.00991575, ...,  0.07297894,
        -0.00481444, -0.00571308],
       [ 0.02245378,  0.00249235, -0.01952136, ...,  0.05318814,
        -0.00792577, -0.00645957],
       [ 0.01668788,  0.01486754,  0.0075636 , ...,  0.02162064,
         0.01546996,  0.00531887]])

In [15]:
# Project each split with the pipeline that was fitted on the training data.
train_featues_pca, test_featues_pca, val_featues_pca = (
    pipeline.transform(split_features)
    for split_features in (train_featues, test_featues, val_featues)
)

In [16]:
def _build_pca_frame(features, keys):
    """Project one split with the fitted pipeline and label the result.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature columns of the split; passed to ``pipeline.transform``.
    keys : pandas.DataFrame
        Frame that holds the ``patient`` and ``label`` columns of the same
        split, in the same row order as ``features``.

    Returns
    -------
    pandas.DataFrame
        Columns ``PC1`` .. ``PCk`` followed by ``patient`` and ``label``.
    """
    scores = pipeline.transform(features)
    frame = pd.DataFrame(
        scores,
        columns=[f"PC{i}" for i in range(1, scores.shape[1] + 1)],
    )
    # Assign by position so the identifiers line up row by row whatever index
    # `keys` carries.
    frame["patient"] = keys["patient"].to_numpy()
    frame["label"] = keys["label"].to_numpy()
    return frame


# Build the frames from the raw feature tables instead of re-wrapping the
# existing *_featues_pca variables. Re-wrapping made the cell unsafe to re-run:
# on a second execution "patient"/"label" were renamed to PC columns and then
# appended again.
train_featues_pca = _build_pca_frame(train_featues, df_train)
test_featues_pca = _build_pca_frame(test_featues, df_test)
val_featues_pca = _build_pca_frame(val_featues, df_val)

# to_csv does not create missing folders, so make sure the target exists.
os.makedirs("output/features/transform", exist_ok=True)
train_featues_pca.to_csv("output/features/transform/train_features_pca.csv", index=False)
test_featues_pca.to_csv("output/features/transform/test_features_pca.csv", index=False)
val_featues_pca.to_csv("output/features/transform/val_features_pca.csv", index=False)