In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

Data loading

In [4]:
# Read the 'Raw Data' sheet of the CTG workbook: the first spreadsheet row
# supplies the column names and the row right after it is skipped.
df = pd.read_excel(
    'data/CTG.xls',
    sheet_name='Raw Data',
    header=0,
    skiprows=[1],
)
# Plain NumPy view of the table, used for positional column slicing below.
data = df.to_numpy()

In [5]:
# Quick inspection of the loaded array (NumPy abbreviates large arrays with "...").
data

array([[120., 120.,   0., ...,  73.,   1.,   2.],
       [132., 132.,   4., ...,  12.,   0.,   1.],
       [133., 133.,   2., ...,  13.,   0.,   1.],
       ...,
       [140., 140.,   1., ...,   4.,   1.,   2.],
       [140., 140.,   1., ...,   4.,   1.,   2.],
       [142., 142.,   1., ...,   1.,   0.,   1.]])

In [6]:
# Feature matrix: the first 22 columns (indices 0..21) of the raw array.
X = data[:, :22]
# Target vector: column index 23.
# NOTE(review): column index 22 ends up in neither X nor Y - confirm that
# dropping it is intentional and not an off-by-one in the feature slice.
Y = data[:, 23]

Training samples

In [7]:
# Stratified, shuffled hold-out split with a fixed seed so that the class
# proportions of Y are preserved in both partitions and the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=0,
    stratify=Y,
    shuffle=True,
)

In [8]:
# Oversample the training split only (the test split is left untouched).
# A fixed random_state makes the synthetic samples - and therefore every
# downstream result - reproducible across kernel restarts.
smt = SMOTE(random_state=0)
X_smt, Y_smt = smt.fit_resample(X_train, y_train)
X_smt.shape

(3723, 22)

In [9]:
# Class label / sample count table for the original training split.
unique, counts = np.unique(y_train, return_counts=True)
frqs = np.column_stack((unique, counts))
print('original data: \n', frqs)

original data: 
 [[1.000e+00 1.241e+03]
 [2.000e+00 2.210e+02]
 [3.000e+00 1.320e+02]]


In [10]:
# Class label / sample count table for the SMOTE-resampled training labels.
unique, counts = np.unique(Y_smt, return_counts=True)
frqs = np.column_stack((unique, counts))
print('smooted data: \n', frqs)

smooted data: 
 [[1.000e+00 1.241e+03]
 [2.000e+00 1.241e+03]
 [3.000e+00 1.241e+03]]


In [11]:
# Class label / sample count table for the held-out test split.
unique, counts = np.unique(y_test, return_counts=True)
frqs = np.column_stack((unique, counts))
print('test data: \n', frqs)

test data: 
 [[  1. 414.]
 [  2.  74.]
 [  3.  44.]]


Models' validation

Quadratic discriminant analysis

In [74]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from  sklearn.metrics import accuracy_score
import time

In [79]:
def train_quad_anal(n_comp, X, Y):
    """Evaluate Quadratic Discriminant Analysis with 5-fold cross-validation.

    For every fold the features are standardised with statistics computed on
    the training part only, a fresh QDA model is fitted on the training part
    and its accuracy is measured on the held-out part.

    Parameters
    ----------
    n_comp : any
        Currently unused; kept so existing callers keep working.
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    Y : array-like of shape (n_samples,)
        Class labels. They are passed to the classifier unchanged (labels
        must never be standardised).

    Returns
    -------
    tuple of float
        ``(mean accuracy, std of accuracy, mean fit time in seconds)`` over
        the 5 folds.
    """
    X = np.asarray(X)
    Y = np.asarray(Y)

    n_folds = 5
    accuracies = np.zeros(n_folds)
    times = np.zeros(n_folds)

    # Shuffle before splitting: class-ordered data (e.g. SMOTE output, which
    # appends the synthetic samples at the end) would otherwise yield folds
    # that are missing whole classes. The fixed seed keeps runs repeatable.
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=0)
    for j, (train_index, test_index) in enumerate(kf.split(X)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]

        # Fit the scaler on the training fold only and reuse those statistics
        # for the held-out fold, so no test information leaks into training.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        qda = QuadraticDiscriminantAnalysis()

        # Computational cost: time the training step only.
        tiempo_i = time.perf_counter()
        qda.fit(X_train, y_train)
        times[j] = time.perf_counter() - tiempo_i

        # Score on the held-out fold with the model trained on the other folds.
        accuracies[j] = accuracy_score(y_true=y_test, y_pred=qda.predict(X_test))

    return np.mean(accuracies), np.std(accuracies), np.mean(times)

Parzen window