In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [3]:
df_diabetes = pd.read_csv("../../Semana2/data/diabetes.csv")
df_diabetes.head(10)

Unnamed: 0.1,Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0,6,148,72.0,35.0,0.0,33.6,0.627,50,1
1,1,1,85,66.0,29.0,0.0,26.6,0.351,31,0
2,2,8,183,64.0,0.0,0.0,23.3,0.672,32,1
3,3,1,89,66.0,23.0,94.0,28.1,0.167,21,0
4,4,0,137,40.0,35.0,168.0,43.1,2.288,33,1
5,5,5,116,74.0,0.0,0.0,25.6,0.201,30,0
6,6,3,78,50.0,32.0,88.0,31.0,0.248,26,1
7,7,10,115,,,,35.3,0.134,29,0
8,8,2,197,70.0,45.0,543.0,30.5,0.158,53,1
9,9,8,125,96.0,,,,0.232,54,1


In [4]:
df_diabetes.drop("Unnamed: 0", axis=1, inplace=True)

In [5]:
df_diabetes[["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]] = df_diabetes[["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]].replace(0, np.NaN)

In [6]:
train_set, test_set = train_test_split(df_diabetes, test_size=0.2, random_state=42, shuffle=True, stratify=df_diabetes['Outcome'])

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_array, check_is_fitted
from sklearn.impute import SimpleImputer

class ConditionalImputer(BaseEstimator, TransformerMixin):
    def __init__(self, strategy="mean", condition=None):
        self.strategy = strategy
        self.atributos = []
        self.condition = condition

    def fit(self, X, y=None):
        self.atributos = X.columns
        if not self.condition:
            self.imputer_ = SimpleImputer(strategy=self.strategy)
            self.imputer_.fit(X)
        else:
            self.target_ = X[self.condition].unique()
            if self.strategy == 'median':
                self.median_ = X.groupby([self.condition]).median().reset_index()
                
            elif self.strategy == 'mean':
                self.mean_ = X.groupby([self.condition]).mean().reset_index()
        return self
    
    def transform(self, X):
        check_is_fitted(self)
        X_copy = X.copy()
        if not self.condition:
            return self.imputer_.transform(X_copy)
        else:
            for target in self.target_:
                for atributo in self.atributos:
                    if self.strategy == 'median':
                        X_copy.loc[ (X_copy[self.condition]==target) & (X_copy[atributo].isna()), atributo ] = \
                        self.median_[atributo][target]
                    elif self.strategy == 'mean':
                        X_copy.loc[ (X_copy[self.condition]==target) & (X_copy[atributo].isna()), atributo ] = \
                        self.mean_[atributo][target]
            X_copy.drop(self.condition, axis=1, inplace=True)
            return X_copy


In [8]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline(
    [
        ('imputer', ConditionalImputer(strategy="median", condition="Outcome")),
        ('std_scaler', StandardScaler()),
    ]
)

In [9]:
diabetes_train_final = pipeline.fit_transform(train_set)
diabetes_test_final = pipeline.fit_transform(test_set)
df_train_final = pd.DataFrame(diabetes_train_final, columns=train_set.drop("Outcome", axis=1).columns)
df_test_final = pd.DataFrame(diabetes_test_final, columns=test_set.drop("Outcome", axis=1).columns)

# Clasificacion con regresión logística

## Con una variable

In [10]:
y = train_set["Outcome"]
X = df_train_final["Glucose"]
lrgd = LogisticRegression(max_iter=1000, random_state=1)
lrgd = lrgd.fit(X.to_numpy().reshape(-1, 1) , y)

## Con dos variables

In [11]:
y_bi = train_set["Outcome"]
X_bi = pd.concat([df_train_final["Glucose"], df_train_final["BMI"]], axis=1)
lrgd_bi = LogisticRegression(max_iter=1000, random_state=1)
lrgd_bi = lrgd_bi.fit(X_bi, y_bi)

## Multivariada

In [12]:
y_multi = train_set["Outcome"]
X_multi = df_train_final
lrgd_multi = LogisticRegression(max_iter=1000, random_state=1)
lrgd_multi = lrgd_multi.fit(X_multi, y_multi)

### Medida de desempeño

In [13]:
print("Con set de entrenamiento:\n\tUnivariado:", lrgd.score(X.to_numpy().reshape(-1, 1), y))
print("\tBivariado:", lrgd_bi.score(X_bi, y_bi))
print("\tMultivariado:", lrgd_multi.score(X_multi, y_multi))

print("\n\nCon set de prueba:\n\tUnivariado:", lrgd.score(df_test_final["Glucose"].to_numpy().reshape(-1, 1), test_set["Outcome"]))
X_bi_test = pd.concat([df_test_final["Glucose"], df_test_final["BMI"]], axis=1)
print("\tBivariado:", lrgd_bi.score(X_bi_test, test_set["Outcome"]))
print("\tMultivariado:", lrgd_multi.score(df_test_final, test_set["Outcome"]))

Con set de entrenamiento:
	Univariado: 0.758957654723127
	Bivariado: 0.7833876221498371
	Multivariado: 0.7947882736156352


Con set de prueba:
	Univariado: 0.7077922077922078
	Bivariado: 0.7142857142857143
	Multivariado: 0.7142857142857143


# Clasificación con KNN

In [14]:
target_val = test_set["Outcome"]

In [15]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

parametrosKNN = {'n_neighbors': list(np.arange(1, 10))}
knn = KNeighborsClassifier()
gridKNN = GridSearchCV(knn, parametrosKNN)


In [16]:
gridKNN.fit(df_train_final, train_set["Outcome"])

In [17]:
gridKNN.best_params_

{'n_neighbors': 7}

In [18]:
mejorKNN = KNeighborsClassifier(n_neighbors=7)
mejorKNN.fit(df_train_final, train_set["Outcome"])

In [19]:
from sklearn.metrics import r2_score
print(f"R2 score = {r2_score(mejorKNN.predict(df_test_final), target_val)}")

R2 score = -0.09887869520897019


# Clasificación con SVM

In [20]:
from sklearn.svm import SVC


cs = np.linspace(0.1, 5, 20)
gammas = np.linspace(0.1, 5, 20)
parametros = {'kernel':('linear', 'rbf'), 'C':cs, 'gamma':gammas}

svm = SVC()
grid = GridSearchCV(svm, parametros)
grid.fit(df_train_final, train_set["Outcome"])

In [21]:
grid.best_params_

{'C': 2.421052631578948, 'gamma': 0.1, 'kernel': 'rbf'}

In [22]:
mejorSVM = SVC(C=grid.best_params_["C"], gamma=grid.best_params_["gamma"], kernel=grid.best_params_["kernel"])
mejorSVM.fit(df_train_final, train_set["Outcome"])
print(f"R2 score con test = {r2_score(mejorSVM.predict(df_test_final), target_val)}")
train_target = train_set["Outcome"]
print(f"R2 score con train = {r2_score(mejorSVM.predict(df_train_final), train_target)}")

R2 score con test = 0.13197278911564636
R2 score con train = 0.6073796892762411


# Clasificación con Perceptron

In [43]:
from sklearn.linear_model import Perceptron

iteraciones = np.arange(20, 300, 10)
etas = np.linspace(0.01, 1, 10)
randomstates = np.arange(1, 20, 3)
shuffle = [True, False]
paramPerc = {'max_iter' : iteraciones, 'eta0' : etas, 'random_state' : randomstates, 'shuffle' : shuffle}

perceptron = Perceptron()
gridPerc = GridSearchCV(perceptron, paramPerc)

In [44]:
gridPerc.fit(df_train_final, train_set["Outcome"])

In [45]:
gridPerc.best_params_

{'eta0': 0.01, 'max_iter': 20, 'random_state': 16, 'shuffle': True}

In [46]:
mejorPerc = Perceptron(eta0=gridPerc.best_params_["eta0"], max_iter=gridPerc.best_params_["max_iter"], random_state=gridPerc.best_params_["random_state"], shuffle=gridPerc.best_params_["shuffle"])
mejorPerc.fit(df_train_final, train_set["Outcome"])

# Cálculo de los distintos Accuracy


In [47]:
from sklearn.metrics import accuracy_score

nomClasificador = []
accuracy = []

clasificadores = [mejorKNN, mejorSVM, mejorPerc]

for clasificador in clasificadores:
    y_pred = clasificador.predict(df_test_final)
    accuracy.append(accuracy_score(target_val, y_pred))
    nomClasificador.append(str(clasificador))

In [48]:
pd.DataFrame({"Clasificador" : nomClasificador, "Accuracy" : accuracy})

Unnamed: 0,Clasificador,Accuracy
0,KNeighborsClassifier(n_neighbors=7),0.772727
1,"SVC(C=2.421052631578948, gamma=0.1)",0.811688
2,"Perceptron(eta0=0.01, max_iter=20, random_stat...",0.753247
