In [115]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [116]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [117]:
df_diabetes = pd.read_csv("/content/drive/MyDrive/Semana4-KNNySVM/data/diabetes.csv")
df_diabetes.head(10)

Unnamed: 0.1,Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0,6,148,72.0,35.0,0.0,33.6,0.627,50,1
1,1,1,85,66.0,29.0,0.0,26.6,0.351,31,0
2,2,8,183,64.0,0.0,0.0,23.3,0.672,32,1
3,3,1,89,66.0,23.0,94.0,28.1,0.167,21,0
4,4,0,137,40.0,35.0,168.0,43.1,2.288,33,1
5,5,5,116,74.0,0.0,0.0,25.6,0.201,30,0
6,6,3,78,50.0,32.0,88.0,31.0,0.248,26,1
7,7,10,115,,,,35.3,0.134,29,0
8,8,2,197,70.0,45.0,543.0,30.5,0.158,53,1
9,9,8,125,96.0,,,,0.232,54,1


In [118]:
df_diabetes.corr()

Unnamed: 0.1,Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Unnamed: 0,1.0,-0.037201,0.012994,0.013177,-0.015267,-0.005718,0.003421,-0.040326,0.007714,-0.045184
Pregnancies,-0.037201,1.0,0.129459,0.151388,-0.061487,-0.068015,0.024504,-0.033523,0.544341,0.221898
Glucose,0.012994,0.129459,1.0,0.153012,0.062008,0.333955,0.224176,0.137337,0.263514,0.466581
BloodPressure,0.013177,0.151388,0.153012,1.0,0.216862,0.089877,0.297168,0.036845,0.239872,0.062199
SkinThickness,-0.015267,-0.061487,0.062008,0.216862,1.0,0.429934,0.399942,0.183921,-0.10626,0.083108
Insulin,-0.005718,-0.068015,0.333955,0.089877,0.429934,1.0,0.197989,0.187177,-0.039496,0.131699
BMI,0.003421,0.024504,0.224176,0.297168,0.399942,0.197989,1.0,0.138354,0.046185,0.303572
DiabetesPedigreeFunction,-0.040326,-0.033523,0.137337,0.036845,0.183921,0.187177,0.138354,1.0,0.033561,0.173844
Age,0.007714,0.544341,0.263514,0.239872,-0.10626,-0.039496,0.046185,0.033561,1.0,0.238356
Outcome,-0.045184,0.221898,0.466581,0.062199,0.083108,0.131699,0.303572,0.173844,0.238356,1.0


In [119]:
df_diabetes.drop("Unnamed: 0", axis=1, inplace=True)

In [120]:
df_diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             767 non-null    float64
 3   SkinThickness             756 non-null    float64
 4   Insulin                   763 non-null    float64
 5   BMI                       767 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(5), int64(4)
memory usage: 54.1 KB


In [121]:
df_diabetes[["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]] = df_diabetes[["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]].replace(0, np.NaN)

In [122]:
df_diabetes.isna().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

## Separar los datos en entrenamiento y prueba

In [123]:
train_set, test_set = train_test_split(df_diabetes, test_size=0.2, random_state=42, shuffle=True, stratify=df_diabetes['Outcome'])

### Utilice la siguiente clase para crear un pipeline que permita imputar un valor (mediana) a los datos faltantes y escalar los datos

In [124]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_array, check_is_fitted
from sklearn.impute import SimpleImputer

class ConditionalImputer(BaseEstimator, TransformerMixin):
    def __init__(self, strategy="mean", condition=None):
        self.strategy = strategy
        self.atributos = []
        self.condition = condition

    def fit(self, X, y=None):
        self.atributos = X.columns
        if not self.condition:
            self.imputer_ = SimpleImputer(strategy=self.strategy)
            self.imputer_.fit(X)
        else:
            self.target_ = X[self.condition].unique()
            if self.strategy == 'median':
                self.median_ = X.groupby([self.condition]).median().reset_index()
                
            elif self.strategy == 'mean':
                self.mean_ = X.groupby([self.condition]).mean().reset_index()
        return self
    
    def transform(self, X):
        check_is_fitted(self)
        X_copy = X.copy()
        if not self.condition:
            return self.imputer_.transform(X_copy)
        else:
            for target in self.target_:
                for atributo in self.atributos:
                    if self.strategy == 'median':
                        X_copy.loc[ (X_copy[self.condition]==target) & (X_copy[atributo].isna()), atributo ] = \
                        self.median_[atributo][target]
                    elif self.strategy == 'mean':
                        X_copy.loc[ (X_copy[self.condition]==target) & (X_copy[atributo].isna()), atributo ] = \
                        self.mean_[atributo][target]
            X_copy.drop(self.condition, axis=1, inplace=True)
            return X_copy

#Devuelve el conjunto de datos con los datos faltantes reemplazados por las medias/medianas de sus respectivas columnas dependiendo de la clase
#y sin la columna correspondiente al target

In [125]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline(
    [
        ('imputer', ConditionalImputer(strategy="median", condition="Outcome")),
        ('std_scaler', StandardScaler()),
    ]
)

In [126]:
diabetes_train_final = pipeline.fit_transform(train_set)
diabetes_test_final = pipeline.fit_transform(test_set)
df_train_final = pd.DataFrame(diabetes_train_final, columns=train_set.drop("Outcome", axis=1).columns)
df_test_final = pd.DataFrame(diabetes_test_final, columns=test_set.drop("Outcome", axis=1).columns)
df_test_final

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.852530,1.166257,-0.834758,-0.279808,-0.405021,-0.714430,-0.465625,0.634010
1,1.690549,-1.665822,2.881150,-0.750743,-0.997512,0.429575,-0.492533,1.245336
2,-0.544168,0.014753,0.226930,-0.279808,0.358036,0.486069,0.099438,-0.588642
3,0.852530,-0.234221,-0.834758,0.308859,0.142585,-0.714430,0.843887,0.110016
4,-0.544168,-1.479091,-0.126966,-0.279808,-0.405021,-0.354280,-1.039658,-0.937972
...,...,...,...,...,...,...,...,...
149,-0.264828,-1.043387,-1.365602,-2.163545,-0.952626,-1.081642,-0.546349,-0.937972
150,-0.544168,-0.856656,0.226930,-1.339410,-0.844900,-0.121242,0.595738,-0.850639
151,-0.544168,-1.230117,-1.896446,-0.868476,-0.405021,-0.559071,3.734980,-0.675975
152,0.014511,1.944301,0.403878,1.132994,1.049276,0.641428,-0.555318,-0.151981


## Regresión Logística con una variable

In [127]:
y = train_set["Outcome"]
X = df_train_final["Glucose"]
lrgd = LogisticRegression(max_iter=1000, random_state=1)
_ = lrgd.fit(X.to_numpy().reshape(-1, 1) , y)

## Regresión Logística con 2 variables

In [128]:
y_bi = train_set["Outcome"]
X_bi = pd.concat([df_train_final["Glucose"], df_train_final["BMI"]], axis=1)
lrgd_bi = LogisticRegression(max_iter=1000, random_state=1)
_ = lrgd_bi.fit(X_bi, y_bi)
X_bi

Unnamed: 0,Glucose,BMI
0,-1.057086,-0.766169
1,0.142577,-0.414469
2,-0.557227,0.362201
3,0.809056,-0.399815
4,-0.890466,1.783654
...,...,...
609,0.409168,-0.517048
610,-1.523621,0.230314
611,-0.823818,0.831134
612,-0.357283,-0.722206


## Regresión Logística Multivariada

In [129]:
y_multi = train_set["Outcome"]
X_multi = df_train_final
lrgd_multi = LogisticRegression(max_iter=1000, random_state=1)
lrgd_multi = lrgd_multi.fit(X_multi, y_multi)

### Medida de desempeño Accuracy 
(Tasa de aciertos)

$$Acc = \frac{aciertos}{N}$$

Utilice el método `score`de [LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) para calcular el accuracy en los 3 casos anteriores

In [130]:
print("Con set de entrenamiento:\n\tUnivariado:", lrgd.score(X.to_numpy().reshape(-1, 1), y))
print("\tBivariado:", lrgd_bi.score(X_bi, y_bi))
print("\tMultivariado:", lrgd_multi.score(X_multi, y_multi))

print("\n\nCon set de prueba:\n\tUnivariado:", lrgd.score(df_test_final["Glucose"].to_numpy().reshape(-1, 1), test_set["Outcome"]))
X_bi_test = pd.concat([df_test_final["Glucose"], df_test_final["BMI"]], axis=1)
print("\tBivariado:", lrgd_bi.score(X_bi_test, test_set["Outcome"]))
print("\tMultivariado:", lrgd_multi.score(df_test_final, test_set["Outcome"]))

Con set de entrenamiento:
	Univariado: 0.758957654723127
	Bivariado: 0.7833876221498371
	Multivariado: 0.7947882736156352


Con set de prueba:
	Univariado: 0.7077922077922078
	Bivariado: 0.7142857142857143
	Multivariado: 0.7142857142857143
