In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the Pima Indians diabetes data set.
# header=0 together with names= replaces the header row found in the file
# with the column names listed here.

column_names = [
    'pregnant', 'glucose', 'pressure', 'triceps', 'insulin',
    'mass', 'pedigree', 'age', 'diabetes',
]

dataset = pd.read_csv(
    'pima-indians-diabetes.csv',
    encoding="ISO-8859-1",
    delimiter=',',
    header=0,
    names=column_names,
)

dataset


In [None]:
#Tomamos las variable edad y glucosa en plasma para clasificar diabetes

#Importamos las librerias

from sklearn.model_selection import train_test_split
from sklearn import linear_model
import sklearn.metrics as sm

In [None]:
# Scatter plot of the two predictor variables, coloured by the diabetes label.

plt.figure(figsize=(8,8))
puntos = plt.scatter(dataset['glucose'], dataset['age'], c=dataset['diabetes'])
plt.xlabel('Glucose')
plt.ylabel('Age')

# Passing the array of unique classes as `label=` produced a single legend
# entry showing the array's text. legend_elements() instead returns one
# handle/label pair per distinct value of the colour-mapped column.
handles, labels = puntos.legend_elements()
plt.legend(handles, labels, title='diabetes')
plt.show()

In [None]:
# Split the data into training and test subsets (test_size=0.2, fixed seed).

features = dataset[['glucose', 'age']]
target = dataset['diabetes']

X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=123
)

# Print each of the four pieces, in the same order they were returned.
for subset in (X_train, X_test, y_train, y_test):
    print(subset)

## Clasificación: Modelo Logístico

In [None]:
# Create the logistic-regression classifier (default settings) ...

clasificador = linear_model.LogisticRegression()

# ... and train it on the training split. fit() returns the estimator itself,
# so the cell output is the fitted model.

clasificador.fit(X=X_train, y=y_train)

In [None]:
# Predict the class of every row in the test split.

pred = clasificador.predict(X=X_test)


In [None]:
# Plot the decision regions of the logistic model together with the data.
# Every point of a mesh covering [x_min, x_max] x [y_min, y_max] is classified
# and the mesh is coloured by the predicted class.

# The bounds cover both splits, because training AND test points are drawn
# below; using only the test split could leave training points off the mesh.
puntos = pd.concat([X_train, X_test])

x_min, x_max = puntos['glucose'].min() - 0.5, puntos['glucose'].max() + 0.5
y_min, y_max = puntos['age'].min() - 0.5, puntos['age'].max() + 0.5

h = 0.1  # step size in the mesh
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# The classifier was fitted on a DataFrame, so the mesh is wrapped in a frame
# with the same column names (glucose on x, age on y) before predicting.
malla = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=X_train.columns)
Z = clasificador.predict(malla)

# Put the result into a color plot
Z = Z.reshape(xx.shape)
# A single figure: a second plt.figure() call here would leave this one empty
# and drop the requested size.
plt.figure(figsize=(8,8))
plt.pcolormesh(xx, yy, Z, cmap = "PuRd", shading ='auto')

# Training points, coloured by their true label
plt.scatter(X_train['glucose'],X_train['age'], c=y_train, edgecolors="k", cmap="PuRd",label='Training Points')
# Test points, coloured by the class the model predicted for them
plt.scatter(X_test['glucose'],X_test['age'], c=pred, edgecolors="k", cmap="PuRd",marker="^",label='Test Points')
plt.legend(loc="upper left")
plt.xlabel("Glucose")
plt.ylabel("Age")
plt.show()


In [None]:
# Error metrics of the logistic model on the test split:
# accuracy and area under the ROC curve.
from sklearn.metrics import roc_auc_score

accuracy = clasificador.score(X_test,y_test)

# The ROC curve is traced by sweeping a threshold over continuous scores.
# Hard 0/1 predictions give a single operating point, so the score passed in
# is the predicted probability of the second class in clasificador.classes_
# (assumed to be the positive / diabetic class).
roc = roc_auc_score(y_test, clasificador.predict_proba(X_test)[:, 1])

print("El accuracy del modelo es =", round(accuracy, 2)) 
print("El área bajo la curva =", round(roc, 2))


In [None]:
# Estimate the logistic model's error with 5-fold cross-validation
# on the training split.

from sklearn.model_selection import KFold

# Fold iterator (consecutive folds, no shuffling)
kf = KFold(n_splits=5, random_state=None, shuffle=False)

# One entry per fold
accuracy = []
roc = []

# KFold.split yields POSITIONAL indices into X_train. They must be applied
# with .iloc on the training split itself; using them as labels on the full
# `dataset` would select other rows (including rows of the held-out test set).
# Fold-local names are used so the outer y_train / y_test are not overwritten.
for train_index, test_index in kf.split(X_train):
    X_fold_train, X_fold_val = X_train.iloc[train_index], X_train.iloc[test_index]
    y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[test_index]
    # NOTE: this refits `clasificador` in place on each fold.
    clasificador.fit(X_fold_train, y_fold_train)
    accuracy.append(clasificador.score(X_fold_val, y_fold_val))
    # AUC is computed from class probabilities, not from hard predictions.
    roc.append(roc_auc_score(y_fold_val, clasificador.predict_proba(X_fold_val)[:, 1]))

In [None]:
# Average each per-fold metric list, then report both means rounded to two decimals.
accuracy_mean, roc_mean = (sum(valores) / len(valores) for valores in (accuracy, roc))

print("El accuracy promedio es: ", round(accuracy_mean,2))
print("El area bajo la curva promedio es: ", round(roc_mean,2))

## Modelo Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree limited to three levels
tree = DecisionTreeClassifier(max_depth=3)

# Re-create the train/test split (same columns, seed and test fraction as before)

predictores = dataset[['glucose', 'age']]
objetivo = dataset['diabetes']
X_train, X_test, y_train, y_test = train_test_split(
    predictores, objetivo, test_size=0.2, random_state=123
)

# Train the tree on the training split

tree.fit(X=X_train, y=y_train)

# Predicted classes for the test split (shown as the cell output)

tree.predict(X=X_test)

In [None]:
# Cross-validated error metrics for the decision tree (5 folds) on the
# training split.

# Labels for the rows of X_train are looked up by row label in `dataset`,
# so this cell gives the same result when re-run even though the loop below
# rebinds y_train.
X_cv = X_train
y_cv = dataset.loc[X_train.index, 'diabetes']

# Fold iterator (consecutive folds, no shuffling)
kf = KFold(n_splits=5, random_state=None, shuffle=False)

# One entry per fold
accuracy = []
roc = []

# KFold.split yields POSITIONAL indices, so rows are selected with .iloc on
# the training split; indexing the full `dataset` with them via .loc would
# pick unrelated rows. The fold variables keep the names x_train / x_test /
# y_train / y_test because the grid-search and final-model cells further down
# read them after this loop finishes.
for train_index, test_index in kf.split(X_cv):
    x_train, x_test = X_cv.iloc[train_index], X_cv.iloc[test_index]
    y_train, y_test = y_cv.iloc[train_index], y_cv.iloc[test_index]
    tree.fit(x_train, y_train)
    # Score the tree itself (the logistic `clasificador` was scored here before).
    accuracy.append(tree.score(x_test, y_test))
    # AUC is computed from class probabilities, not from hard predictions.
    roc.append(roc_auc_score(y_test, tree.predict_proba(x_test)[:, 1]))


In [None]:
# Mean of each per-fold metric, printed with two decimals.
def _promedio(valores):
    """Return the arithmetic mean of a non-empty list of numbers."""
    return sum(valores) / len(valores)

accuracy_mean = _promedio(accuracy)
print("El accuracy promedio es: ", round(accuracy_mean,2))

roc_mean = _promedio(roc)
print("El area bajo la curva promedio es: ", round(roc_mean,2))

In [None]:
# Hyper-parameter grid for the tree: maximum depth and split criterion

param_grid = dict(
    max_depth=list(range(4, 9)),
    criterion=['gini', 'entropy'],
)

In [None]:
from sklearn.model_selection import GridSearchCV #grid search

# Grid search over the tree hyper-parameters with 5-fold cross-validation.

# random_state is pinned (same seed as the train/test split) so that ties
# between equally good splits are broken the same way on every run and the
# search result is reproducible.
arbol = DecisionTreeClassifier(random_state=123)

CV_dt = GridSearchCV(estimator=arbol, param_grid=param_grid, cv= 5)

# NOTE(review): x_train / y_train are the variables left in the namespace by
# the last iteration of the preceding K-fold loop - confirm that searching on
# this subset (rather than the whole training split) is intended.
CV_dt.fit(x_train, y_train)

In [None]:
# Show the estimator that the grid search selected as best

CV_dt.best_estimator_

In [None]:
# Inspect the cross-validation scores of every parameter combination,
# best mean test score first.

resultados = pd.DataFrame(CV_dt.cv_results_)

# Keep the parameter columns and the mean/std score columns ...
columnas = resultados.filter(regex = '(param*|mean_t|std_t)')
# ... without the dict-valued 'params' column
columnas = columnas.drop(columns = 'params')
columnas.sort_values('mean_test_score', ascending = False)

In [None]:
# Fit the final tree with the hyper-parameters that the grid search ranked best.
# They are read from CV_dt.best_params_ instead of being typed in by hand, so
# this cell cannot drift out of sync with the search when the notebook is
# re-run. random_state pins how ties between equally good splits are broken.

model = DecisionTreeClassifier(**CV_dt.best_params_, random_state=123)

model.fit(x_train,y_train)

# Predicted classes for x_test (shown as the cell output)
model.predict(x_test)


In [None]:
# Performance metrics of the tuned tree

accuracy_opt = model.score(x_test, y_test)

# AUC is computed from the predicted probability of the second class in
# model.classes_ (assumed to be the positive / diabetic class); hard 0/1
# predictions would reduce the ROC curve to a single threshold.
roc_opt = roc_auc_score(y_test, model.predict_proba(x_test)[:, 1])

print("Una vez optimizado el modelo tiene un accuracy de:",round(accuracy_opt,2))
print("Una vez optimizado el modelo el area bajo la curva es:",round(roc_opt,2))