# Modelo de Elección Libre: Regresión Logística

## Importación de librerías

In [99]:
import pandas as pd
pd.set_option('display.max_columns', 25) # Número máximo de columnas a mostrar
pd.set_option('display.max_rows', 50) # Numero máximo de filas a mostar
import numpy as np
np.random.seed(3301)
import pandas as pd
# Para preparar los datos
from sklearn.preprocessing import LabelEncoder
# Para crear el arbol de decisión 
from sklearn.tree import DecisionTreeClassifier 
# Para realizar la separación del conjunto de aprendizaje en entrenamiento y test.
from sklearn.model_selection import train_test_split
# Para evaluar el modelo
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import plot_confusion_matrix
# Para búsqueda de hiperparámetros
from sklearn.model_selection import GridSearchCV
# Para la validación cruzada
from sklearn.model_selection import KFold 
#Librerías para la visualización
import matplotlib as mplt
import matplotlib.pyplot as plt
# Seaborn
import seaborn as sns 
from sklearn import tree

#Regresion Lineal
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

#Regresion Logistica
from sklearn import metrics 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.weightstats import ttest_ind


## Carga de datos

In [100]:
# Se cargan los datos. 
datos=pd.read_csv('datos.csv', sep=';', encoding = 'utf-8')
datos.head()

Unnamed: 0,HairColor,Pregnancies,Glucose,City,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,Red,6.0,148.0,New York,72.0,35.0,0.0,336,627.0,50,1.0
1,Black,1.0,85.0,New York,66.0,29.0,0.0,266,351.0,31,0.0
2,Red,8.0,183.0,New York,64.0,0.0,0.0,233,672.0,32,1.0
3,Black,1.0,89.0,New York,66.0,23.0,94.0,281,167.0,21,0.0
4,Black,0.0,137.0,New York,40.0,35.0,168.0,431,2288.0,33,1.0


In [101]:
datos.shape

(768, 11)

In [102]:
# Podemos ver los tipos de todas la variables.
datos.dtypes

HairColor                    object
Pregnancies                 float64
Glucose                     float64
City                         object
BloodPressure               float64
SkinThickness               float64
Insulin                     float64
BMI                           int64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                     float64
dtype: object

In [103]:
# Y hacer una descripción de los datos
datos.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,767.0,767.0,767.0,767.0,767.0,768.0,767.0,768.0,767.0
mean,3.839635,120.921773,69.09648,20.563233,79.90352,289.796875,432.395046,38.011719,0.349413
std,3.368429,31.984561,19.366833,15.945349,115.283105,116.757554,336.144934,117.8256,0.477096
min,0.0,0.0,0.0,0.0,0.0,0.0,1.0,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,251.75,205.5,24.0,0.0
50%,3.0,117.0,72.0,23.0,32.0,309.0,337.0,29.0,0.0
75%,6.0,140.5,80.0,32.0,127.5,359.0,592.0,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,671.0,2329.0,3256.0,1.0


In [104]:
# Se observa que hay ausencias:
datos.isnull().sum()

HairColor                   0
Pregnancies                 1
Glucose                     1
City                        1
BloodPressure               1
SkinThickness               1
Insulin                     1
BMI                         0
DiabetesPedigreeFunction    1
Age                         0
Outcome                     1
dtype: int64

### Limpieza

In [105]:
# Es recomendable que todos los pasos de limpieza y preparación se realicen sobre otro archivo.
datoslimpios = datos
datoslimpios

Unnamed: 0,HairColor,Pregnancies,Glucose,City,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,Red,6.0,148.0,New York,72.0,35.0,0.0,336,627.0,50,1.0
1,Black,1.0,85.0,New York,66.0,29.0,0.0,266,351.0,31,0.0
2,Red,8.0,183.0,New York,64.0,0.0,0.0,233,672.0,32,1.0
3,Black,1.0,89.0,New York,66.0,23.0,94.0,281,167.0,21,0.0
4,Black,0.0,137.0,New York,40.0,35.0,168.0,431,2288.0,33,1.0
...,...,...,...,...,...,...,...,...,...,...,...
763,Black,10.0,101.0,New York,76.0,48.0,180.0,329,171.0,63,0.0
764,Black,2.0,122.0,New York,70.0,27.0,0.0,368,34.0,27,0.0
765,Red,5.0,121.0,New York,72.0,23.0,112.0,262,245.0,30,0.0
766,Black,1.0,126.0,New York,60.0,0.0,0.0,301,349.0,47,1.0


In [106]:
# Eliminación registros con ausencias
#¿Porqué realizar este paso?
datoslimpios = datoslimpios.dropna()

In [107]:
# Cantidad de datos y número de variables
datoslimpios.shape

(767, 11)

In [108]:
#Restricciones

#Edad menor a 100 y mayor a 21
datoslimpios = datoslimpios[datoslimpios.Age < 100]
datoslimpios = datoslimpios[datoslimpios.Age > 21]

#BMI no puede ser 0
datoslimpios = datoslimpios[datoslimpios.BMI >= 8]

#Glucosa no puede ser 0
datoslimpios = datoslimpios[datoslimpios.Glucose > 0]

#BloodPressure no puede ser 0
datoslimpios = datoslimpios[datoslimpios.BloodPressure > 0]

#BMI no puede ser 0
# datoslimpios = datoslimpios[datoslimpios.Insulin > 0]

#Quitar color de pelo y ciudad
datoslimpios = datoslimpios.drop(['HairColor'], axis=1)
datoslimpios = datoslimpios.drop(['City'], axis=1)

datoslimpios

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,0.0,336,627.0,50,1.0
1,1.0,85.0,66.0,29.0,0.0,266,351.0,31,0.0
2,8.0,183.0,64.0,0.0,0.0,233,672.0,32,1.0
4,0.0,137.0,40.0,35.0,168.0,431,2288.0,33,1.0
5,5.0,116.0,74.0,0.0,0.0,256,201.0,30,0.0
...,...,...,...,...,...,...,...,...,...
763,10.0,101.0,76.0,48.0,180.0,329,171.0,63,0.0
764,2.0,122.0,70.0,27.0,0.0,368,34.0,27,0.0
765,5.0,121.0,72.0,23.0,112.0,262,245.0,30,0.0
766,1.0,126.0,60.0,0.0,0.0,301,349.0,47,1.0


## Construcción del modelo

In [109]:
dataX =datoslimpios[["Outcome"]]
X_train = np.array(dataX)
y_train = datoslimpios['BloodPressure'].values
 
# Creamos el objeto de Regresión Linear
regr = linear_model.LinearRegression()
 
# Entrenamos nuestro modelo
regr.fit(X_train, y_train)
 
# Hacemos las predicciones que en definitiva una línea (en este caso, al ser 2D)
y_pred = regr.predict(X_train)
 
# Veamos los coeficienetes obtenidos, En nuestro caso, serán la Tangente
print('Coefficients: \n', regr.coef_)
# Este es el valor donde corta el eje Y (en X=0)
print('Independent term: \n', regr.intercept_)
# Error Cuadrado Medio
print("Mean squared error: %.2f" % mean_squared_error(y_train, y_pred))
# Puntaje de Varianza. El mejor puntaje es un 1.0
print('Variance score: %.2f' % r2_score(y_train, y_pred))

Coefficients: 
 [3.86873536]
Independent term: 
 71.47142857142856
Mean squared error: 147.13
Variance score: 0.02


In [110]:
modelname="Regresion Logistica"

In [111]:
x = datoslimpios.drop('Outcome',axis = 1)
y = datoslimpios.Outcome

In [112]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=4)

In [113]:
lr = LogisticRegression(solver='lbfgs',max_iter=1000)

In [114]:
lr.fit(x_train,y_train)

LogisticRegression(max_iter=1000)

In [115]:
y_pred=lr.predict(x_test)

In [116]:
# Información del modelo
# ==============================================================================
print("Intercept:", lr.intercept_)
print("Coeficiente:", list(zip(x.columns, lr.coef_.flatten(), )))
print("Accuracy:", lr.score(x, y))

Intercept: [-5.73956645]
Coeficiente: [('Pregnancies', 0.08766362217511935), ('Glucose', 0.031067724743495635), ('BloodPressure', -0.0057010252105401645), ('SkinThickness', 0.015529699519410462), ('Insulin', -0.0007732721552504637), ('BMI', 0.0006887392511786101), ('DiabetesPedigreeFunction', 0.0006724547842818316), ('Age', 0.014007010811195836)]
Accuracy: 0.7439759036144579


In [117]:
# Predicciones probabilísticas
# ==============================================================================
# Con .predict_proba() se obtiene, para cada observación, la probabilidad predicha
# de pertenecer a cada una de las dos clases.
predicciones = lr.predict_proba( x_test)
predicciones = pd.DataFrame(predicciones, columns = lr.classes_)
predicciones.head(5)

Unnamed: 0,0.0,1.0
0,0.837125,0.162875
1,0.702828,0.297172
2,0.80554,0.19446
3,0.530132,0.469868
4,0.700286,0.299714


In [118]:
# Creación del modelo utilizando matrices como en scikitlearn
# ==============================================================================
# A la matriz de predictores se le tiene que añadir una columna de 1s para el intercept del modelo
x_train = sm.add_constant(x_train, prepend=True)
lr = sm.Logit(endog=y_train, exog=x_train,)
lr = lr.fit()
print(lr.summary())

Optimization terminated successfully.
         Current function value: 0.525860
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                Outcome   No. Observations:                  498
Model:                          Logit   Df Residuals:                      489
Method:                           MLE   Df Model:                            8
Date:                Fri, 03 Sep 2021   Pseudo R-squ.:                  0.1889
Time:                        12:20:53   Log-Likelihood:                -261.88
converged:                       True   LL-Null:                       -322.86
Covariance Type:            nonrobust   LLR p-value:                 1.304e-22
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                       -5.7396      0.800     -7.176      0.000      -7.307

In [119]:
# Intervalos de confianza para los coeficientes del modelo
# ==============================================================================
intervalos_ci = lr.conf_int(alpha=0.05)
intervalos_ci = pd.DataFrame(intervalos_ci)
intervalos_ci.columns = ['2.5%', '97.5%']
intervalos_ci


Unnamed: 0,2.5%,97.5%
const,-7.307282,-4.17201
Pregnancies,0.015434,0.160112
Glucose,0.022724,0.039411
BloodPressure,-0.025814,0.014419
SkinThickness,5e-06,0.031044
Insulin,-0.002802,0.001256
BMI,-0.001408,0.002785
DiabetesPedigreeFunction,4.1e-05,0.001304
Age,-0.008698,0.036687
