In [137]:
import numpy as np
import matplotlib as plt
from sklearn.datasets import load_iris
from sklearn.metrics import confusion_matrix
import pandas as pd
import seaborn as sns

<h3> Carga del dataset </h3>

In [138]:
# Data set cargado.
iris = load_iris()
x = iris.data
y = iris.target
# Transformelo a data frame para visualizarlo
df = pd.DataFrame(x,columns = iris.feature_names)
df['species_id'] = y
species_map = {0:'setosa',1:'versicolor',2:'virginica'}
df['species_name'] = df['species_id'].map(species_map)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species_id,species_name
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa


<h3>Clasificación según Setosa, Versicolor, Virginica</h3>

<h5>Umbral T de pertenencia a la clase definido para este caso es de 0.5</h5>

In [139]:
setosa_mask = df['species_name'] == 'setosa'
versicolor_mask = df['species_name'] == 'versicolor'
virginica_mask = df['species_name'] == 'virginica'

In [140]:
# Codigo Setosa y no Setosa
y_setosa = np.where(y==2, 1, y) 

# Codigo Versicolor y no Versicolor
y_versicolor = np.where(y==2, 0, y) 

# Codigo Virginica y no Virginica
y_virginica = np.where(y==1, 0, y)

<h3>Regresión Logistica</h3>

<h6> Caso Setosa <7h6>

In [141]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
x_train_setosa, x_test_setosa, y_train_setosa, y_test_setosa = train_test_split(x, y_setosa, test_size=0.33, random_state=42)
log_reg = LogisticRegression()
log_reg.fit(x_train_setosa,y_train_setosa)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [142]:
y_pred_setosa = np.where(log_reg.predict_proba(x_test_setosa)[:,1] > 0.5, 1, 0)
C1 = confusion_matrix(y_test_setosa, y_pred_setosa)
C1

array([[19,  0],
       [ 0, 31]], dtype=int64)

<h6>Caso Versicolor</h6>

In [143]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
x_train_versicolor, x_test_versicolor, y_train_versicolor, y_test_versicolor = train_test_split(x, y_versicolor, test_size=0.33, random_state=42)
log_reg1 = LogisticRegression()
log_reg1.fit(x_train_versicolor,y_train_versicolor)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [144]:
y_pred_versicolor = np.where(log_reg1.predict_proba(x_test_versicolor)[:,1] > 0.5, 1, 0)
C2 = confusion_matrix(y_test_versicolor, y_pred_versicolor)
C2

array([[30,  5],
       [10,  5]], dtype=int64)

<h6>Caso Virginica</h6>

In [145]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
x_train_virginica, x_test_virginica, y_train_virginica, y_test_virginica = train_test_split(x, y_virginica, test_size=0.33, random_state=42)
log_reg2 = LogisticRegression()
log_reg2.fit(x_train_virginica,y_train_virginica)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [146]:
y_pred_virginica = np.where(log_reg2.predict_proba(x_test_virginica)[:,1] > 0.5, 2, 0)
C3 = confusion_matrix(y_test_virginica, y_pred_virginica)
C3

array([[34,  0],
       [ 0, 16]], dtype=int64)

<h2>Métricas para mátrices de confusión</h2>

$$Precision = \frac{TP}{TP + FP}$$


$$Recall = \frac{TP}{TP + FN}$$

$$FMeasure = 2*\frac{precision*recall}{precision+recall}$$

$$P_1 = 1  , P_2 = 0.857  , P_3 = 1$$

$$R_1 = 1  ,  R_2 = 0.75  , R_3 = 1 $$

$$FM_1 = 1  ,  FM_2 = 0.799  , FM_3 = 1 $$

<h2> Cambio del umbral de probabilidad a 0.95 </h2>

<h6> Caso Setosa <7h6>

In [147]:
y_pred_setosa = np.where(log_reg.predict_proba(x_test_setosa)[:,1] > 0.95, 1, 0)
C1_95 = confusion_matrix(y_test_setosa, y_pred_setosa)
C1_95

array([[19,  0],
       [ 1, 30]], dtype=int64)

<h6>Caso Versicolor</h6>

In [148]:
y_pred_versicolor = np.where(log_reg1.predict_proba(x_test_versicolor)[:,1] > 0.95, 1, 0)
C2_95 = confusion_matrix(y_test_versicolor, y_pred_versicolor)
C2_95

array([[35,  0],
       [15,  0]], dtype=int64)

<h6>Caso Virginica</h6>

In [149]:
y_pred_virginica = np.where(log_reg2.predict_proba(x_test_virginica)[:,1] > 0.95, 2, 0)
C3_95 = confusion_matrix(y_test_virginica, y_pred_virginica)
C3_95

array([[34,  0],
       [11,  5]], dtype=int64)

$$P_1 = 1  , P_2 = 1  , P_3 = 1$$

$$R_1 = 0.95  ,  R_2 = 0.7  , R_3 = 0.755 $$

Aumenta la precisión y disminuye el Recall

<h2>¿ Cómo varian las matrices de confusión si aplicamos Normalizaciones ? </h2>

<h3>1er Caso: StandardScaler</h3>

In [125]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
x_train_virginica, x_test_virginica, y_train_virginica, y_test_virginica = train_test_split(x, y_virginica, test_size=0.33, random_state=42)
log_reg3 = LogisticRegression()

scaler = StandardScaler()
X_train_virginica = scaler.fit_transform(x_train_virginica)

log_reg3.fit(X_train_virginica,y_train_virginica)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [124]:
Y_pred_virginica = np.where(log_reg3.predict_proba(x_test_virginica)[:,1] > 0.5, 2, 0)
C4 = confusion_matrix(y_test_virginica, Y_pred_virginica)
C4

array([[ 6, 28],
       [ 0, 16]], dtype=int64)

In [126]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
x_train_setosa, x_test_setosa, y_train_setosa, y_test_setosa = train_test_split(x, y_setosa, test_size=0.33, random_state=42)
log_reg4 = LogisticRegression()

scaler = StandardScaler()
X_train_setosa = scaler.fit_transform(x_train_setosa)

log_reg4.fit(X_train_setosa,y_train_setosa)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [128]:
Y_pred_setosa = np.where(log_reg4.predict_proba(x_test_setosa)[:,1] > 0.5, 1, 0)
C5 = confusion_matrix(y_test_setosa, Y_pred_setosa)
C5

array([[ 0, 19],
       [ 0, 31]], dtype=int64)

<h3> 2do Caso: MinMaxScaler </h3>

In [130]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
x_train_virginica, x_test_virginica, y_train_virginica, y_test_virginica = train_test_split(x, y_virginica, test_size=0.33, random_state=42)
log_reg5 = LogisticRegression()

scaler = MinMaxScaler()
X_train_virginica = scaler.fit_transform(x_train_virginica)

log_reg5.fit(X_train_virginica,y_train_virginica)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [131]:
Y_pred_virginica = np.where(log_reg5.predict_proba(x_test_virginica)[:,1] > 0.5, 2, 0)
C6 = confusion_matrix(y_test_virginica, Y_pred_virginica)
C6

array([[ 0, 34],
       [ 0, 16]], dtype=int64)

In [132]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
x_train_setosa, x_test_setosa, y_train_setosa, y_test_setosa = train_test_split(x, y_setosa, test_size=0.33, random_state=42)
log_reg6 = LogisticRegression()

scaler = MinMaxScaler()
X_train_setosa = scaler.fit_transform(x_train_setosa)

log_reg6.fit(X_train_setosa,y_train_setosa)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [133]:
Y_pred_setosa = np.where(log_reg6.predict_proba(x_test_setosa)[:,1] > 0.5, 1, 0)
C7 = confusion_matrix(y_test_setosa, Y_pred_setosa)
C7

array([[ 0, 19],
       [ 0, 31]], dtype=int64)