In [1]:
# These two commands avoid having to reload manually every time a package is modified
%load_ext autoreload
%autoreload 2
# Use the 'retina' format for inline figures.
%config InlineBackend.figure_format = 'retina'

Anotaciones:
Desde el lado de la Estadística querríamos ver qué tan bien se aproximan los datos a una gaussiana. Desde ML, qué tan bien generaliza el modelo.

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
from collections import Counter
from mpl_toolkits.mplot3d import Axes3D

In [4]:
# Training split; later cells read its 'Genero', 'Peso' and 'Altura' columns.
data = pd.read_csv('data/alturas-pesos-mils-train.csv')

In [5]:
# Second split with the same columns; used below to score the fitted models.
test = pd.read_csv('data/alturas-pesos-mils-test.csv')

In [6]:
from BiVariateJoint import BiVariateJoint

In [7]:
# Per-gender (Peso, Altura) samples as NumPy arrays; rows and columns are
# selected in a single .loc call.
data_hombres = data.loc[data['Genero'] == 'Hombre', ['Peso', 'Altura']].values
data_mujeres = data.loc[data['Genero'] == 'Mujer', ['Peso', 'Altura']].values

# Bayes Gaussiano (Quadratic Discriminant Analysis)

In [9]:
# (Peso, Altura) matrix per gender, under the names the Gaussian models read.
peso_altura_hombres = data.loc[data['Genero'] == 'Hombre', ['Peso', 'Altura']].values
peso_altura_mujeres = data.loc[data['Genero'] == 'Mujer', ['Peso', 'Altura']].values

In [10]:
# Class-conditional mean vectors, ordered (Peso, Altura) like the columns above.
mean_hombres = np.mean(peso_altura_hombres, axis=0)
mean_mujeres = np.mean(peso_altura_mujeres, axis=0)

In [11]:
# Full sample covariance per class; rowvar=False treats each column as a variable,
# which is what transposing the data before np.cov achieved.
cov_hombres = np.cov(peso_altura_hombres, rowvar=False)
cov_mujeres = np.cov(peso_altura_mujeres, rowvar=False)

In [12]:
from scipy.stats import multivariate_normal

In [13]:
def get_gauss_prob(data, mean_hombres, mean_mujeres, cov_hombres, cov_mujeres, priors=None):
    """Posterior probabilities of the two classes under Gaussian class-conditional densities.

    Applies Bayes' rule with one multivariate normal per class. The computation
    is carried out in log space so that points far from both means, where both
    densities underflow to 0.0, still get finite posteriors instead of 0/0 = NaN.

    Parameters
    ----------
    data : array-like
        Points to evaluate; passed unchanged to ``multivariate_normal.logpdf``.
    mean_hombres, cov_hombres : array-like
        Mean vector and covariance matrix of class 1.
    mean_mujeres, cov_mujeres : array-like
        Mean vector and covariance matrix of class 2.
    priors : pair of float, optional
        ``(prior_1, prior_2)``. When omitted, the priors are the empirical class
        frequencies taken from the notebook-level arrays ``peso_altura_hombres``
        and ``peso_altura_mujeres`` (the original behaviour).

    Returns
    -------
    (p_class_1, p_class_2)
        Posterior probability of each class for every evaluated point.
    """
    if priors is None:
        # Backward-compatible default: class frequencies of the training arrays.
        n_class_1 = len(peso_altura_hombres)
        n_class_2 = len(peso_altura_mujeres)
        prior_1 = n_class_1 / (n_class_1 + n_class_2)
        prior_2 = n_class_2 / (n_class_1 + n_class_2)
    else:
        prior_1, prior_2 = priors

    # log(likelihood * prior) for each class.
    log_joint_1 = multivariate_normal.logpdf(data, mean_hombres, cov_hombres) + np.log(prior_1)
    log_joint_2 = multivariate_normal.logpdf(data, mean_mujeres, cov_mujeres) + np.log(prior_2)

    # Log-sum-exp with the larger term factored out, so at least one exponent is 0.
    peak = np.maximum(log_joint_1, log_joint_2)
    log_total = peak + np.log(np.exp(log_joint_1 - peak) + np.exp(log_joint_2 - peak))

    p_class_1 = np.exp(log_joint_1 - log_total)
    p_class_2 = np.exp(log_joint_2 - log_total)
    return p_class_1, p_class_2

In [14]:
def get_acc_gauss(data, mean_hombres, mean_mujeres, cov_hombres, cov_mujeres):
    """Fraction of rows in ``data`` classified correctly by the Gaussian Bayes rule.

    ``data`` must expose the 'Peso', 'Altura' and 'Genero' columns. A row is
    predicted as 'Hombre' when the class-1 posterior returned by
    ``get_gauss_prob`` exceeds the class-2 posterior.
    """
    features = data[['Peso', 'Altura']].values
    prob_hombre, prob_mujer = get_gauss_prob(
        features, mean_hombres, mean_mujeres, cov_hombres, cov_mujeres
    )
    predicho_hombre = prob_hombre > prob_mujer
    es_hombre = data['Genero'] == 'Hombre'
    aciertos = predicho_hombre == es_hombre
    return aciertos.sum() / len(prob_hombre)

In [15]:
# Timed accuracy of the full-covariance Gaussian classifier on the training split.
%time get_acc_gauss(data, mean_hombres, mean_mujeres, cov_hombres, cov_mujeres)

CPU times: user 10.1 ms, sys: 3.29 ms, total: 13.4 ms
Wall time: 15.4 ms


0.91675

In [16]:
# Same classifier (parameters estimated on `data`) scored on the `test` split.
print(get_acc_gauss(test, mean_hombres, mean_mujeres, cov_hombres, cov_mujeres))

0.9225


In [24]:
# Regular N x N grid covering the observed Peso/Altura range of the training data.
N = 300
# Reduce only the two numeric columns that are needed: DataFrame.min()/max()
# would also reduce the string column 'Genero' on every call.
X = np.linspace(data['Peso'].min(), data['Peso'].max(), N)
Y = np.linspace(data['Altura'].min(), data['Altura'].max(), N)
X, Y = np.meshgrid(X, Y)
# Pack X and Y into a single 3-dimensional array
pos = np.empty(X.shape + (2,))
pos[:, :, 0] = X
pos[:, :, 1] = Y

In [25]:
# Class-conditional densities on the grid, using each class's full covariance.
Z_H = multivariate_normal.pdf(pos, mean_hombres, cov_hombres)
Z_M = multivariate_normal.pdf(pos, mean_mujeres, cov_mujeres)
# Class-1 ('Hombre') posterior on the grid; the class-2 surface is discarded.
Z_gaus, _ = get_gauss_prob(pos, mean_hombres, mean_mujeres, cov_hombres, cov_mujeres)

In [32]:
# Use the Qt backend for this figure instead of inline rendering.
%matplotlib qt
fig = plt.figure(figsize=(20,10))
ax = fig.gca()
cm = plt.cm.RdBu
# Filled map of the class-1 ('Hombre') posterior, colour scale fixed to [0, 1].
cf = ax.contourf(X, Y, Z_gaus, 256, alpha=.8, vmin=0., vmax=1., cmap=cm)
plt.colorbar(cf, ax=ax)
# Density levels; only referenced by the commented-out contour calls below.
z_levels = np.logspace(-5,-2,10)/4
#ax.contour(X, Y, Z_H, z_levels)
#ax.contour(X, Y, Z_M, z_levels)
# Decision boundary: the posterior = 0.5 level set, drawn in black.
ax.contour(X, Y, Z_gaus, (0.5,), colors='k', linewidths=1)
ax.set_ylabel('Alturas [cms]')
ax.set_xlabel('Pesos [kgs]')
plt.show()

# LDA

In [22]:
# LDA keeps one mean per class but a single covariance shared by both classes.
mean_hombres = np.mean(peso_altura_hombres, axis=0)
mean_mujeres = np.mean(peso_altura_mujeres, axis=0)
# Centre each class on its own mean, stack the residuals row-wise and take
# the covariance of the pooled residuals (columns are the variables).
cov_LDA = np.cov(
    np.concatenate(
        [peso_altura_hombres - mean_hombres, peso_altura_mujeres - mean_mujeres],
        axis=0,
    ),
    rowvar=False,
)

In [27]:
# Class-conditional densities on the grid with the shared LDA covariance.
Z_H_LDA = multivariate_normal.pdf(pos, mean_hombres, cov_LDA)
Z_M_LDA = multivariate_normal.pdf(pos, mean_mujeres, cov_LDA)
# Class-1 ('Hombre') posterior when both classes use cov_LDA.
Z_LDA, _ = get_gauss_prob(pos, mean_hombres, mean_mujeres, cov_LDA, cov_LDA)

In [38]:
# 2x2 grid of axes; NOTE(review): nothing is drawn on this figure in the visible
# cells, and the comparison cell further down creates its own subplots.
f, ax = plt.subplots(2,2, figsize=(20,10))

In [42]:
# Flatten the axes array so the panels can be indexed as ax[0]..ax[3].
ax = ax.reshape(-1)

In [43]:
# Display the axes array to inspect its contents.
ax

array([<matplotlib.axes._subplots.AxesSubplot object at 0x1141f2d68>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x113ac9438>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x1a2740fac8>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x1a27428198>],
      dtype=object)

# Naive Bayes Gaussiano

In [53]:
# Naive Bayes covariances: the element-wise product with the 2x2 identity keeps
# the per-feature variances and zeroes the Peso-Altura cross term.
cov_hombres_nb = np.cov(peso_altura_hombres, rowvar=False) * np.eye(2)
cov_mujeres_nb = np.cov(peso_altura_mujeres, rowvar=False) * np.eye(2)

In [54]:
def get_prob_naive(data):
    """Posterior class probabilities under the Gaussian Naive Bayes model.

    Same Bayes rule as ``get_gauss_prob``, but each class covariance is
    multiplied element-wise by the 2x2 identity, which zeroes the off-diagonal
    term so the two features are treated as independent within a class.

    Reads the notebook-level ``mean_hombres``, ``mean_mujeres``, ``cov_hombres``
    and ``cov_mujeres`` at call time, and delegates the posterior computation
    to ``get_gauss_prob`` instead of duplicating it.

    Parameters
    ----------
    data : array-like
        Points to evaluate, forwarded unchanged to ``get_gauss_prob``.

    Returns
    -------
    (p_class_1, p_class_2)
        Posterior probability of each class for every evaluated point.
    """
    return get_gauss_prob(
        data,
        mean_hombres,
        mean_mujeres,
        cov_hombres*np.identity(2),
        cov_mujeres*np.identity(2),
    )

In [55]:
# Class-conditional densities on the grid with the off-diagonal covariance zeroed.
Z_H_nb = multivariate_normal.pdf(pos, mean_hombres, cov_hombres*np.identity(2))
Z_M_nb = multivariate_normal.pdf(pos, mean_mujeres, cov_mujeres*np.identity(2))
# Class-1 ('Hombre') posterior of the Naive Bayes model on the grid.
Z_nb, _ = get_prob_naive(pos)

In [58]:
def get_acc_gauss_naive(data):
    """Fraction of rows in ``data`` classified correctly by Gaussian Naive Bayes.

    ``data`` must expose the 'Peso', 'Altura' and 'Genero' columns. A row is
    predicted as 'Hombre' when the class-1 posterior from ``get_prob_naive``
    exceeds the class-2 posterior.
    """
    prob_hombre, prob_mujer = get_prob_naive(data[['Peso', 'Altura']].values)
    predicho_hombre = prob_hombre > prob_mujer
    es_hombre = data['Genero'] == 'Hombre'
    aciertos = predicho_hombre == es_hombre
    return aciertos.sum() / len(prob_hombre)

In [59]:
# Naive Bayes accuracy on the training split.
get_acc_gauss_naive(data)

0.887

In [60]:
# Naive Bayes accuracy on the `test` split (parameters still come from `data`).
get_acc_gauss_naive(test)

0.887

# Regresión Logística

In [62]:
from sklearn.linear_model import LogisticRegression

In [63]:
# Logistic regression with the library's default hyperparameters.
log_Reg = LogisticRegression()

In [64]:
# Fit on the raw Peso/Altura features, with the 'Genero' strings as labels.
log_Reg.fit(data[['Peso', 'Altura']].values, data['Genero'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [65]:
# Accuracy on the training split.
log_Reg.score(data[['Peso', 'Altura']].values, data['Genero'])

0.916625

In [66]:
# Accuracy on the `test` split.
log_Reg.score(test[['Peso', 'Altura']].values, test['Genero'])

0.922

In [70]:
# predict_proba returns one column per class, ordered as log_Reg.classes_.
# Pick the 'Hombre' column by label instead of assuming it is column 0, so the
# surface stays comparable with the other P(Hombre) maps if the labels change.
Z_lr = log_Reg.predict_proba(np.c_[X.ravel(), Y.ravel()])[:, list(log_Reg.classes_).index('Hombre')]
Z_lr = Z_lr.reshape(X.shape)

In [72]:
%matplotlib qt
alpha = 0.5
f, ax = plt.subplots(2,2, figsize=(20,10))
ax = ax.reshape(-1)
cm = plt.cm.RdBu
ax[0].contourf(X, Y, Z_gaus, 256, alpha=.8, vmin=0., vmax=1., cmap=cm)
ax[0].contour(X, Y, Z_gaus, (0.5,), colors='k', linewidths=1)
ax[1].contourf(X, Y, Z_LDA, 256, alpha=.8, vmin=0., vmax=1., cmap=cm)
ax[1].contour(X, Y, Z_LDA, (0.5,), colors='k', linewidths=1)
ax[2].contourf(X, Y, Z_nb, 256, alpha=.8, vmin=0., vmax=1., cmap=cm)
ax[2].contour(X, Y, Z_nb, (0.5,), colors='k', linewidths=1)
ax[3].contourf(X, Y, Z_lr, 256, alpha=.8, vmin=0., vmax=1., cmap=cm)
ax[3].contour(X, Y, Z_lr, (0.5,), colors='k', linewidths=1)

plt.colorbar(cf, ax=ax)
z_levels = np.logspace(-5,-2,10)/4
for a in ax:
    a.scatter(data_hombres[:,0], data_hombres[:,1], color='b', s=2, alpha=alpha)
    a.scatter(data_mujeres[:,0], data_mujeres[:,1], color='r', s=2, alpha=alpha)
plt.show()

In [92]:
%matplotlib qt
fig = plt.figure(figsize=(20,10))
ax = fig.gca()
cm = plt.cm.RdBu
# cf = ax.contourf(X, Y, Z_lr, 256, alpha=.5, vmin=0., vmax=1., cmap=cm)
cs1 = ax.contour(X, Y, Z_lr, (0.5,), colors='b', linewidths=1)
cs2 = ax.contour(X, Y, Z_gaus, (0.5,), colors='g', linewidths=1)
cs3 = ax.contour(X, Y, Z_nb, (0.5,), colors='r', linewidths=1)
cs4 = ax.contour(X, Y, Z_LDA, (0.5,), colors='k', linewidths=1)
#ax.contour(X, Y, Z_H, z_levels, linewidths=0.5)
#ax.contour(X, Y, Z_M, z_levels, linewidths=0.5)
plt.scatter(data_hombres[:,0], data_hombres[:,1], color='b', s=2, alpha=0.5)
plt.scatter(data_mujeres[:,0], data_mujeres[:,1], color='r', s=2, alpha=0.5)
ax.set_ylabel('Alturas [cms]')
ax.set_xlabel('Pesos [kgs]')
cs1.collections[0].set_label("Regresión Logística")
cs2.collections[0].set_label("Bayes Gaussiano")
cs3.collections[0].set_label("Naive Bayes Gaussiano")
cs4.collections[0].set_label("LDA")
plt.legend()
plt.show()

|Modelo|Tipo|Train Acc|CV Acc|Comentarios|
|-| -| -| -|-|
|Histogram - step 0.25|Bayes|0.97|0.68
|Histogram - step 0.25|Naive Bayes|0.89|0.88
|Histogram - step 0.5|Bayes|0.94|0.82
|Histogram - step 0.5|Naive Bayes|0.89|0.88
|Histogram - step 1|Bayes|0.92|0.90
|Histogram - step 1|Naive Bayes|0.89|0.88
|Histogram - step 4|Bayes|0.91|0.92| No generaliza por afuera de la zona de los puntos
|Histogram - step 4|Naive Bayes|0.89|0.88
|Histogram - step 8|Bayes|0.90|0.90
|Histogram - step 8|Naive Bayes|0.87|0.86
|Gaussian|Bayes|0.92|0.92|
|Gaussian|Naive Bayes|0.89|0.89|
|Regresión Logística|-|0.92|0.92|