In [1]:
# These two magics make IPython reload modified modules automatically, so a
# manual reload is not needed every time a package is edited.
%load_ext autoreload
%autoreload 2
# Ask the inline backend to render figures in 'retina' (high-DPI) format.
%config InlineBackend.figure_format = 'retina'

Anotaciones:
Desde el lado de Estadística querríamos ver qué tan bien se aproxima a una gaussiana. Desde el lado de ML, qué tan bien generaliza.

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
from collections import Counter
from mpl_toolkits.mplot3d import Axes3D

In [3]:
# Training set: only the first 200 rows of the CSV are kept.
data = pd.read_csv('data/alturas-pesos-mils-train.csv').head(200)

In [4]:
# Test set (scored further down), loaded in full with no row limit.
test = pd.read_csv('data/alturas-pesos-mils-test.csv')

In [5]:
from BiVariateJoint import BiVariateJoint

In [6]:
# Split the training rows by 'Genero' and keep the ('Peso', 'Altura') columns
# as plain NumPy arrays: column 0 is 'Peso', column 1 is 'Altura'.
data_hombres = data.loc[data['Genero'] == 'Hombre', ['Peso', 'Altura']].values
data_mujeres = data.loc[data['Genero'] == 'Mujer', ['Peso', 'Altura']].values

# Grafico los datos

In [7]:
%matplotlib qt
f = plt.figure(figsize=(20,10))
plt.scatter(data_hombres[:,0], data_hombres[:,1], color='b', s=2, label='Hombres')
plt.scatter(data_mujeres[:,0], data_mujeres[:,1], color='r', s=2, label='Mujeres')
plt.xlabel('Pesos [cms]')
plt.ylabel('Alturas [cms]')
plt.legend()
plt.show()

# LDA

### Log-likelihood

\begin{equation}
\large
\log\left(\frac{\pi_1}{\pi_0}\right) + \frac{1}{2}(x-\mu_0)^T\Sigma_0^{-1}(x-\mu_0) - \frac{1}{2}(x-\mu_1)^T\Sigma_1^{-1}(x-\mu_1) + \frac{1}{2}\ln|\Sigma_0| - \frac{1}{2}\ln|\Sigma_1| = 0
\end{equation}

\begin{equation}
\large
\log\left(\frac{\pi_1}{\pi_0}\right) + \frac{1}{2}(x-\mu_0)^T\Sigma^{-1}(x-\mu_0) - \frac{1}{2}(x-\mu_1)^T\Sigma^{-1}(x-\mu_1) = 0
\end{equation}

\begin{equation}
\large
\log\left(\frac{\pi_1}{\pi_0}\right) - \frac{1}{2}(\mu_1+\mu_0)^T\Sigma^{-1}(\mu_1-\mu_0) + x^T\Sigma^{-1}(\mu_1-\mu_0) = 0
\end{equation}

Suponiendo $\Sigma_0=\Sigma_1$, la frontera de decisión es lineal:
\begin{equation}
W^T x = c
\end{equation}

\begin{equation}
W = \Sigma^{-1}(\mu_1-\mu_0)
\end{equation}

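\begin{equation}
c = \frac{1}{2}(T - \mu_0^T \Sigma^{-1}\mu_0 + \mu_1^T \Sigma^{-1}\mu_1)
\end{equation}

donde $T = -2\log\left(\frac{\pi_1}{\pi_0}\right)$ es el umbral determinado por las probabilidades a priori.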

# Regresión Logística

In [8]:
from sklearn.linear_model import LogisticRegression

In [9]:
# Logistic-regression classifier built with the library's default settings
# (no arguments are passed).
log_Reg = LogisticRegression()

In [10]:
# Fit on the training subset: features are the ('Peso', 'Altura') columns as a
# NumPy array, target is the 'Genero' label column.
log_Reg.fit(data[['Peso', 'Altura']].values, data['Genero'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [11]:
# Preview the first 10 rows of the two feature columns.
data[['Peso', 'Altura']].head(10)

Unnamed: 0,Peso,Altura
0,61.235,162.402
1,97.432,181.908
2,73.324,172.459
3,55.193,157.748
4,56.886,151.798
5,59.834,158.623
6,77.618,172.705
7,68.889,169.225
8,53.874,155.487
9,83.248,177.908


In [12]:
# Score of the fitted model on the same training subset it was fitted on.
log_Reg.score(data[['Peso', 'Altura']].values, data['Genero'])

0.905

In [13]:
# Score of the fitted model on the test set, which was not used for fitting.
log_Reg.score(test[['Peso', 'Altura']].values, test['Genero'])

0.9225

In [14]:
# Predicted class probabilities for every test row: one row per sample, one
# column per class (presumably in log_Reg.classes_ order; verify).
log_Reg.predict_proba(test[['Peso', 'Altura']].values)

array([[0.99173494, 0.00826506],
       [0.02533864, 0.97466136],
       [0.08543259, 0.91456741],
       ...,
       [0.00490791, 0.99509209],
       [0.9466478 , 0.0533522 ],
       [0.03147486, 0.96852514]])

In [15]:
%matplotlib qt
N = 300
X = np.linspace(data.min()['Peso'], data.max()['Peso'], N)
Y = np.linspace(data.min()['Altura'], data.max()['Altura'], N)
X, Y = np.meshgrid(X, Y)

Z_lr = log_Reg.predict_proba(np.c_[X.ravel(), Y.ravel()])[:, 0]

# Put the result into a color plot
Z_lr = Z_lr.reshape(X.shape)

fig = plt.figure(figsize=(20,10))
ax = fig.gca()
cm = plt.cm.RdBu
cf = ax.contourf(X, Y, Z_lr, 256, alpha=.5, vmin=0., vmax=1., cmap=cm)
ax.contour(X, Y, Z_lr, (0.5,), colors='k', linewidths=0.5)
plt.scatter(data_hombres[:,0], data_hombres[:,1], color='b', s=2, alpha=0.5)
plt.scatter(data_mujeres[:,0], data_mujeres[:,1], color='r', s=2, alpha=0.5)
ax.set_ylabel('Alturas [cms]')
ax.set_xlabel('Pesos [kgs]')
plt.colorbar(cf, ax=ax)
plt.title('Regresión Logística')
plt.show()

|Modelo|Tipo|Train Acc|CV Acc|Comentarios|
|-|-|-|-|-|
|Histogram - step 0.25|Bayes|0.97|0.68||
|Histogram - step 0.25|Naive Bayes|0.89|0.88||
|Histogram - step 0.5|Bayes|0.94|0.82||
|Histogram - step 0.5|Naive Bayes|0.89|0.88||
|Histogram - step 1|Bayes|0.92|0.90||
|Histogram - step 1|Naive Bayes|0.89|0.88||
|Histogram - step 4|Bayes|0.91|0.92|No generaliza por afuera de la zona de los puntos|
|Histogram - step 4|Naive Bayes|0.89|0.88||
|Histogram - step 8|Bayes|0.90|0.90||
|Histogram - step 8|Naive Bayes|0.87|0.86||
|Gaussian|Bayes|0.92|0.92||
|Gaussian|Naive Bayes|0.89|0.89||
|Regresión Logística|-|0.92|0.92||