In [2]:
!pip install faraway

Collecting faraway
  Obtaining dependency information for faraway from https://files.pythonhosted.org/packages/d6/d8/c04eb2ae7a628fd4d0a4557903dc1b903769a6e1d05c8af8e290b5181849/faraway-0.0.6-py3-none-any.whl.metadata
  Downloading faraway-0.0.6-py3-none-any.whl.metadata (2.9 kB)
Downloading faraway-0.0.6-py3-none-any.whl (224 kB)
   ---------------------------------------- 0.0/224.5 kB ? eta -:--:--
   ---------------------------------------- 224.5/224.5 kB 6.7 MB/s eta 0:00:00
Installing collected packages: faraway
Successfully installed faraway-0.0.6


In [3]:
# Módulos básicos para análisis y manipulación de datos
import numpy as np
import pandas as pd

# Modelos de regresión y clasificación
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier

# Preprocesamiento de datos
from sklearn.preprocessing import MinMaxScaler

# Módulos para evaluación de modelos
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Bases de datos del Lab
import faraway.datasets.divusa as divusa

In [4]:
data = divusa.load() # paquete faraway
data.head()

Unnamed: 0,year,divorce,unemployed,femlab,marriage,birth,military
0,1920,8.0,5.2,22.7,92.0,117.9,3.2247
1,1921,7.2,11.7,22.79,83.0,119.8,3.5614
2,1922,6.6,6.7,22.88,79.7,111.2,2.4553
3,1923,7.1,2.4,22.97,85.2,110.5,2.2065
4,1924,7.2,5.0,23.06,80.3,110.9,2.2889


In [5]:
data.drop('year', axis=1, inplace=True)
data.head()

Unnamed: 0,divorce,unemployed,femlab,marriage,birth,military
0,8.0,5.2,22.7,92.0,117.9,3.2247
1,7.2,11.7,22.79,83.0,119.8,3.5614
2,6.6,6.7,22.88,79.7,111.2,2.4553
3,7.1,2.4,22.97,85.2,110.5,2.2065
4,7.2,5.0,23.06,80.3,110.9,2.2889


In [6]:
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [9]:
train.head()

Unnamed: 0,divorce,unemployed,femlab,marriage,birth,military
9,8.0,3.2,23.51,75.5,89.3,2.0944
5,7.2,3.2,23.15,79.2,106.6,2.1735
34,9.5,5.5,34.6,79.8,118.1,20.3343
22,10.1,4.7,30.9,93.0,91.5,28.6133
30,10.3,5.3,33.9,90.2,106.2,9.627


In [10]:
mse_train = np.zeros(3) # creo un vector de ceros 
mse_test = np.zeros(3)
X = train.iloc[:, 1:]
y = train.iloc[:, 0]

# Regresión Lineal

In [11]:
lm_model = LinearRegression(fit_intercept=True)
lm_model.fit(X, y)

In [12]:
y_pred_train = lm_model.predict(X)
y_pred_test = lm_model.predict(test.iloc[:, 1:])

In [13]:
mse_train[0] = metrics.mean_squared_error(y, y_pred_train)
mse_test[0] = metrics.mean_squared_error(test.iloc[:, 0], y_pred_test)

In [14]:
print(mse_train)
print(mse_test)

[2.53247962 0.         0.        ]
[2.59041398 0.         0.        ]


# KNN

In [16]:
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X) # Escalado para el set de train
X_test_scaled = scaler.transform(test.iloc[:, 1:]) # Escalado para el set del test

In [17]:
knn_model = KNeighborsRegressor(n_neighbors=10, metric='euclidean') 
knn_model.fit(X_scaled, y)

In [18]:
y_pred_train = knn_model.predict(X_scaled)
y_pred_test = knn_model.predict(X_test_scaled)

In [19]:
mse_train[1] = metrics.mean_squared_error(y, y_pred_train)
mse_test[1] = metrics.mean_squared_error(test.iloc[:, 0], y_pred_test)

In [20]:
print(mse_train)
print(mse_test)

[2.53247962 1.67057541 0.        ]
[2.59041398 0.56124375 0.        ]


# DT

In [21]:

dt_model = DecisionTreeRegressor(max_depth=4, # máximo de profundidd, 
                                 min_samples_split=20, # mínimo de las muestras en la división
                                 random_state=123)
dt_model.fit(X, y)

In [22]:
y_pred_train = dt_model.predict(X)
y_pred_test = dt_model.predict(test.iloc[:, 1:])

In [24]:
mse_train[2] = metrics.mean_squared_error(y, y_pred_train)
mse_test[2] = metrics.mean_squared_error(test.iloc[:, 0], y_pred_test)

In [25]:
print(mse_train)
print(mse_test)

[2.53247962 1.67057541 0.91575339]
[2.59041398 0.56124375 0.87215844]


# CV

In [26]:
mse_cv_mean = np.zeros(3)
mse_cv_std = np.zeros(3)

LM

In [29]:
model = LinearRegression(fit_intercept=True)
scores = cross_val_score(model, X, y, 
                         cv = 10, # número de estimaciones en la validación cruzada y número de cajas de división
                         scoring = metrics.make_scorer(metrics.mean_squared_error) # métrica de evaluación
                         )
mse_cv_mean[0] = scores.mean()
mse_cv_std[0] = scores.std()

In [28]:
print(scores)

[1.48910859 1.29539762 4.22665016 2.46952385 1.33811311 4.53966091
 7.43667788 2.27687076 4.18405806 2.67282593]


In [30]:
print(mse_cv_mean)
print(mse_cv_std)

[3.19288869 0.         0.        ]
[1.82896047 0.         0.        ]


# KNN

In [45]:
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(test.iloc[:, 1:])

In [None]:
k_cv = None
k_cv = np.zeros(15)
for k in range(1,16):
    model = KNeighborsRegressor(n_neighbors=k, metric='euclidean')
    scores = cross_val_score(model, X_scaled, y, 
                             scoring = metrics.make_scorer(metrics.mean_squared_error), 
                             cv=10)
    k_cv[k-1] = scores.mean()
K = k_cv.argmin(0) + 1

In [32]:
print(k_cv)

[1.32919048 1.30166071 1.36928307 1.43854167 1.55781619 1.61025595
 1.83155539 2.00285231 2.12997354 2.196555   2.50577706 2.7096045
 3.09450535 3.29648469 3.5331946 ]


In [33]:
print(K)

2


In [None]:
  model = KNeighborsRegressor(n_neighbors=K, metric='euclidean')
  scores = cross_val_score(model, X_scaled, y, scoring = metrics.make_scorer(metrics.mean_squared_error), 
                            cv=10)

In [36]:
mse_cv_mean[1] = scores.mean()
mse_cv_std[1] = scores.std()

In [37]:
print(mse_cv_mean)
print(mse_cv_std)

[3.19288869 1.30166071 0.        ]
[1.82896047 1.81220117 0.        ]


# DT

In [52]:
model = DecisionTreeRegressor(max_depth=4, min_samples_split=20, random_state=123)
scores = cross_val_score(model, X, y, scoring = metrics.make_scorer(metrics.mean_squared_error), cv=10)

In [53]:
mse_cv_mean[2] = scores.mean()
mse_cv_std[2] = scores.std()

In [54]:
print(mse_cv_mean)
print(mse_cv_std)

[3.19288869 1.30166071 4.22615998]
[1.82896047 1.81220117 2.44496131]


# DT importance

In [38]:
model = DecisionTreeRegressor(max_depth=4, min_samples_split=20, random_state=123)
model.fit(X, y)

In [40]:
model.feature_importances_

array([0.        , 0.88676646, 0.04836582, 0.01268331, 0.05218442])

In [41]:
X.columns

Index(['unemployed', 'femlab', 'marriage', 'birth', 'military'], dtype='object')

In [42]:
pd.DataFrame({'feature': X.columns, 'importance': model.feature_importances_})

Unnamed: 0,feature,importance
0,unemployed,0.0
1,femlab,0.886766
2,marriage,0.048366
3,birth,0.012683
4,military,0.052184
