In [1]:
# Basic modules for data analysis and manipulation
import numpy as np
import pandas as pd

# Regression and classification models
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier

# Data preprocessing
from sklearn.preprocessing import MinMaxScaler

# Modules for model evaluation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Lab datasets
import faraway.datasets.divusa as divusa

In [2]:
# Read the divusa dataset shipped with the `faraway` package and preview
# its first five rows.
data = divusa.load()
data.head(n=5)

Unnamed: 0,year,divorce,unemployed,femlab,marriage,birth,military
0,1920,8.0,5.2,22.7,92.0,117.9,3.2247
1,1921,7.2,11.7,22.79,83.0,119.8,3.5614
2,1922,6.6,6.7,22.88,79.7,111.2,2.4553
3,1923,7.1,2.4,22.97,85.2,110.5,2.2065
4,1924,7.2,5.0,23.06,80.3,110.9,2.2889


In [3]:
# Remove the "year" column so it is not used as a predictor.
# This cell rebinds `data`, so without errors="ignore" a second execution
# (without reloading the dataset first) would raise KeyError because "year"
# is already gone.  Ignoring the missing label keeps the cell idempotent.
data = data.drop(columns=["year"], errors="ignore")
data.head()

Unnamed: 0,divorce,unemployed,femlab,marriage,birth,military
0,8.0,5.2,22.7,92.0,117.9,3.2247
1,7.2,11.7,22.79,83.0,119.8,3.5614
2,6.6,6.7,22.88,79.7,111.2,2.4553
3,7.1,2.4,22.97,85.2,110.5,2.2065
4,7.2,5.0,23.06,80.3,110.9,2.2889


In [4]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split
# reproducible.
# NOTE(review): the rows are yearly observations and train_test_split shuffles
# them by default -- confirm a random (rather than chronological) split is
# what is intended here.
train, test = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)
train.head()

Unnamed: 0,divorce,unemployed,femlab,marriage,birth,military
9,8.0,3.2,23.51,75.5,89.3,2.0944
5,7.2,3.2,23.15,79.2,106.6,2.1735
34,9.5,5.5,34.6,79.8,118.1,20.3343
22,10.1,4.7,30.9,93.0,91.5,28.6133
30,10.3,5.3,33.9,90.2,106.2,9.627


In [5]:
# Result holders with one slot per model, filled in by the cells below:
# index 0 = linear regression, 1 = KNN, 2 = decision tree.
mse_train, mse_test = (np.zeros(3) for _ in range(2))

In [6]:
# The first column is the target; every remaining column is a predictor.
# The same positional rule is applied to both splits.
X_train, y_train = train.iloc[:, 1:], train.iloc[:, 0]
X_test, y_test = test.iloc[:, 1:], test.iloc[:, 0]

# Linear Regression

In [7]:
# Ordinary least squares with an intercept, fitted on the training split.
# `fit` returns the estimator itself, so construction and fitting are chained.
model_lg = LinearRegression(fit_intercept=True).fit(X_train, y_train)
model_lg

In [8]:
# In-sample (train) and out-of-sample (test) predictions of the linear model,
# evaluated in that order.
y_pred_train, y_pred_test = map(model_lg.predict, (X_train, X_test))

In [9]:
# Record the linear-regression MSE in slot 0 of the comparison vectors.
mse_train[0], mse_test[0] = (
    metrics.mean_squared_error(y_train, y_pred_train),
    metrics.mean_squared_error(y_test, y_pred_test),
)

print("MSE Train:", mse_train[0])
print("MSE Test:", mse_test[0])

MSE Train: 2.5324796175658646
MSE Test: 2.590413983258592


# KNN

In [10]:
# Learn the min-max scaling from the training predictors only, then apply
# that same mapping to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
# 10-nearest-neighbours regressor (Euclidean distance) trained on the scaled
# predictors.
model_knn = KNeighborsRegressor(
    n_neighbors=10,
    metric='euclidean',
).fit(X_train_scaled, y_train)
model_knn

In [15]:
# KNN predictions for the scaled train and test predictors.
y_pred_train, y_pred_test = (
    model_knn.predict(X_train_scaled),
    model_knn.predict(X_test_scaled),
)

In [16]:
# Record the KNN MSE in slot 1 of the comparison vectors.
mse_train[1] = metrics.mean_squared_error(y_true=y_train, y_pred=y_pred_train)
mse_test[1] = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred_test)

print("MSE Train:", mse_train[1])
print("MSE Test:", mse_test[1])

MSE Train: 1.6705754098360655
MSE Test: 0.5612437500000005


# Decision Tree

In [18]:
# Depth-limited regression tree fitted on the unscaled training predictors;
# random_state is fixed so the fitted tree is reproducible.
model_dtr = DecisionTreeRegressor(
    max_depth=4,
    min_samples_split=20,
    random_state=123,
).fit(X_train, y_train)
model_dtr

In [19]:
# Decision-tree predictions for the train split first, then the test split.
y_pred_train, y_pred_test = (
    model_dtr.predict(features) for features in (X_train, X_test)
)

In [20]:
# Record the decision-tree MSE in slot 2 of the comparison vectors.
mse_train[2], mse_test[2] = (
    metrics.mean_squared_error(y_train, y_pred_train),
    metrics.mean_squared_error(y_test, y_pred_test),
)

print("MSE Train:", mse_train[2])
print("MSE Test:", mse_test[2])

MSE Train: 0.9157533878089633
MSE Test: 0.8721584397225357


In [21]:
# Show the train MSEs and the test MSEs, one vector per line
# (order: linear regression, KNN, decision tree).
print(mse_train, mse_test, sep="\n")

[2.53247962 1.67057541 0.91575339]
[2.59041398 0.56124375 0.87215844]


# Cross Validation

In [22]:
# Cross-validation summaries, one slot per model
# (0 = linear regression, 1 = KNN, 2 = decision tree).
mse_cv_mean = np.zeros(3)
mse_cv_std = np.zeros_like(mse_cv_mean)

## Linear Regression

In [23]:
# Unfitted linear-regression estimator (with intercept) to be passed to
# cross_val_score in the next cell.

model_cv_lr = LinearRegression(fit_intercept=True)

In [24]:
# 10-fold cross-validation of the linear model on the training split.
# The scorer wraps mean_squared_error with make_scorer's defaults, so each
# fold score is the plain (positive) MSE.
scores = cross_val_score(
    estimator=model_cv_lr,
    X=X_train,
    y=y_train,
    scoring=metrics.make_scorer(metrics.mean_squared_error),
    cv=10,
)
scores

array([1.48910859, 1.29539762, 4.22665016, 2.46952385, 1.33811311,
       4.53966091, 7.43667788, 2.27687076, 4.18405806, 2.67282593])

In [26]:
# Summarise the fold MSEs of the linear model (slot 0): mean and
# population standard deviation across folds.
mse_cv_mean[0], mse_cv_std[0] = np.mean(scores), np.std(scores)

print("MSE CV Mean:", mse_cv_mean[0])
print("MSE CV Std:", mse_cv_std[0])

MSE CV Mean: 3.1928886872612665
MSE CV Std: 1.8289604662265955


## KNN

In [27]:
# Rebuild the min-max scaled predictors for the KNN cross-validation below:
# the scaler is fitted on the training predictors and applied to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

In [28]:
# Candidate neighbourhood sizes, defined once so the score vector and the
# loop bounds cannot drift out of sync.
k_values = range(1, 16)
k_cv = np.zeros(len(k_values))

# NOTE(review): X_train_scaled was min-max scaled using all of X_train, so
# every validation fold has already influenced the scaling.  Wrapping the
# scaler and the regressor in a Pipeline would remove this leakage -- confirm
# whether that matters for this comparison.

# Loop over the candidates and keep the mean 10-fold CV MSE for each k.
for i, k in enumerate(k_values):
    # Same distance metric as the final model, stated explicitly so the
    # selection step and the reported model are configured consistently.
    model_cv_knn = KNeighborsRegressor(n_neighbors=k, metric='euclidean')
    scores = cross_val_score(model_cv_knn,
                             X_train_scaled,
                             y_train,
                             cv=10,
                             scoring=metrics.make_scorer(
                                 metrics.mean_squared_error))
    k_cv[i] = scores.mean()

# The lowest mean MSE wins; argmin returns the first minimum, so ties resolve
# to the smallest k.
K = k_values[int(k_cv.argmin())]

In [29]:
# Mean CV MSE for each candidate k (position i corresponds to k = i + 1),
# followed by the k with the lowest mean MSE.
print(k_cv)
print("Best k:", K)

[1.32919048 1.30166071 1.36928307 1.43854167 1.55781619 1.61025595
 1.83155539 2.00285231 2.12997354 2.196555   2.50577706 2.7096045
 3.09450535 3.29648469 3.5331946 ]
Best k: 2


In [31]:
# Re-run the 10-fold CV at the selected K so that both the mean and the
# spread of the fold MSEs can be stored in slot 1.
model_cv_knn = KNeighborsRegressor(metric='euclidean', n_neighbors=K)

scores = cross_val_score(
    estimator=model_cv_knn,
    X=X_train_scaled,
    y=y_train,
    scoring=metrics.make_scorer(metrics.mean_squared_error),
    cv=10,
)

mse_cv_mean[1], mse_cv_std[1] = np.mean(scores), np.std(scores)

print(f"MSE CV Mean: {mse_cv_mean[1]}")
print(f"MSE CV Std: {mse_cv_std[1]}")

MSE CV Mean: 1.3016607142857137
MSE CV Std: 1.8122011727721212


## Decision Tree

In [32]:
# Use a dedicated estimator for cross-validation, named like model_cv_lr and
# model_cv_knn.  Previously this cell rebound `model_dtr`, replacing the tree
# fitted earlier in the notebook with an unfitted one as a side effect.
model_cv_dtr = DecisionTreeRegressor(max_depth=4,
                                     min_samples_split=20,
                                     random_state=123)

# 10-fold CV on the unscaled training predictors; each fold score is the
# plain (positive) MSE.
scores = cross_val_score(model_cv_dtr,
                         X_train,
                         y_train,
                         cv=10,
                         scoring=metrics.make_scorer(
                             metrics.mean_squared_error))

# Slot 2 of the CV summary vectors belongs to the decision tree.
mse_cv_mean[2] = scores.mean()
mse_cv_std[2] = scores.std()

print(f"MSE CV Mean: {mse_cv_mean[2]}")
print(f"MSE CV Std: {mse_cv_std[2]}")

MSE CV Mean: 4.226159984639493
MSE CV Std: 2.444961307265898


## Decision Tree Importance

In [33]:
# Fit the tree on the full training split so that its feature importances
# can be inspected in the following cells.
model_dtr = DecisionTreeRegressor(
    max_depth=4,
    min_samples_split=20,
    random_state=123,
).fit(X_train, y_train)
model_dtr

In [34]:
# Importance of each predictor in the fitted tree, one value per column of
# X_train (the cells below pair them with the column names).
model_dtr.feature_importances_

array([0.        , 0.88676646, 0.04836582, 0.01268331, 0.05218442])

In [35]:
# Predictor names, shown to map the importance positions back to features.
X_train.columns

Index(['unemployed', 'femlab', 'marriage', 'birth', 'military'], dtype='object')

In [36]:
# Pair each predictor name with its importance and list the most important
# predictors first.
pd.DataFrame(
    dict(
        Feature=X_train.columns,
        Importance=model_dtr.feature_importances_,
    )
).sort_values('Importance', ascending=False)

Unnamed: 0,Feature,Importance
1,femlab,0.886766
4,military,0.052184
2,marriage,0.048366
3,birth,0.012683
0,unemployed,0.0
