In [1]:
# Módulos básicos para análisis y manipulación de datos
import numpy as np
import pandas as pd

# Modelos de regresión y clasificación
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier

# Preprocesamiento de datos
from sklearn.preprocessing import MinMaxScaler

# Módulos para evaluación de modelos
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Bases de datos del Lab
import faraway.datasets.ozone as ozone
import faraway.datasets.prostate as prostate 

# Problem 1

In [2]:
ozone_data = ozone.load()

 ### 1.1

In [4]:
train = None
test = None

# split data test and train
train, test = train_test_split(ozone_data, test_size=0.3, random_state=111)

### 1.2

In [12]:
# Variables predictoras y variable respuesta
X_train = train.iloc[:, 1:]
y_train = train.iloc[:, 0]

X_test = test.iloc[:, 1:]
y_test = test.iloc[:, 0]

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(231, 9) (231,) (99, 9) (99,)


In [13]:
mse_train = np.zeros(3)
mse_test = np.zeros(3)

In [14]:
# Linear Regression
model_lr = LinearRegression(fit_intercept=True)
model_lr.fit(X_train, y_train)

# predict on train data
y_pred_train = model_lr.predict(X_train)

# predict on test data
y_pred_test = model_lr.predict(X_test)

# calculate MSE for train and test
mse_train[0] = metrics.mean_squared_error(y_train, y_pred_train)
mse_test[0] = metrics.mean_squared_error(y_test, y_pred_test)

print("MSE Train:", mse_train)
print("MSE Test:", mse_test)

MSE Train: [18.45860066  0.          0.        ]
MSE Test: [21.26808197  0.          0.        ]


In [15]:
# scale data
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# KNN Regression
model_knn = KNeighborsRegressor(n_neighbors=10, metric='euclidean')
model_knn.fit(X_train_scaled, y_train)

# predict on train data
y_pred_train = model_knn.predict(X_train_scaled)

# predict on test data
y_pred_test = model_knn.predict(X_test_scaled)

# calculate MSE for train and test
mse_train[1] = metrics.mean_squared_error(y_train, y_pred_train)
mse_test[1] = metrics.mean_squared_error(y_test, y_pred_test)

print("MSE Train:", mse_train)
print("MSE Test:", mse_test)

MSE Train: [18.45860066 14.19372294  0.        ]
MSE Test: [21.26808197 18.20272727  0.        ]


In [16]:
# Decision Tree Regression
model_dtr = DecisionTreeRegressor(max_depth=4, min_samples_split=20, random_state=123)
model_dtr.fit(X_train, y_train)

# predict on train data
y_pred_train = model_dtr.predict(X_train)

# predict on test data
y_pred_test = model_dtr.predict(X_test)

# calculate MSE for train and test
mse_train[2] = metrics.mean_squared_error(y_train, y_pred_train)
mse_test[2] = metrics.mean_squared_error(y_test, y_pred_test)

print("MSE Train:", mse_train)
print("MSE Test:", mse_test)

MSE Train: [18.45860066 14.19372294 11.91900354]
MSE Test: [21.26808197 18.20272727 21.75492747]


### 1.3

In [17]:
best_model_train = None
best_model_test = None
Interpretacion = None

best_model_train = 'DTree'
best_model_test = 'k-NN'
Interpretacion = "Como los modelos difieren se prefiere utilizar el modelo escogido con la muestra de validación ya que tiene menos posibilidad de caer en un sobreajuste."

## Cross Validation

### 1.4

In [18]:
# define vectors

mse_cv_mean = np.zeros(3)
mse_cv_std = np.zeros(3)

In [19]:
# Linear Regression

model_cv_lr = LinearRegression(fit_intercept=True)

# Score cross-validation

scores = cross_val_score(model_cv_lr,
                         X_train,
                         y_train,
                         cv=5,
                         scoring=metrics.make_scorer(
                             metrics.mean_squared_error))

# calculate mean and standard deviation of cross-validation scores
mse_cv_mean[0] = scores.mean()
mse_cv_std[0] = scores.std()

print("MSE CV Mean:", mse_cv_mean)
print("MSE CV Std:", mse_cv_std)

MSE CV Mean: [21.31410448  0.          0.        ]
MSE CV Std: [3.64662541 0.         0.        ]


In [20]:
# KNN Regression

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

k_cv = None
k_cv = np.zeros(15)

# bucle to find best k for KNN
for k in range(1, 16):
    model_cv_knn = KNeighborsRegressor(n_neighbors=k)
    scores = cross_val_score(model_cv_knn,
                             X_train_scaled,
                             y_train,
                             cv=5,
                             scoring=metrics.make_scorer(
                                 metrics.mean_squared_error))
    k_cv[k - 1] = scores.mean()

K = k_cv.argmin(0) + 1

print(k_cv)
print("Best k:", K)

model_cv_knn = KNeighborsRegressor(n_neighbors=K, metric='euclidean')

scores = cross_val_score(model_cv_knn,
                         X_train_scaled,
                         y_train,
                         cv=5,
                         scoring=metrics.make_scorer(
                             metrics.mean_squared_error))

mse_cv_mean[1] = scores.mean()
mse_cv_std[1] = scores.std()

print(f"MSE CV Mean: {mse_cv_mean}")
print(f"MSE CV Std: {mse_cv_std}")

[25.90888067 21.05844126 19.21257067 18.15708256 17.16918779 17.2481293
 17.34501123 17.12673884 17.55151609 17.07500925 17.23373216 17.60420457
 18.12715653 17.83182003 18.02906404]
Best k: 10
MSE CV Mean: [21.31410448 17.07500925  0.        ]
MSE CV Std: [3.64662541 2.32110945 0.        ]


In [21]:
# decision Tree

model_dtr = DecisionTreeRegressor(max_depth=4,
                                  min_samples_split=20,
                                  random_state=123)

scores = cross_val_score(model_dtr,
                         X_train,
                         y_train,
                         cv=5,
                         scoring=metrics.make_scorer(
                             metrics.mean_squared_error))

mse_cv_mean[2] = scores.mean()
mse_cv_std[2] = scores.std()

print(f"MSE CV Mean: {mse_cv_mean}")
print(f"MSE CV Std: {mse_cv_std}")

MSE CV Mean: [21.31410448 17.07500925 25.14949524]
MSE CV Std: [3.64662541 2.32110945 2.77881028]


### 1.5

In [23]:
best_model_cv = None
best_rmse_test = None

best_model_cv = 'k-NN'
best_rmse_test = mse_test[1]**0.5
best_rmse_test2 = np.sqrt(mse_cv_mean[1])

print(f"Best Model: {best_model_cv}")
print(f"Best RMSE Test: {best_rmse_test}")
print(f"Best RMSE Test 2: {best_rmse_test2}")

Best Model: k-NN
Best RMSE Test: 4.266465430860453
Best RMSE Test 2: 4.132191821623701


### 1.6

In [25]:
# decision Tree Importance

model_dtr = DecisionTreeRegressor(max_depth=4,
                                  min_samples_split=20,
                                  random_state=123)

model_dtr.fit(X_train, y_train)

model_dtr.feature_importances_

X_train.columns

feature_importances = pd.Series(model_dtr.feature_importances_, index=X_train.columns)
top5 = feature_importances.sort_values(ascending=False).head(5)
top5

temp    0.686486
ibt     0.133312
ibh     0.089194
dpg     0.057476
vis     0.018217
dtype: float64

In [37]:
list = ['temp', 'ibt', 'ibh', 'dpg', 'vis']

In [40]:
# p-values linear regression
X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)

model_lr_ols = sm.OLS(y_train, X_train).fit()
sigVars = model_lr_ols.pvalues < 0.05
top5_significance = sigVars[['temp', 'ibt', 'ibh', 'dpg', 'vis']]
top5_significance

temp     True
ibt     False
ibh     False
dpg     False
vis     False
dtype: bool

# Problem 2

In [41]:
prostate_data = prostate.load()

### 2.1

In [45]:
prostate_data[prostate_data['svi'] == 1]

Unnamed: 0,lcavol,lweight,age,lbph,svi,lcp,gleason,pgg45,lpsa
38,2.660959,4.0851,68,1.373716,1,1.83258,7,35,2.21375
46,2.727853,3.9954,79,1.879465,1,2.65676,9,100,2.56879
61,1.997418,3.7197,63,1.619388,1,1.90954,7,40,2.85359
63,2.034706,3.917,66,2.008214,1,2.11021,7,60,2.882
70,1.86408,3.5932,60,-1.386294,1,1.32176,7,60,3.01308
72,1.214913,3.8254,69,-1.386294,1,0.22314,7,20,3.05636
73,1.838961,3.2367,60,0.438255,1,1.17865,9,90,3.07501
74,2.999226,3.8491,69,-1.386294,1,1.90954,7,20,3.27526
75,3.141131,3.2638,68,-0.051293,1,2.42037,7,50,3.33755
78,2.6483,3.5821,69,-1.386294,1,2.584,7,70,3.45789


In [46]:
train2 = None
test2 = None

# split data
train2, test2 = train_test_split(prostate_data, test_size=0.3, random_state=12345, stratify=prostate_data['svi'])

In [47]:
train2

Unnamed: 0,lcavol,lweight,age,lbph,svi,lcp,gleason,pgg45,lpsa
5,-1.049822,3.2288,50,-1.386294,0,-1.38629,6,0,0.76547
81,2.513656,3.4735,57,0.438255,0,2.32728,7,60,3.53076
53,2.127041,4.1215,68,1.766442,0,1.44692,7,40,2.69124
8,-0.776529,3.5395,47,-1.386294,0,-1.38629,6,0,1.04732
60,0.457425,4.5245,73,2.326302,0,-1.38629,6,0,2.84200
...,...,...,...,...,...,...,...,...,...
74,2.999226,3.8491,69,-1.386294,1,1.90954,7,20,3.27526
17,2.288486,3.6494,66,-1.386294,0,0.37156,6,0,1.49290
59,1.061257,3.8512,61,1.294727,0,-1.38629,7,40,2.81241
65,1.458615,3.8362,61,1.321756,0,-0.43078,7,20,2.88759


In [49]:
print("Proporciones en todo el dataset:")
print(prostate_data['svi'].value_counts(normalize=True))

print("\nProporciones en train2:")
print(train2['svi'].value_counts(normalize=True))

print("\nProporciones en test2:")
print(test2['svi'].value_counts(normalize=True))

Proporciones en todo el dataset:
svi
0    0.783505
1    0.216495
Name: proportion, dtype: float64

Proporciones en train2:
svi
0    0.776119
1    0.223881
Name: proportion, dtype: float64

Proporciones en test2:
svi
0    0.8
1    0.2
Name: proportion, dtype: float64


### 2.2

In [50]:
X_train2 = train2.drop(columns=['svi'])
y_train2 = train2['svi']

X_test2 = test2.drop(columns=['svi'])
y_test2 = test2['svi']

In [52]:
performance_metrics = pd.DataFrame(columns=['accuracy', 'recall', 'precision', 'f1_score'], index=['logit', 'knn', 'dtree'])


In [53]:
accuracy_train2 = np.zeros(3)
accuracy_test2 = np.zeros(3)
recall_train2 = np.zeros(3)
recall_test2 = np.zeros(3)
precision_train2 = np.zeros(3)
precision_test2 = np.zeros(3)
f1_train2 = np.zeros(3)
f1_test2 = np.zeros(3)

In [58]:
# Logistic Regression

model_log_regression = LogisticRegression(solver='newton-cg',
                                          fit_intercept=True)

# Fit
model_log_regression.fit(X_train2, y_train2)

# Predict
y_pred_train2 = model_log_regression.predict(X_train2)
y_pred_test2 = model_log_regression.predict(X_test2)

# Metrics
# Accuracy
accuracy_train2[0] = metrics.accuracy_score(y_train2, y_pred_train2)
accuracy_test2[0] = metrics.accuracy_score(y_test2, y_pred_test2)
# Recall
recall_train2[0] = metrics.recall_score(y_train2, y_pred_train2)
recall_test2[0] = metrics.recall_score(y_test2, y_pred_test2)
# Precision
precision_train2[0] = metrics.precision_score(y_train2, y_pred_train2)
precision_test2[0] = metrics.precision_score(y_test2, y_pred_test2)
# F1
f1_train2[0] = metrics.f1_score(y_train2, y_pred_train2)
f1_test2[0] = metrics.f1_score(y_test2, y_pred_test2)

print('='*40, "TRAIN", '='*40)
print(f"Accuracy Train: {accuracy_train2}")
print(f"Recall Train: {recall_train2}")
print(f"Precision Train: {precision_train2}")
print(f"F1 Train: {f1_train2}")
print('='*40, "TEST", '='*40)
print(f"Accuracy Test: {accuracy_test2}")
print(f"Recall Test: {recall_test2}")
print(f"Precision Test: {precision_test2}")
print(f"F1 Test: {f1_test2}")

Accuracy Train: [0.89552239 0.         0.        ]
Recall Train: [0.73333333 0.         0.        ]
Precision Train: [0.78571429 0.         0.        ]
F1 Train: [0.75862069 0.         0.        ]
Accuracy Test: [0.9 0.  0. ]
Recall Test: [0.66666667 0.         0.        ]
Precision Test: [0.8 0.  0. ]
F1 Test: [0.72727273 0.         0.        ]


In [59]:

scaler = MinMaxScaler()
X_train2_scaled = scaler.fit_transform(X_train2)
X_test2_scaled = scaler.transform(X_test2)


# KNN Regression K = 10
model_knn_regression = KNeighborsClassifier(n_neighbors=10, metric='euclidean')

# Fit
model_knn_regression.fit(X_train2_scaled, y_train2)

# Predict
y_pred_train2 = model_knn_regression.predict(X_train2_scaled)
y_pred_test2 = model_knn_regression.predict(X_test2_scaled)

# Metrics
# Accuracy
accuracy_train2[1] = metrics.accuracy_score(y_train2, y_pred_train2)
accuracy_test2[1] = metrics.accuracy_score(y_test2, y_pred_test2)
# Recall
recall_train2[1] = metrics.recall_score(y_train2, y_pred_train2)
recall_test2[1] = metrics.recall_score(y_test2, y_pred_test2)
# Precision
precision_train2[1] = metrics.precision_score(y_train2, y_pred_train2)
precision_test2[1] = metrics.precision_score(y_test2, y_pred_test2)
# F1
f1_train2[1] = metrics.f1_score(y_train2, y_pred_train2)
f1_test2[1] = metrics.f1_score(y_test2, y_pred_test2)

print('='*40, "TRAIN", '='*40)
print(f"Accuracy Train: {accuracy_train2}")
print(f"Recall Train: {recall_train2}")
print(f"Precision Train: {precision_train2}")
print(f"F1 Train: {f1_train2}")
print('='*40, "TEST", '='*40)
print(f"Accuracy Test: {accuracy_test2}")
print(f"Recall Test: {recall_test2}")
print(f"Precision Test: {precision_test2}")
print(f"F1 Test: {f1_test2}")



Accuracy Train: [0.89552239 0.86567164 0.        ]
Recall Train: [0.73333333 0.53333333 0.        ]
Precision Train: [0.78571429 0.8        0.        ]
F1 Train: [0.75862069 0.64       0.        ]
Accuracy Test: [0.9        0.83333333 0.        ]
Recall Test: [0.66666667 0.5        0.        ]
Precision Test: [0.8 0.6 0. ]
F1 Test: [0.72727273 0.54545455 0.        ]


In [60]:
# Decision Tree Classifier

# model
model_dtc = DecisionTreeClassifier(max_depth=4,
                                   min_samples_split=10,
                                   random_state=123)

# Fit
model_dtc.fit(X_train2, y_train2)

# Predict
y_pred_train2 = model_dtc.predict(X_train2)
y_pred_test2 = model_dtc.predict(X_test2)

# Metrics
# Accuracy
accuracy_train2[2] = metrics.accuracy_score(y_train2, y_pred_train2)
accuracy_test2[2] = metrics.accuracy_score(y_test2, y_pred_test2)
# Recall
recall_train2[2] = metrics.recall_score(y_train2, y_pred_train2)
recall_test2[2] = metrics.recall_score(y_test2, y_pred_test2)
# Precision
precision_train2[2] = metrics.precision_score(y_train2, y_pred_train2)
precision_test2[2] = metrics.precision_score(y_test2, y_pred_test2)
# F1
f1_train2[2] = metrics.f1_score(y_train2, y_pred_train2)
f1_test2[2] = metrics.f1_score(y_test2, y_pred_test2)

print('='*40, "TRAIN", '='*40)
print(f"Accuracy Train: {accuracy_train2}")
print(f"Recall Train: {recall_train2}")
print(f"Precision Train: {precision_train2}")
print(f"F1 Train: {f1_train2}")
print('='*40, "TEST", '='*40)
print(f"Accuracy Test: {accuracy_test2}")
print(f"Recall Test: {recall_test2}")
print(f"Precision Test: {precision_test2}")
print(f"F1 Test: {f1_test2}")

Accuracy Train: [0.89552239 0.86567164 0.95522388]
Recall Train: [0.73333333 0.53333333 1.        ]
Precision Train: [0.78571429 0.8        0.83333333]
F1 Train: [0.75862069 0.64       0.90909091]
Accuracy Test: [0.9        0.83333333 0.76666667]
Recall Test: [0.66666667 0.5        0.5       ]
Precision Test: [0.8        0.6        0.42857143]
F1 Test: [0.72727273 0.54545455 0.46153846]


In [61]:
performance_metrics

Unnamed: 0,accuracy,recall,precision,f1_score
logit,,,,
knn,,,,
dtree,,,,


In [62]:
performance_metrics = pd.DataFrame(columns=['accuracy', 'recall', 'precision', 'f1_score'], index=['logit', 'knn', 'dtree'])

In [63]:
performance_metrics.loc['logit'] = [
    accuracy_test2[0],
    recall_test2[0],
    precision_test2[0],
    f1_test2[0]
]

performance_metrics.loc['knn'] = [
    accuracy_test2[1],
    recall_test2[1],
    precision_test2[1],
    f1_test2[1]
]

performance_metrics.loc['dtree'] = [
    accuracy_test2[2],
    recall_test2[2],
    precision_test2[2],
    f1_test2[2]
]

performance_metrics

Unnamed: 0,accuracy,recall,precision,f1_score
logit,0.9,0.666667,0.8,0.727273
knn,0.833333,0.5,0.6,0.545455
dtree,0.766667,0.5,0.428571,0.461538


In [74]:
performance_metrics.iloc[0,0]

np.float64(0.9)

### 2.3

In [64]:
mse_cv_mean = np.zeros(3)
mse_cv_std = np.zeros(3)

In [70]:
# Logistic Regression

model_log_regression = LogisticRegression(solver='newton-cg',
                                          fit_intercept=True)

# Score Cross Validation

scores_logit = cross_val_score(model_log_regression, X_train2, y_train2, cv=5, scoring=metrics.make_scorer(metrics.fbeta_score, beta=10, zero_division=0))

mse_cv_mean[0] = scores_logit.mean()
mse_cv_std[0] = scores_logit.std()

print("MSE CV Mean:", mse_cv_mean)
print("MSE CV Std:", mse_cv_std)

MSE CV Mean: [0.73377483 0.53399558 0.53355699]
MSE CV Std: [0.13311533 0.33997907 0.33980901]


In [71]:
# Knn
# scaler

scaler = MinMaxScaler()
X_train2_scaled = scaler.fit_transform(X_train2)
X_test2_scaled = scaler.transform(X_test2)

model_knn_classifier = KNeighborsClassifier(n_neighbors=10, metric='euclidean')

scores_knn = cross_val_score(model_knn_classifier, X_train2_scaled, y_train2, cv=5, scoring=metrics.make_scorer(metrics.fbeta_score, beta=10, zero_division=0))

mse_cv_mean[1] = scores_knn.mean()
mse_cv_std[1] = scores_knn.std()

print("MSE CV Mean:", mse_cv_mean)
print("MSE CV Std:", mse_cv_std)

MSE CV Mean: [0.73377483 0.53399558 0.53355699]
MSE CV Std: [0.13311533 0.33997907 0.33980901]


In [72]:
# Decision Tree Classifier

model_decision_tree_classifier = DecisionTreeClassifier(max_depth=4, min_samples_split=10, random_state=123)

scores_dtree = cross_val_score(model_decision_tree_classifier, X_train2, y_train2, cv=5, scoring=metrics.make_scorer(metrics.fbeta_score, beta=10, zero_division=0))

mse_cv_mean[2] = scores_dtree.mean()
mse_cv_std[2] = scores_dtree.std()

print("MSE CV Mean:", mse_cv_mean)
print("MSE CV Std:", mse_cv_std)


MSE CV Mean: [0.73377483 0.53399558 0.53355699]
MSE CV Std: [0.13311533 0.33997907 0.33980901]


In [73]:
f_score = pd.DataFrame(
    {
        'logit': [scores_logit.mean(), scores_logit.std()],
        'knn': [scores_knn.mean(), scores_knn.std()],
        'dtree': [scores_dtree.mean(), scores_dtree.std()]
    },
    index=['f_score_mean', 'f_score_std'])
f_score

Unnamed: 0,logit,knn,dtree
f_score_mean,0.733775,0.533996,0.533557
f_score_std,0.133115,0.339979,0.339809


In [75]:
best_model = 'logit'
best_accuracy = performance_metrics.iloc[0, 0]

print(best_model)
print(best_accuracy)

logit
0.9
