En este ejercicio intentaremos predecir, usando un modelo lineal, el valor de la cotización del IBEX a partir de otras variables de la capital catalana.

In [None]:
# !pip3 install apafib
# !pip3 install plotly

Cargamos los modulos necesarios

In [None]:
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import matplotlib as mpl

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn import set_config

from sklearn.metrics import  ConfusionMatrixDisplay,\
                  classification_report,  RocCurveDisplay, PrecisionRecallDisplay,\
                    accuracy_score, f1_score, precision_score, recall_score


from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.naive_bayes import BernoulliNB, GaussianNB, CategoricalNB, MultinomialNB

from sklearn.model_selection import GridSearchCV

from yellowbrick.target.feature_correlation import feature_correlation
from yellowbrick.classifier import precision_recall_curve
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import LocallyLinearEmbedding
from IPython.display import display, HTML

Leemos los datos de APAfib

In [None]:
# Load the BCN/IBEX dataset through the apafib helper.
from apafib import load_BCN_IBEX
data = load_BCN_IBEX()
# Preview the first rows (last expression of the cell, so Jupyter renders it).
data.head()

In [None]:
# Summary statistics for every column (include='all' also covers non-numeric ones),
# transposed so that each variable becomes one row.
data.describe(include='all').T

Vemos la distribucion de los datos por cada columna

In [None]:
# Plot the distribution of every column: a count plot for object-dtype columns
# and a histogram for everything else.
data_frame = pd.DataFrame(data)

# Size the grid from the actual number of columns instead of assuming 8x2, so the
# cell neither raises IndexError on wider data nor leaves a page of blank panels.
n_plot_cols = 2
n_plot_rows = max(1, -(-len(data_frame.columns) // n_plot_cols))  # ceiling division
fig, axes = plt.subplots(
    n_plot_rows,
    n_plot_cols,
    figsize=(15, 50 * n_plot_rows / 8),  # same height per row as the original 8-row figure
    squeeze=False,
)
flat_axes = axes.reshape(-1)

for ax, c in zip(flat_axes, data_frame.columns):
    if data_frame[c].dtype.kind == 'O':
        sns.countplot(x=c, data=data_frame, ax=ax)
    else:
        sns.histplot(x=c, data=data_frame, ax=ax)

# Hide the spare panel that appears when the number of columns is odd.
for ax in flat_axes[len(data_frame.columns):]:
    ax.set_visible(False)

plt.tight_layout()

Separamos los datos de la variable objetivo, y creamos los conjuntos de training y test

In [None]:
# Label of the target column; the trailing "/ " is part of the column name.
TARGET = 'Mercat bursàtil: IBEX-35 / '

# Select the features by name rather than by position so the target can never leak
# into X, wherever the column happens to sit in the frame.
X = data.drop(columns=[TARGET])
y = data[TARGET].copy()

In [48]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

Aplicamos el análisis de componentes principales (PCA) para ver si podemos reducir la dimensionalidad del problema sin perder mucha varianza por el camino:

In [None]:
# Fit a PCA with default settings on the training features only.
# NOTE(review): the features are passed in unscaled; PCA is scale-sensitive, so columns
# with large magnitudes may dominate the leading components — consider standardizing
# first (and then applying the same scaler wherever `pca.transform` is called).
pca = PCA().fit(X_train)

In [None]:
# Scree plot: per-component and cumulative explained-variance ratio of the fitted PCA.
var_ratio = pca.explained_variance_ratio_
component_ids = range(1, len(var_ratio) + 1)

fig = plt.figure(figsize=(8, 6))
plt.plot(component_ids, var_ratio, alpha=0.8, marker='.', label="Variancia Explicada")
y_label = plt.ylabel('Variancia explicada')
x_label = plt.xlabel('Componentes')
plt.plot(
    component_ids,
    np.cumsum(var_ratio),
    c='red',
    marker='.',
    label="Variancia explicada acumulativa",
)
plt.legend()
plt.title('Porcentaje de variancia explicada por componente');

Podemos ver que con 2 componentes podemos explicar un 90% de la varianza, cosa que no está mal, pero es ampliamente mejorable

In [None]:
# Project the training set onto the principal components and plot the first two,
# colouring each point by its target value.
X_trans = pca.transform(X_train)
first_pc, second_pc = X_trans[:, 0], X_trans[:, 1]
plt.figure(figsize=(8, 8))
sns.scatterplot(x=first_pc, y=second_pc, hue=y_train)

Si lo representamos en 2 dimensiones, podemos observar cúmulos de datos que, más o menos, se separan por valores; de todas formas, hay muchas zonas donde no se acaban de separar los puntos, y seguramente más dimensiones ayudarían a aclarar la confusión

Una vez visualizados los datos, vamos a entrenar tres modelos lineales y ver qué tan bien pueden ajustarse a los datos y, finalmente, predecir la variable objetivo. Todo sea dicho, dado que antes hemos observado que nuestros datos no siguen una distribución normal, es de esperar que el desempeño de los modelos sea más bien mediocre

In [None]:
from sklearn.linear_model import LinearRegression, PoissonRegressor, Lasso, LassoCV
from sklearn.model_selection import train_test_split,  KFold, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

Empezaremos entrenando el modelo de regresión lineal

In [None]:
# Baseline: ordinary least squares on the training features.
lr = LinearRegression().fit(X_train, y_train)

# cross_val_score clones and refits the estimator on every fold, so refitting `lr`
# inside the call was redundant. cv=5 and scoring='r2' are scikit-learn's defaults
# for a regressor; they are spelled out so the evaluation protocol is explicit.
r2_lr = np.mean(cross_val_score(lr, X_train, y_train, cv=5, scoring='r2'))

r2_results = pd.DataFrame({'lr': r2_lr}, index=['CV R2'])
r2_results.loc['Test R2', :] = [r2_score(y_test, lr.predict(X_test))]
# Plain least squares has no penalty term; 0 stands for "no regularisation".
r2_results.loc['lambda', 'lr'] = 0
r2_results

No obtenemos un mal resultado, pero es muy mejorable.

Vamos a ver si los siguientes modelos lo hacen mejor; veamos el resultado de la regresión Ridge

In [None]:
# Candidate regularisation strengths, passed as `alphas` to the cross-validated models.
lambdas = [1e-3,1e-2,0.1, 0.5,1,5,10,50,100]

In [None]:
# Ridge regression with the penalty chosen by 5-fold CV over `lambdas`.
ridge_cv = RidgeCV(alphas=lambdas, cv=5).fit(X_train, y_train)

# Outer 5-fold CV of the whole RidgeCV procedure on the training split.
r2_ridge = np.mean(cross_val_score(ridge_cv, X_train, y_train, cv=5, scoring='r2'))

r2_results = pd.DataFrame({'RidgeRegression': r2_ridge}, index=['CV R2'])
r2_results.loc['Test R2', :] = [r2_score(y_test, ridge_cv.predict(X_test))]
# Report the penalty RidgeCV actually selected rather than a hard-coded 10.
r2_results.loc['lambda', 'RidgeRegression'] = ridge_cv.alpha_
r2_results

Como la regresión lineal, el resultado no es del todo malo, pero deja mucho que desear

Probamos para terminar con la regresión LASSO

In [None]:
# LASSO with the penalty chosen by 5-fold CV over `lambdas` (solver tolerance 1e-1).
lasso_cv = LassoCV(alphas=lambdas, cv=5, tol=1e-1)
lasso_cv.fit(X_train, y_train)

# Cross-validate the whole LassoCV procedure on the training split and average the folds.
lasso_fold_scores = cross_val_score(lasso_cv, X_train, y_train)
r2_lasso = np.mean(lasso_fold_scores)

# Three-row summary: CV score, held-out test score, and the selected penalty.
lasso_test_r2 = r2_score(y_test, lasso_cv.predict(X_test))
r2_results = pd.DataFrame({'LASSO': r2_lasso}, index=['CV R2'])
r2_results.loc['Test R2', :] = [lasso_test_r2]
r2_results.loc['lambda', 'LASSO'] = lasso_cv.alpha_
r2_results

Igual que los dos anteriores, obtenemos resultados muy parecidos

In [None]:
import re


# Side-by-side summary of the three models: CV R2, test R2 and selected penalty.
# The table is built in one shot with numeric values only. Writing the string 'N/A'
# into a float column is deprecated by pandas (incompatible-dtype setitem) and turns
# the column into object dtype; NaN expresses "not applicable" without that problem.
r2_results = pd.DataFrame(
    [
        [r2_lr, r2_ridge, r2_lasso],
        [r2_score(y_test, fitted.predict(X_test)) for fitted in (lr, ridge_cv, lasso_cv)],
        [np.nan, ridge_cv.alpha_, lasso_cv.alpha_],  # plain least squares has no penalty
    ],
    index=['CV R2', 'Test R2', 'lambda'],
    columns=['lr', 'Ridge', 'LASSO'],
)
r2_results

Podemos representar cómo difieren los valores predichos por cada modelo de los datos reales de test, así como los qqplots de cada uno

In [None]:
from yellowbrick.regressor import residuals_plot
# One residuals plot (with Q-Q panel instead of histogram) per already-fitted model,
# each drawn on its own 12x8 figure.
for fitted_model in (lr, ridge_cv, lasso_cv):
    plt.figure(figsize=(12, 8))
    viz = residuals_plot(
        fitted_model,
        X_train, y_train,
        X_test, y_test,
        is_fitted=True,
        qqplot=True,
        hist=False,
    )

Para tratar de entender los modelos podemos visualizar los pesos que le ha asignado cada modelo a cada una de las variables de los datos.

In [None]:
# Coefficients of each fitted model, one column per model and one row per feature;
# displayed transposed so that models are rows.
coef_by_model = {
    'lr': lr.coef_,
    'LASSO': lasso_cv.coef_,
    'ridge': ridge_cv.coef_,
}
weights = pd.DataFrame(coef_by_model, index=X_train.columns)
weights.T

In [None]:
# One-row heatmaps of the absolute coefficients of each model; only the last strip
# shows the feature names on the x axis.
abs_weights = weights.T.abs()

fig, ax = plt.subplots(figsize=(20, 1))
sns.heatmap(abs_weights.loc[['lr'], :], annot=True, linewidths=.5, ax=ax,
            cbar=False, xticklabels=False)

for model_name, show_feature_names in (('LASSO', False), ('ridge', True)):
    plt.figure(figsize=(20, 1))
    sns.heatmap(abs_weights.loc[[model_name], :], annot=True, linewidths=.5,
                cbar=False, xticklabels=show_feature_names)

Vemos que los tres modelos coinciden bastante en qué variables son las relevantes y cuáles no. Vemos claramente dos categorías: la formada por las variables con pesos > 100 y la de las variables con pesos < 50.


Las variables que los 3 modelos destacan no parecen tener ninguna relación con la variable que queremos predecir; tan solo la variable que trata del IPC interanual podría tener algún sentido.
También, los 3 modelos desestiman las variables del tránsito en el aeropuerto del Prat, que, a priori, podrían parecer más importantes que las demás.

Probaremos a eliminar las variables de menor peso y reajustar los modelos.

In [55]:
# Features to remove before refitting the models; listed once so that the train and
# test splits are guaranteed to lose exactly the same columns.
low_weight_columns = [
    "Compra-Venda d'habitatges a Barcelona (Nombre de transmissions) / Habitatges",
    'Dades meteorològiques: Precipitació / Barcelona - Zona Universitària',
    'Matriculacions de vehicles a Barcelona / Turismes',
    'Preu electricitat (majorista) / ',
]

# errors='ignore' makes the cell idempotent: re-running it after the columns are
# already gone no longer raises KeyError. The trade-off is that a misspelled label
# would be skipped silently, so keep the names above in sync with the data.
X_train = X_train.drop(columns=low_weight_columns, errors='ignore')
X_test = X_test.drop(columns=low_weight_columns, errors='ignore')

In [63]:
# Refit the three linear models on the current (reduced) feature set and score each
# with 5-fold CV on the training split. cross_val_score clones and refits the
# estimator per fold, so no extra fit is needed inside the call; cv=5 / scoring='r2'
# are the scikit-learn defaults for regressors, written out for a uniform protocol.
lr = LinearRegression().fit(X_train, y_train)
r2_lr = np.mean(cross_val_score(lr, X_train, y_train, cv=5, scoring='r2'))

lasso_cv = LassoCV(alphas=lambdas, cv=5, tol=1e-1).fit(X_train, y_train)
r2_lasso = np.mean(cross_val_score(lasso_cv, X_train, y_train, cv=5, scoring='r2'))

ridge_cv = RidgeCV(alphas=lambdas, cv=5).fit(X_train, y_train)
r2_ridge = np.mean(cross_val_score(ridge_cv, X_train, y_train, cv=5, scoring='r2'))

# Summary table built in one shot with numeric values only: writing the string 'N/A'
# into a float column is deprecated by pandas and forces object dtype, so NaN is used
# to mark that plain least squares has no penalty.
r2_results = pd.DataFrame(
    [
        [r2_lr, r2_ridge, r2_lasso],
        [r2_score(y_test, fitted.predict(X_test)) for fitted in (lr, ridge_cv, lasso_cv)],
        [np.nan, ridge_cv.alpha_, lasso_cv.alpha_],
    ],
    index=['CV R2', 'Test R2', 'lambda'],
    columns=['lr', 'Ridge', 'LASSO'],
)
r2_results

Unnamed: 0,lr,Ridge,LASSO
CV R2,0.60988,0.607433,0.610669
Test R2,0.610725,0.610916,0.610732
lambda,,0.01,0.001


Vemos que no solo no han mejorado los resultados, sino que han empeorado ligeramente.

In [None]:
# Rebuild the coefficient table from the current fitted models and draw one heatmap
# strip of absolute weights per model; only the last strip labels the features.
coef_by_model = {
    'lr': lr.coef_,
    'LASSO': lasso_cv.coef_,
    'ridge': ridge_cv.coef_,
}
weights = pd.DataFrame(coef_by_model, index=X_train.columns)
abs_weights = weights.T.abs()

fig, ax = plt.subplots(figsize=(20, 1))
sns.heatmap(abs_weights.loc[['lr'], :], annot=True, linewidths=.5, ax=ax,
            cbar=False, xticklabels=False)

for model_name, show_feature_names in (('LASSO', False), ('ridge', True)):
    plt.figure(figsize=(20, 1))
    sns.heatmap(abs_weights.loc[[model_name], :], annot=True, linewidths=.5,
                cbar=False, xticklabels=show_feature_names)

Podemos ver que la distribucion de los pesos es bastante parecida.

In [102]:
from sklearn.preprocessing import PolynomialFeatures    
from sklearn.pipeline import make_pipeline

# Degree-2 polynomial expansion followed by LASSO, kept inside one pipeline so the
# expansion is fitted on training data only and merely applied to held-out data
# (the previous manual path called fit_transform on the test set).
model_lasso = make_pipeline(
    PolynomialFeatures(degree=2),
    LassoCV(alphas=lambdas, cv=5, tol=1e-1),
)
model_lasso.fit(X_train, y_train)
print(model_lasso.score(X_train, y_train))

# Evaluate the pipeline itself instead of refitting a second LassoCV by hand.
# Results go into their own names so that `lasso_cv` keeps referring to the model
# fitted on the plain features; code that passes raw X_test to `lasso_cv` or pairs
# its coefficients with X_train.columns would otherwise hit a shape mismatch.
# NOTE(review): the recorded output shows repeated warnings from the coordinate-descent
# solver; presumably convergence warnings caused by the unscaled polynomial terms and
# the loose tol — consider adding a StandardScaler step and confirm.
r2_lasso_poly = np.mean(cross_val_score(model_lasso, X_train, y_train))

r2_results = pd.DataFrame({'LASSO': r2_lasso_poly}, index=['CV R2'])
r2_results.loc['Test R2', :] = [r2_score(y_test, model_lasso.predict(X_test))]
r2_results.loc['lambda', 'LASSO'] = model_lasso[-1].alpha_  # last step is the LassoCV
r2_results


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gra

0.810062544886031


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gra

Unnamed: 0,LASSO
CV R2,0.712516
Test R2,0.685825
lambda,1.0


In [None]:
from yellowbrick.regressor import prediction_error

In [None]:
# Prediction-error plot for the already-fitted Ridge model on the test split,
# drawn on a fresh 8x8 figure.
plt.figure(figsize=(8,8))
visualizer = prediction_error(ridge_cv, X_test, y_test, is_fitted=True)

In [None]:
# Prediction-error plot for the already-fitted linear regression on the test split,
# drawn on a fresh 8x8 figure.
plt.figure(figsize=(8,8))
visualizer = prediction_error(lr, X_test, y_test, is_fitted=True)

In [None]:
# Prediction-error plot for the already-fitted LASSO model on the test split,
# drawn on a fresh 8x8 figure.
# NOTE(review): this only works if `lasso_cv` was fitted on the same columns as
# X_test; make sure it is not a model trained on polynomial-expanded features.
plt.figure(figsize=(8,8))
visualizer = prediction_error(lasso_cv, X_test, y_test, is_fitted=True)

In [None]:
from scipy import stats

In [None]:
# Normal Q-Q plot of the linear-regression residuals on the training split.
# scikit-learn's LinearRegression exposes no `.resid` attribute (that belongs to the
# statsmodels API), so the residuals are computed explicitly as observed - predicted.
lr_residuals = y_train - lr.predict(X_train)

fig, ax = plt.subplots(figsize=(8, 8))
# Draw on the axes created above instead of the global pyplot state.
stats.probplot(lr_residuals, plot=ax);

In [None]:
# Coefficient table for the three fitted models, one row per feature.
# Fixes: `ridge` was an undefined name (the fitted model is `ridge_cv`), and the third
# heatmap repeated 'ridge_cv' instead of showing 'lasso_cv'.
# All three models must have been fitted on the columns of X_train for the index to line up.
weights = pd.DataFrame(
    {'lr': lr.coef_, 'ridge_cv': ridge_cv.coef_, 'lasso_cv': lasso_cv.coef_},
    index=X_train.columns,
)
abs_weights = weights.T.abs()

# One heatmap strip of absolute weights per model; only the last one labels the features.
fig, ax = plt.subplots(figsize=(20, 1))
sns.heatmap(abs_weights.loc[['lr'], :], annot=True, linewidths=.5, ax=ax,
            cbar=False, xticklabels=False)

for model_name, show_feature_names in (('ridge_cv', False), ('lasso_cv', True)):
    plt.figure(figsize=(20, 1))
    sns.heatmap(abs_weights.loc[[model_name], :], annot=True, linewidths=.5,
                cbar=False, xticklabels=show_feature_names)