In [46]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, RobustScaler
from sklearn.linear_model import Lasso
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [47]:
# Strip every double-quote character from the raw CSV so that it can be parsed
# as plain comma-separated text by the loading cell.
path = './resources/Datos_Etapa_1_csv.csv'
with open(path, 'r') as file:
    raw_content = file.read()

content = raw_content.replace('"', '')  # remove the double quotes

# Only touch the file on disk when something was actually removed; re-running
# the cell on an already-clean file then leaves the source data untouched.
if content != raw_content:
    with open(path, 'w') as file:
        file.write(content)

In [48]:
# Load the CSV, keep the raw frame aside and work on a copy of it.
data_original = pd.read_csv(path, sep=',')
data_copy = data_original.copy()
print(data_copy.shape)
# Replace each categorical column with its integer category codes.
for columna_categorica in ('season', 'weathersit', 'time_of_day'):
    data_copy[columna_categorica] = data_copy[columna_categorica].astype('category').cat.codes
data_copy.head()


(17379, 9)


Unnamed: 0,season,weekday,weathersit,temp,atemp,hum,windspeed,cnt,time_of_day
0,3,6,0,3.28,3.0014,0.81,0.0,16,2
1,3,6,0,2.34,1.9982,0.8,0.0,40,2
2,3,6,0,2.34,1.9982,0.8,0.0,32,2
3,3,6,0,3.28,3.0014,0.75,0.0,13,2
4,3,6,0,3.28,3.0014,0.75,0.0,1,2


In [49]:
# Dropping the categorical columns while the model is being built is currently disabled:
# data_copy = data_copy.drop(['season', 'weathersit', 'time_of_day'], axis=1)
n_filas, n_columnas = data_copy.shape
print((n_filas, n_columnas))
data_copy.head(5)

(17379, 9)


Unnamed: 0,season,weekday,weathersit,temp,atemp,hum,windspeed,cnt,time_of_day
0,3,6,0,3.28,3.0014,0.81,0.0,16,2
1,3,6,0,2.34,1.9982,0.8,0.0,40,2
2,3,6,0,2.34,1.9982,0.8,0.0,32,2
3,3,6,0,3.28,3.0014,0.75,0.0,13,2
4,3,6,0,3.28,3.0014,0.75,0.0,1,2


In [50]:
# Count the missing values in each column.
data_copy.isnull().sum()

season         0
weekday        0
weathersit     0
temp           0
atemp          0
hum            0
windspeed      0
cnt            0
time_of_day    0
dtype: int64

In [51]:
# Count the rows that repeat an earlier row exactly.
data_copy.duplicated(keep='first').sum()

np.int64(42)

In [52]:
# Discard repeated rows, keeping the first occurrence of each one.
filas_repetidas = data_copy.duplicated()
data_copy = data_copy[~filas_repetidas]
print(data_copy.shape)

(17337, 9)


In [53]:
# ---------------------------------------------- LASSO-REGULARIZED REGRESSION
# Hold out 20% of the rows for testing, then separate the target ('cnt')
# from the predictors of the training split.
train_lasso, test_lasso = train_test_split(data_copy, test_size=0.2, random_state=77)
x_train_lasso = train_lasso.drop(['cnt'], axis = 1)
y_train_lasso = train_lasso['cnt']
# STANDARDIZATION
columns = x_train_lasso.columns
scaler = StandardScaler()
# fit_transform returns a bare array; rebuild the frame with the original row
# labels so that x_train_lasso stays label-aligned with y_train_lasso (without
# index= the frame would get a fresh 0..n-1 index that no longer matches y).
x_train_lasso = pd.DataFrame(
    scaler.fit_transform(x_train_lasso),
    columns = columns,
    index = x_train_lasso.index,
)
# NOTE(review): the scaler is fitted on the whole training split before the
# cross-validation below, so every validation fold has already influenced the
# scaling. Moving the scaler into a pipeline would avoid that, but it would
# change the type of grid_lasso.best_estimator_.
# HYPER-PARAMETER SEARCH AND TRAINING
# 10-fold shuffled cross-validation with a fixed seed for reproducibility.
kfold = KFold(n_splits=10, shuffle= True, random_state=0)
lasso = Lasso()
# Candidate regularization strengths explored by the grid search.
valores_alpha = [1, 2, 3, 4, 5]
param_grid_lasso = {'alpha': valores_alpha}
# Candidates are ranked by negated mean squared error (higher is better).
grid_lasso = GridSearchCV(lasso, param_grid_lasso, cv = kfold, n_jobs = 1, scoring= 'neg_mean_squared_error')
grid_lasso.fit(x_train_lasso, y_train_lasso)


In [54]:
# Retrieve the estimator refitted with the best alpha found by the grid search.
mejor_modelo_lasso = grid_lasso.best_estimator_
# Separate the target from the predictors of the held-out split.
x_test_lasso = test_lasso.drop(['cnt'], axis = 1)
y_test_lasso = test_lasso['cnt']
# Apply the scaler fitted on the training data (transform only, no refit) and
# keep the original row labels so x_test_lasso stays aligned with y_test_lasso.
x_test_lasso = pd.DataFrame(
    scaler.transform(x_test_lasso),
    columns= columns,
    index= x_test_lasso.index,
)

print("mejor parametro lasso:", grid_lasso.best_params_)
# Pair every feature name with its fitted coefficient.
list(zip(x_train_lasso.columns, mejor_modelo_lasso.coef_))

mejor parametro lasso: {'alpha': 1}


[('season', np.float64(-23.45085255050671)),
 ('weekday', np.float64(3.0876278152166963)),
 ('weathersit', np.float64(-2.0062102355466447)),
 ('temp', np.float64(27.911962792088026)),
 ('atemp', np.float64(26.7347235057036)),
 ('hum', np.float64(-31.29749050425788)),
 ('windspeed', np.float64(-1.1885955663536987)),
 ('time_of_day', np.float64(-76.63857841704939))]

In [55]:
# Predict on the held-out split with the best Lasso model.
y_pred_lasso = mejor_modelo_lasso.predict(x_test_lasso)
# Error metrics.
# RMSE is computed as the square root of the MSE: the `squared=False` keyword
# of mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version.
rmse_lasso = mean_squared_error(y_test_lasso, y_pred_lasso) ** 0.5
mae_lasso = mean_absolute_error(y_test_lasso, y_pred_lasso)
r2_lasso = r2_score(y_test_lasso, y_pred_lasso)

print(y_pred_lasso)
print(["RMSE:", rmse_lasso], ["MAE:", mae_lasso], ["R2:", r2_lasso])

[ 80.57869498 122.85920633 240.2443417  ... 237.21321699 250.52898995
 170.00367744]
['RMSE:', np.float64(140.2024018429416)] ['MAE:', np.float64(103.80299404474125)] ['R2:', 0.41525682828042443]




In [56]:
# ---------------------------------------------- POLYNOMIAL REGRESSION
# Separate the target ('cnt') from the predictors, then hold out 20% for testing.
y_pol = data_copy['cnt']
x_pol = data_copy.drop(columns=['cnt'])
x_train_pol, x_test_pol, y_train_pol, y_test_pol = train_test_split(
    x_pol, y_pol, test_size=0.2, random_state=77
)
# HYPER-PARAMETER SEARCH
# Pipeline: polynomial expansion -> robust scaling -> ordinary least squares.
polynomial_regression = make_pipeline(PolynomialFeatures(), RobustScaler(), LinearRegression())
# 10-fold shuffled cross-validation with a fixed seed.
kfold = KFold(n_splits=10, shuffle=True, random_state=0)
# Search space: polynomial degrees to try.
valores_busqueda = [2,3]
param_grid_pol = dict(polynomialfeatures__degree=valores_busqueda)
grid_pol = GridSearchCV(
    estimator=polynomial_regression,
    param_grid=param_grid_pol,
    scoring='neg_mean_squared_error',
    cv=kfold,
    n_jobs=1,
)
grid_pol.fit(x_train_pol, y_train_pol)

print("mejor parametro polinomial: ", grid_pol.best_params_)

mejor parametro polinomial:  {'polynomialfeatures__degree': 3}


In [57]:
# Pipeline refitted with the best polynomial degree found by the grid search.
mejor_modelo_pol = grid_pol.best_estimator_
# Predict on the held-out split.
y_pred_pol = mejor_modelo_pol.predict(x_test_pol)
# Error metrics.
# RMSE is computed as the square root of the MSE: the `squared=False` keyword
# of mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version.
rmse_pol = mean_squared_error(y_test_pol, y_pred_pol) ** 0.5
mae_pol = mean_absolute_error(y_test_pol, y_pred_pol)
r2_pol = r2_score(y_test_pol, y_pred_pol)

print(y_pred_pol)
print(["RMSE:", rmse_pol], ["MAE",mae_pol], ["R2:", r2_pol])

[ 50.74029662 148.60774939 303.2100328  ... 242.83897657 251.50121205
 199.24301463]
['RMSE:', np.float64(133.1037163427306)] ['MAE', np.float64(96.7771163741743)] ['R2:', 0.47297087692420936]




In [58]:
# Side-by-side summary of the test-set errors of both models.
_reportes = (
    ("ERRORES LASSO:", (["RMSE:", rmse_lasso], ["MAE", mae_lasso], ["R2:", r2_lasso])),
    ("ERRORES POLYNOMIAL:", (["RMSE:", rmse_pol], ["MAE:", mae_pol], ["R2:", r2_pol])),
)
for _titulo, _metricas in _reportes:
    print(_titulo)
    print(*_metricas)

ERRORES LASSO:
['RMSE:', np.float64(140.2024018429416)] ['MAE', np.float64(103.80299404474125)] ['R2:', 0.41525682828042443]
ERRORES POLYNOMIAL:
['RMSE:', np.float64(133.1037163427306)] ['MAE:', np.float64(96.7771163741743)] ['R2:', 0.47297087692420936]
