In [16]:
import pandas as pd

from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, RobustScaler
from sklearn.linear_model import Lasso
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

In [17]:
# Location of the raw CSV file that this cell cleans in place.
path = '../resources/Datos_Etapa_1_csv.csv'

with open(path, 'r') as file:
    content = file.read()

# Remove the double quotes.
# NOTE(review): this also strips quotes that protect commas inside a field;
# confirm the raw file contains no such fields.
cleaned_content = content.replace('"', '')

# Rewrite the file only when something was actually removed, so re-running the
# notebook does not overwrite the data file again for no reason.
if cleaned_content != content:
    with open(path, 'w') as file:
        file.write(cleaned_content)

# `content` ends up holding the quote-free text, exactly as before.
content = cleaned_content

In [18]:
# Load the CSV and check its dimensions; work on a copy so the raw frame stays intact.
data_original = pd.read_csv(path, sep=',')
data_copy = data_original.copy()
print(data_copy.shape)

# One-hot encode the categorical predictors. `dtype=int` makes get_dummies emit
# integer 0/1 indicator columns directly, so no follow-up cast is required.
categorical_columns = ['weekday', 'season', 'weathersit', 'time_of_day']
encoded_data = pd.get_dummies(data_copy, columns=categorical_columns, dtype=int)

# Names of the indicator columns produced by the encoding: every encoded column
# that is not one of the untouched (non-categorical) input columns.
passthrough_columns = set(data_copy.columns.difference(categorical_columns))
categorical_encoded_columns = [col for col in encoded_data.columns if col not in passthrough_columns]

# From here on `data_copy` refers to the encoded frame.
data_copy = encoded_data
data_copy.head()


(17379, 9)


Unnamed: 0,temp,atemp,hum,windspeed,cnt,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,...,season_Spring,season_Summer,season_Winter,weathersit_Clear,weathersit_Heavy Rain,weathersit_Light Rain,weathersit_Mist,time_of_day_Evening,time_of_day_Morning,time_of_day_Night
0,3.28,3.0014,0.81,0.0,16,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,1
1,2.34,1.9982,0.8,0.0,40,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,1
2,2.34,1.9982,0.8,0.0,32,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,1
3,3.28,3.0014,0.75,0.0,13,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,1
4,3.28,3.0014,0.75,0.0,1,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,1


In [19]:
# Drop the categorical columns while the model is being built
# (the drop itself is currently commented out):
#data_copy=data_copy.drop(['season','weathersit','time_of_day'], axis=1)
# Report the frame dimensions and preview its first rows.
print(tuple(data_copy.shape))
data_copy.head(n=5)

(17379, 23)


Unnamed: 0,temp,atemp,hum,windspeed,cnt,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,...,season_Spring,season_Summer,season_Winter,weathersit_Clear,weathersit_Heavy Rain,weathersit_Light Rain,weathersit_Mist,time_of_day_Evening,time_of_day_Morning,time_of_day_Night
0,3.28,3.0014,0.81,0.0,16,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,1
1,2.34,1.9982,0.8,0.0,40,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,1
2,2.34,1.9982,0.8,0.0,32,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,1
3,3.28,3.0014,0.75,0.0,13,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,1
4,3.28,3.0014,0.75,0.0,1,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,1


In [20]:
# Check for incomplete data: number of missing values in each column.
data_copy.isnull().sum(axis=0)

temp                     0
atemp                    0
hum                      0
windspeed                0
cnt                      0
weekday_0                0
weekday_1                0
weekday_2                0
weekday_3                0
weekday_4                0
weekday_5                0
weekday_6                0
season_Fall              0
season_Spring            0
season_Summer            0
season_Winter            0
weathersit_Clear         0
weathersit_Heavy Rain    0
weathersit_Light Rain    0
weathersit_Mist          0
time_of_day_Evening      0
time_of_day_Morning      0
time_of_day_Night        0
dtype: int64

In [21]:
# Check for duplicated records: count rows that repeat an earlier row exactly.
data_copy.duplicated(keep='first').sum()

np.int64(42)

In [22]:
# Remove repeated records, keeping the first occurrence of each, then report the new shape.
data_copy = data_copy[~data_copy.duplicated(keep='first')]
print(tuple(data_copy.shape))

(17337, 23)


In [23]:
# ---------------------------------------------- LASSO-REGULARIZED REGRESSION
# Train/test split, then separate the target (`cnt`) from the predictors.
train_lasso, test_lasso = train_test_split(data_copy, test_size=0.2, random_state=77)
x_train_lasso = train_lasso.drop(['cnt'], axis=1)
y_train_lasso = train_lasso['cnt']

# Standardize the predictors. The scaler is fitted on the training rows only.
columns = x_train_lasso.columns
scaler = StandardScaler()
# Keep the original row index on the scaled frame so that x_train_lasso and
# y_train_lasso stay aligned by label, not just by position.
x_train_lasso = pd.DataFrame(
    scaler.fit_transform(x_train_lasso),
    columns=columns,
    index=x_train_lasso.index,
)
# NOTE(review): the scaler is fitted before cross-validation, so each validation
# fold contributes to the scaling statistics. Putting the scaler and Lasso in one
# pipeline inside GridSearchCV would avoid that, but it changes the interface the
# following cells rely on (`scaler`, `best_estimator_.coef_`, the 'alpha' key).

# Hyperparameter search and training: 10-fold shuffled CV with a fixed seed.
kfold = KFold(n_splits=10, shuffle=True, random_state=77)
lasso = Lasso()
# Candidate regularization strengths.
# NOTE(review): if the selected alpha turns out to be the smallest candidate,
# the optimum may lie below this grid; consider adding values < 1.
valores_alpha = [1, 2, 3, 4, 5]
param_grid_lasso = {'alpha': valores_alpha}
grid_lasso = GridSearchCV(lasso, param_grid_lasso, cv=kfold, n_jobs=-1, scoring='neg_mean_squared_error')
grid_lasso.fit(x_train_lasso, y_train_lasso)


In [24]:
# Retrieve the best Lasso model found by the grid search.
mejor_modelo_lasso = grid_lasso.best_estimator_

# Build the test predictors/target and scale the predictors with the scaler
# already fitted on the training data (transform only, no refit).
x_test_lasso = test_lasso.drop(['cnt'], axis=1)
y_test_lasso = test_lasso['cnt']
# Keep the original row index so x_test_lasso and y_test_lasso stay aligned by label.
x_test_lasso = pd.DataFrame(
    scaler.transform(x_test_lasso),
    columns=columns,
    index=x_test_lasso.index,
)

# Show the selected alpha and the coefficient assigned to each feature.
print("mejor parametro lasso:", grid_lasso.best_params_)
list(zip(x_train_lasso.columns, mejor_modelo_lasso.coef_))

mejor parametro lasso: {'alpha': 1}


[('temp', np.float64(39.191981038398495)),
 ('atemp', np.float64(21.842442433534956)),
 ('hum', np.float64(-27.64610970016669)),
 ('windspeed', np.float64(-0.0)),
 ('weekday_0', np.float64(-2.426884847183269)),
 ('weekday_1', np.float64(-0.5682387396568973)),
 ('weekday_2', np.float64(-0.0)),
 ('weekday_3', np.float64(0.0)),
 ('weekday_4', np.float64(0.0)),
 ('weekday_5', np.float64(0.5062339724906544)),
 ('weekday_6', np.float64(0.6723201777679966)),
 ('season_Fall', np.float64(22.548732507426287)),
 ('season_Spring', np.float64(8.41304080787491)),
 ('season_Summer', np.float64(-2.302852369371599)),
 ('season_Winter', np.float64(-2.3308152040124495)),
 ('weathersit_Clear', np.float64(1.1937844567950389)),
 ('weathersit_Heavy Rain', np.float64(0.0)),
 ('weathersit_Light Rain', np.float64(-10.59687737334932)),
 ('weathersit_Mist', np.float64(-0.0)),
 ('time_of_day_Evening', np.float64(39.64565520338253)),
 ('time_of_day_Morning', np.float64(0.0)),
 ('time_of_day_Night', np.float64(-48.6

In [25]:
# Predict on the held-out test set with the tuned Lasso model.
y_pred_lasso = mejor_modelo_lasso.predict(x_test_lasso)

# Error metrics on the test set (the three computations are independent).
r2_lasso = r2_score(y_true=y_test_lasso, y_pred=y_pred_lasso)
mae_lasso = mean_absolute_error(y_true=y_test_lasso, y_pred=y_pred_lasso)
rmse_lasso = root_mean_squared_error(y_true=y_test_lasso, y_pred=y_pred_lasso)

# Show the raw predictions followed by the labelled metrics.
print(y_pred_lasso)
metricas_lasso = (["RMSE:", rmse_lasso], ["MAE:", mae_lasso], ["R2:", r2_lasso])
print(*metricas_lasso)

[ 88.06062622 142.09409283 246.10601738 ... 245.07867842 258.47375809
 183.58233138]
['RMSE:', np.float64(139.73457240621644)] ['MAE:', np.float64(103.60265964565383)] ['R2:', 0.41915267678849677]


In [26]:
# ---------------------------------------------- POLYNOMIAL REGRESSION
# Separate the target (`cnt`) from the predictors, then hold out 20% of the rows for testing.
y_pol = data_copy['cnt']
x_pol = data_copy.drop(columns=['cnt'])
x_train_pol, x_test_pol, y_train_pol, y_test_pol = train_test_split(
    x_pol,
    y_pol,
    test_size=0.2,
    random_state=77,
)

# Pipeline: polynomial feature expansion -> robust scaling -> linear regression.
polynomial_regression = make_pipeline(PolynomialFeatures(), RobustScaler(), LinearRegression())

# 10-fold shuffled cross-validation with a fixed seed.
kfold = KFold(n_splits=10, shuffle=True, random_state=77)

# Search space: the degree of the polynomial expansion.
valores_busqueda = [2, 3]
param_grid_pol = {'polynomialfeatures__degree': valores_busqueda}
grid_pol = GridSearchCV(
    estimator=polynomial_regression,
    param_grid=param_grid_pol,
    scoring='neg_mean_squared_error',
    n_jobs=1,
    cv=kfold,
)
grid_pol.fit(x_train_pol, y_train_pol)

print("mejor parametro polinomial: ", grid_pol.best_params_)

mejor parametro polinomial:  {'polynomialfeatures__degree': 2}


In [27]:
# Best pipeline found by the polynomial grid search.
mejor_modelo_pol = grid_pol.best_estimator_

# Predict on the held-out test set.
y_pred_pol = mejor_modelo_pol.predict(x_test_pol)

# Error metrics on the test set (the three computations are independent).
r2_pol = r2_score(y_true=y_test_pol, y_pred=y_pred_pol)
mae_pol = mean_absolute_error(y_true=y_test_pol, y_pred=y_pred_pol)
rmse_pol = root_mean_squared_error(y_true=y_test_pol, y_pred=y_pred_pol)

# Show the raw predictions followed by the labelled metrics.
print(y_pred_pol)
metricas_pol = (["RMSE:", rmse_pol], ["MAE", mae_pol], ["R2:", r2_pol])
print(*metricas_pol)

[ 82.77734375 190.33398438 244.78710938 ... 239.39648438 247.38085938
 216.68164062]
['RMSE:', np.float64(134.64167231841884)] ['MAE', np.float64(98.64871740286188)] ['R2:', 0.46072132634337837]


In [28]:
# Print the test-set errors of both models one after the other for comparison.
resumen_errores = (
    ("ERRORES LASSO:", (["RMSE:", rmse_lasso], ["MAE", mae_lasso], ["R2:", r2_lasso])),
    ("ERRORES POLYNOMIAL:", (["RMSE:", rmse_pol], ["MAE:", mae_pol], ["R2:", r2_pol])),
)
for titulo, metricas in resumen_errores:
    print(titulo)
    print(*metricas)

ERRORES LASSO:
['RMSE:', np.float64(139.73457240621644)] ['MAE', np.float64(103.60265964565383)] ['R2:', 0.41915267678849677]
ERRORES POLYNOMIAL:
['RMSE:', np.float64(134.64167231841884)] ['MAE:', np.float64(98.64871740286188)] ['R2:', 0.46072132634337837]
