In [1]:
# Move the working directory one level up so the relative paths used by later
# cells ('data/processed/...', 'models/...') resolve from the parent folder.
# NOTE(review): this magic is not idempotent - re-running the cell climbs one
# more level each time. Run it exactly once per kernel session.
%cd ..

/Users/paolaalejandraleonguarneros/Documents/GitHub/MLOps


In [2]:
import pickle
from pathlib import Path

import pandas as pd
from pandas.api.types import CategoricalDtype
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [4]:
# Location of the cleaned bike-sharing dataset, relative to the current
# working directory.
bici = "data/processed/bike_sharing_cleaned.csv"

# Read the whole CSV into memory; dtypes are inferred by pandas at this point.
df = pd.read_csv(filepath_or_buffer=bici)

In [5]:
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17355 entries, 0 to 17354
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     17088 non-null  float64
 1   dteday      17161 non-null  object 
 2   season      17352 non-null  float64
 3   yr          17351 non-null  float64
 4   mnth        17351 non-null  float64
 5   hr          16983 non-null  float64
 6   holiday     17355 non-null  int64  
 7   weekday     17353 non-null  float64
 8   workingday  17348 non-null  float64
 9   weathersit  17355 non-null  int64  
 10  temp        17355 non-null  float64
 11  atemp       17355 non-null  float64
 12  hum         17355 non-null  float64
 13  windspeed   17355 non-null  float64
 14  casual      17355 non-null  int64  
 15  registered  17355 non-null  int64  
 16  cnt         17355 non-null  int64  
dtypes: float64(11), int64(5), object(1)
memory usage: 2.3+ MB
None


In [6]:
print("Se cambiará el tipo de dato de las columnas season, mnth, weekday y weathersit a categóricas.")

# Column groups by target dtype. Names that are not present in df (for example
# "mixed_type_col") are skipped by the membership checks below.
# NOTE(review): "hr" is not listed in any group, so it keeps its float64 dtype
# and its missing values - confirm that this is intended.
cat_cols   = ["season", "mnth", "weekday", "weathersit"]   # categorical
int_cols   = ["yr", "holiday", "workingday", "cnt", "registered", "casual","instant","mixed_type_col"]        # integer
float_cols = ["temp", "atemp", "hum", "windspeed"]         # continuous

# Coerce every listed column to numeric first; entries that cannot be parsed
# become NaN instead of raising.
cols = [c for c in (cat_cols + int_cols + float_cols) if c in df.columns]
df[cols] = df[cols].apply(pd.to_numeric, errors="coerce")

# Closed, unordered category sets. Any value outside the listed categories is
# turned into NaN by astype(), so out-of-range codes are dropped silently.
season_dtype     = CategoricalDtype(categories=[1,2,3,4], ordered=False)
mnth_dtype       = CategoricalDtype(categories=list(range(1,13)), ordered=False)
weekday_dtype    = CategoricalDtype(categories=list(range(0,7)), ordered=False)
weathersit_dtype = CategoricalDtype(categories=[1,2,3,4], ordered=False)

dtype_map = {
    "season": season_dtype,
    "mnth": mnth_dtype,
    "weekday": weekday_dtype,
    "weathersit": weathersit_dtype,
}

for c, ctype in dtype_map.items():
    if c in df.columns:
        df[c] = df[c].astype(ctype)


# Nullable "Int64" (capital I) keeps missing values as <NA> instead of forcing
# the column back to float.
for c in int_cols:
    if c in df.columns:
        df[c] = df[c].astype("Int64")


for c in float_cols:
    if c in df.columns:
       df[c] = df[c].astype("float64")

# DataFrame.info() prints its own report and returns None, so it is called
# directly; print(df.info()) would append a stray "None" line to the output.
df.info()

Se cambiará el tipo de dato de las columnas season, mnth, weekday y weathersit a categóricas.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17355 entries, 0 to 17354
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   instant     17088 non-null  Int64   
 1   dteday      17161 non-null  object  
 2   season      17350 non-null  category
 3   yr          17351 non-null  Int64   
 4   mnth        17351 non-null  category
 5   hr          16983 non-null  float64 
 6   holiday     17355 non-null  Int64   
 7   weekday     17353 non-null  category
 8   workingday  17348 non-null  Int64   
 9   weathersit  17355 non-null  category
 10  temp        17355 non-null  float64 
 11  atemp       17355 non-null  float64 
 12  hum         17355 non-null  float64 
 13  windspeed   17355 non-null  float64 
 14  casual      17355 non-null  Int64   
 15  registered  17355 non-null  Int64   
 16  cnt         17355 non-null  Int64   

In [7]:
# Continuous features plus the target; all are rescaled to [0, 1] using a
# MinMaxScaler fitted on the training split only.
cols_to_scale = ['temp', 'atemp', 'hum', 'windspeed', 'cnt']
scaler_path = "models/minmax_scaler.pickle"

df_new = df.drop(['instant','casual','registered','dteday'], axis=1)
print("Se eliminaron las columnas instant, casual, registered y dteday por ser redundantes para el modelo")

# One-hot encode the categorical columns (the first level of each is dropped).
df_new = pd.get_dummies(df_new, drop_first=True)

# 'cnt' arrives as nullable Int64, which cannot store the fractional values a
# MinMaxScaler produces. Cast every column to be scaled to float64 *before*
# splitting so the three splits share the same dtypes.
df_new[cols_to_scale] = df_new[cols_to_scale].astype("float64")
# NOTE(review): per the earlier df.info() output, 'yr', 'hr' and 'workingday'
# still contain missing values here; they are neither imputed nor dropped.

# 70 % train, 15 % validation, 15 % test.
df_train, df_vt = train_test_split(df_new, train_size = 0.70, test_size = 0.30, random_state = 333)
df_valid, df_test = train_test_split(df_vt, train_size = 0.50, test_size = 0.50, random_state = 333)

# Work on independent copies so the assignments below never write into a view
# of df_new / df_vt (avoids SettingWithCopy warnings and aliasing).
df_train, df_valid, df_test = df_train.copy(), df_valid.copy(), df_test.copy()

# Fit on the training split only, then apply the same transform to the others
# so no information from validation/test leaks into the scaler.
scaler = MinMaxScaler()
df_train[cols_to_scale] = scaler.fit_transform(df_train[cols_to_scale])
print("Se escalaron las variables temp, atemp, hum, windspeed y cnt")

df_valid[cols_to_scale] = scaler.transform(df_valid[cols_to_scale])
df_test[cols_to_scale] = scaler.transform(df_test[cols_to_scale])
print("Se escalaron las variables temp, atemp, hum, windspeed y cnt en validación y test.")

# Persist the fitted scaler; create the target directory first so a fresh
# checkout without a models/ folder does not fail on open().
Path(scaler_path).parent.mkdir(parents=True, exist_ok=True)
with open(scaler_path, "wb") as f:
    pickle.dump(scaler, f)

print(f"Scaler guardado en: {scaler_path}")

Se eliminaron las columnas instant, casual, registered y dteday por ser redundantes para el modelo
Se escalaron las variables temp, atemp, hum, windspeed y cnt
