In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

In [4]:
df = pd.read_csv("data/sales.csv")
df

Unnamed: 0,True_index,Store_ID,Day_of_week,Date,Nb_customers_on_day,Open,Promotion,State_holiday,School_holiday,Sales
0,0,625,3,2013-11-06,641,1,1,0,0,7293
1,1,293,2,2013-07-16,877,1,1,0,1,7060
2,2,39,4,2014-01-23,561,1,1,0,0,4565
3,3,676,4,2013-09-26,1584,1,1,0,0,6380
4,4,709,3,2014-01-22,1477,1,1,0,0,11647
...,...,...,...,...,...,...,...,...,...,...
640835,712040,674,6,2014-09-20,611,1,0,0,0,4702
640836,712041,1014,4,2015-01-15,1267,1,1,0,0,12545
640837,712042,135,6,2015-06-20,595,1,0,0,0,5823
640838,712043,810,1,2014-08-18,599,1,1,0,1,7986


In [4]:
df.isnull().sum()

True_index             0
Store_ID               0
Day_of_week            0
Date                   0
Nb_customers_on_day    0
Open                   0
Promotion              0
State_holiday          0
School_holiday         0
Sales                  0
dtype: int64

In [5]:
df.duplicated().sum()  

0

In [6]:
df.dtypes

True_index              int64
Store_ID                int64
Day_of_week             int64
Date                   object
Nb_customers_on_day     int64
Open                    int64
Promotion               int64
State_holiday          object
School_holiday          int64
Sales                   int64
dtype: object

In [5]:
df["Date"] = pd.to_datetime(df["Date"])

In [6]:
df = df.sort_values(by="Date")
df

Unnamed: 0,True_index,Store_ID,Day_of_week,Date,Nb_customers_on_day,Open,Promotion,State_holiday,School_holiday,Sales
57790,64201,760,2,2013-01-01,0,0,0,a,1,0
273970,304410,178,2,2013-01-01,0,0,0,a,1,0
623089,692372,115,2,2013-01-01,0,0,0,a,1,0
313880,348682,917,2,2013-01-01,0,0,0,a,1,0
230463,256049,987,2,2013-01-01,0,0,0,a,1,0
...,...,...,...,...,...,...,...,...,...,...
596971,663408,408,5,2015-07-31,639,1,1,0,1,7985
73682,81811,73,5,2015-07-31,537,1,1,0,1,6026
521899,579813,71,5,2015-07-31,869,1,1,0,1,11545
121063,134544,872,5,2015-07-31,794,1,1,0,1,9747


In [7]:
# Crear nuevas columnas a partir de la columna Date
df['year'] = df['Date'].dt.year  # Año
df['month'] = df['Date'].dt.month  # Mes
df['day'] = df['Date'].dt.day  # Día del mes
df['week'] = df['Date'].dt.isocalendar().week  # Semana del año
df['dayofweek'] = df['Date'].dt.dayofweek  # Día de la semana (0=lunes, 6=domingo)
df['is_weekend'] = df['dayofweek'].apply(lambda x: 1 if x >= 5 else 0)  # Fin de semana (1=sábado/domingo)

In [10]:
df.head(3)

Unnamed: 0,True_index,Store_ID,Day_of_week,Date,Nb_customers_on_day,Open,Promotion,State_holiday,School_holiday,Sales,year,month,day,week,dayofweek,is_weekend
57790,64201,760,2,2013-01-01,0,0,0,a,1,0,2013,1,1,1,1,0
273970,304410,178,2,2013-01-01,0,0,0,a,1,0,2013,1,1,1,1,0
623089,692372,115,2,2013-01-01,0,0,0,a,1,0,2013,1,1,1,1,0


In [8]:
df = pd.get_dummies(df, columns=['State_holiday'], drop_first=True)

In [9]:
df.columns = df.columns.str.lower()

In [13]:
# Características (X) y la etiqueta (y)
features = df.drop(columns=['sales', 'date'], axis=1)
target = df['sales']
target = target.squeeze()  # Etiqueta que deseas predecir


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy import stats
import warnings
warnings.filterwarnings("ignore")

# Simulación de un DataFrame
# df = pd.read_csv("ruta_a_tus_datos.csv")

# Preparación de los datos
features = df.drop(columns=['sales', 'date'], axis=1)
target = df['sales'].squeeze()  # Etiqueta que deseas predecir

# Función para eliminar outliers
def remove_outliers(df, columns, z_threshold=3):
    columns = [col for col in columns if col in df.columns]
    return df[(np.abs(stats.zscore(df[columns])) < z_threshold).all(axis=1)]


# Modelos a usar
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'KNN': KNeighborsRegressor()
}

# Eliminar outliers solo una vez
features_no_outliers = remove_outliers(features)
target_no_outliers = target[features_no_outliers.index]

# Dividir los datos en train y test una vez (sin outliers)
X_train, X_test, y_train, y_test = train_test_split(features_no_outliers, target_no_outliers, test_size=0.2, random_state=42)

# Preparar resultados en un DataFrame
results = []

# Escaladores
scalers = {
    'Standardization': StandardScaler(),
    'Normalization': MinMaxScaler(),
    'No Scaling': None  # Para el caso de no aplicar escalado
}

# Iterar a través de los diferentes conjuntos de características y métodos de escalado
for scaling_name, scaler in scalers.items():
    for feature_set_name, features_list in feature.items():
        # Verificar si el subconjunto no está vacío
        if not features_list:
            print(f"Skipping feature set '{feature_set_name}' because it is empty.")
            continue
        
        # Seleccionar subconjunto de características
        X_train_subset = X_train[features_list]
        X_test_subset = X_test[features_list]

        # Aplicar escalado solo si es necesario
        if scaler:
            X_train_scaled = scaler.fit_transform(X_train_subset)
            X_test_scaled = scaler.transform(X_test_subset)
        else:
            X_train_scaled = X_train_subset
            X_test_scaled = X_test_subset
        
        # Entrenar y evaluar los modelos
        for model_name, model in models.items():
            model.fit(X_train_scaled, y_train)
            y_pred = model.predict(X_test_scaled)
            
            # Calcular MSE, MAE y R²
            mse = mean_squared_error(y_test, y_pred)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            
            # Guardar los resultados
            results.append({
                'Model': model_name,
                'Scaling': scaling_name,
                'Feature Set': feature_set_name,
                'MSE': mse,
                'MAE': mae,
                'R²': r2
            })

# Convertir resultados a DataFrame
results_df = pd.DataFrame(results)

# Imprimir resumen de resultados
print(results_df)

# Encontrar el mejor modelo según los criterios
best_model_mse = results_df.loc[results_df['MSE'].idxmin()]
best_model_mae = results_df.loc[results_df['MAE'].idxmin()]
best_model_r2 = results_df.loc[results_df['R²'].idxmax()]

print("\nBest performing model based on MSE:")
print(best_model_mse)

print("\nBest performing model based on MAE:")
print(best_model_mae)

print("\nBest performing model based on R²:")
print(best_model_r2)




Skipping feature set 'Numeric Only' because it is empty.


In [10]:
features = df.drop(columns=['sales', 'date'], axis=1)
target = df['sales'].squeeze()  # Etiqueta que deseas predecir

In [11]:
features.dtypes

true_index              int64
store_id                int64
day_of_week             int64
nb_customers_on_day     int64
open                    int64
promotion               int64
school_holiday          int64
year                    int32
month                   int32
day                     int32
week                   UInt32
dayofweek               int32
is_weekend              int64
state_holiday_a          bool
state_holiday_b          bool
state_holiday_c          bool
dtype: object

In [12]:
target.dtypes

dtype('int64')

In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings("ignore")

# Ya no eliminamos outliers

# Características y target (proporcionadas)
features = df.drop(columns=['sales', 'date'], axis=1)  # Asegúrate que 'df' contenga tus datos
target = df['sales'].squeeze()  # Etiqueta que deseas predecir

# Definir conjuntos de características para probar
all_features = features.columns.tolist()
numeric_features = [col for col in features.columns if features[col].dtype in [np.int64, np.int32, np.uint32]]

feature_sets = {
    'All Features': all_features,
    'Numeric Only': numeric_features
}

# Definir los modelos a usar
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'KNN': KNeighborsRegressor(),
    'Random Forest': RandomForestRegressor(random_state=42)
}

# Dividir los datos en train y test (sin outliers)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Preparar resultados en un DataFrame
results = []

# Escaladores
scalers = {
    'Standardization': StandardScaler(),
    'Normalization': MinMaxScaler(),
    'No Scaling': None  # Para el caso de no aplicar escalado
}

# Iterar a través de los diferentes conjuntos de características y métodos de escalado
for scaling_name, scaler in scalers.items():
    for feature_set_name, features_list in feature_sets.items():
        # Verificar si el subconjunto no está vacío
        if not features_list:
            print(f"Skipping feature set '{feature_set_name}' because it is empty.")
            continue
        
        # Seleccionar subconjunto de características
        X_train_subset = X_train[features_list]
        X_test_subset = X_test[features_list]

        # Aplicar escalado solo si es necesario
        if scaler:
            X_train_scaled = scaler.fit_transform(X_train_subset)
            X_test_scaled = scaler.transform(X_test_subset)
        else:
            X_train_scaled = X_train_subset
            X_test_scaled = X_test_subset
        
        # Entrenar y evaluar los modelos
        for model_name, model in models.items():
            model.fit(X_train_scaled, y_train)
            y_pred = model.predict(X_test_scaled)
            
            # Calcular MSE, MAE y R²
            mse = mean_squared_error(y_test, y_pred)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            
            # Guardar los resultados
            results.append({
                'Model': model_name,
                'Scaling': scaling_name,
                'Feature Set': feature_set_name,
                'MSE': mse,
                'MAE': mae,
                'R²': r2
            })

# Convertir resultados a DataFrame
results_df = pd.DataFrame(results)

# Imprimir resumen de resultados
print(results_df)

# Encontrar el mejor modelo según los criterios
best_model_mse = results_df.loc[results_df['MSE'].idxmin()]
best_model_mae = results_df.loc[results_df['MAE'].idxmin()]
best_model_r2 = results_df.loc[results_df['R²'].idxmax()]

print("\nBest performing model based on MSE:")
print(best_model_mse)

print("\nBest performing model based on MAE:")
print(best_model_mae)

print("\nBest performing model based on R²:")
print(best_model_r2)

# Opcional: Guardar los resultados en un CSV
results_df.to_csv('model_comparison_results.csv', index=False)


KeyboardInterrupt: 

In [15]:
#display all rows
pd.set_option('display.max_rows', None)

results_df.sort_values(by = "R²", ascending = False)

Unnamed: 0,Model,Scaling,Feature Set,MSE,MAE,R²
7,Decision Tree,Normalization,All Features,1618566.0,744.424162,0.890623
13,Decision Tree,No Scaling,All Features,1619085.0,744.541391,0.890588
1,Decision Tree,Standardization,All Features,1619263.0,744.947179,0.890576
4,Decision Tree,Standardization,Numeric Only,1653845.0,751.471803,0.888239
16,Decision Tree,No Scaling,Numeric Only,1655522.0,751.287864,0.888125
10,Decision Tree,Normalization,Numeric Only,1656748.0,751.680162,0.888043
5,KNN,Standardization,Numeric Only,2095572.0,970.320486,0.858388
2,KNN,Standardization,All Features,2110319.0,974.817015,0.857392
6,Linear Regression,Normalization,All Features,2127920.0,980.039665,0.856202
0,Linear Regression,Standardization,All Features,2127920.0,980.040017,0.856202
