In [4]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Load the dataset
file_path = './data/javier_dastas_sales_predictions.csv'  # Update the path if needed
data = pd.read_csv(file_path)

# print(data.head(3))

# Ensure the 'date' column is of datetime type
data['date'] = pd.to_datetime(data['date'])

# Dont need the 'year' and 'month' columns based on 'date'

# Drop the 'date' column
columns_to_drop = ['state_holiday', 'school_holiday', 'index', 'date']
data = data.drop(columns=columns_to_drop)

# Filter only the days when the store was open
data_open = data[data['open'] == 1].reset_index(drop=True)

# Drop the 'open' column as it's no longer needed
data_open = data_open.drop(columns=['open'])

# Print the processed data
print("Processed data (stores open only):")
print(data_open.head())

# Save the preprocessed dataset to a new CSV file
output_file = './data/real_data_preprocessed_sales.csv'
data_open.to_csv(output_file, index=False)
print(f"\nPreprocessed dataset saved to '{output_file}'.")

Processed data (stores open only):
   store_ID  day_of_week  nb_customers_on_day  promotion        sales
0       404            3                  657          1  5882.174829
1       683            2                  862          0  8062.242394
2       920            3                  591          1  5783.027947
3       758            4                  569          0  3930.128802
4       563            1                  321          1  3556.917393

Preprocessed dataset saved to './data/real_data_preprocessed_sales.csv'.


  data['date'] = pd.to_datetime(data['date'])


In [7]:
import pandas as pd
from scipy.stats import zscore

# Load the preprocessed dataset
file_path = './data/real_data_preprocessed_sales.csv'  # Update the path if needed
data = pd.read_csv(file_path)

# Calculate the z-scores for the 'sales' column
data['z_score'] = zscore(data['sales'])

# Filter out rows with z-scores greater than 3 or less than -3
data_without_outliers = data[data['z_score'].abs() <= 3].reset_index(drop=True)

# Drop the 'z_score' column as it's no longer needed
data_without_outliers = data_without_outliers.drop(columns=['z_score'])

# Save the dataset without outliers to a new CSV file
output_file = './data/real_data_preprocessed_sales_without_outliers.csv'
data_without_outliers.to_csv(output_file, index=False)

# Print summary information
print(f"Original dataset size: {len(data)}")
print(f"Dataset size without outliers: {len(data_without_outliers)}")
print(f"\nDataset without outliers saved to '{output_file}'.")

Original dataset size: 59105
Dataset size without outliers: 58293

Dataset without outliers saved to './data/real_data_preprocessed_sales_without_outliers.csv'.


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Load the preprocessed dataset
# file_path = './data/preprocessed_sales.csv'  # Update path if needed
file_path = './data/real_data_preprocessed_sales_without_outliers.csv'  # Update path if needed
data = pd.read_csv(file_path)

# Separate features (X) and target (y)
X = data.drop(columns=['sales'])  # Drop the target variable
y = data['sales']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the RandomForestRegressor
rf_model = RandomForestRegressor(random_state=42, 
                                 n_jobs=-1, 
                                 min_samples_leaf=1, 
                                 min_samples_split=5,
                                 n_estimators=300,
                                bootstrap=True,
                                max_depth=None)
# {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Feature importance
feature_importance = rf_model.feature_importances_
important_features = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance}).sort_values(by='Importance', ascending=False)

# Compare predicted values with actual values
exact_matches = np.sum(np.isclose(y_test, y_pred, atol=1e-5))  # Considering floating-point precision
total_test_rows = len(y_test)

# Output the evaluation metrics
print("Model Evaluation Metrics:")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-squared (R2): {r2:.2f}")

# print(f"\nTotal test rows: {total_test_rows}")
# print(f"Exact matches in predictions: {exact_matches}")

# Display feature importance
print("\nFeature Importance:")
print(important_features)

# Save the model to a file for later use
# import joblib
# joblib.dump(rf_model, './model/random_forest_sales_model.pkl')
print("\nTrained RandomForest model saved as 'random_forest_sales_model.pkl'.")

Model Evaluation Metrics:
Mean Absolute Error (MAE): 614.27
Mean Squared Error (MSE): 708252.12
Root Mean Squared Error (RMSE): 841.58
R-squared (R2): 0.89

Feature Importance:
               Feature  Importance
2  nb_customers_on_day    0.796861
0             store_ID    0.121787
3            promotion    0.057401
1          day_of_week    0.023950

Trained RandomForest model saved as 'random_forest_sales_model.pkl'.
