In [1]:
import sys
import os
import pandas as pd
sys.path.append(os.path.abspath('../scripts'))

from Classical_MLAs import SalesPredictor

  from .autonotebook import tqdm as notebook_tqdm


Data Preprocessing
---
1. feature_engineering()
2. handle_missing_values()
3. encode_categorical()
4. scale_numeric_features()
5. Inspect `X_train_scaled`, `X_val_scaled`, `y_train`, and `y_val`


In [2]:

# --- File locations (adjust these paths as necessary) ---
train_file = '../Data/train.csv'
submission_file = '../Data/submission.csv'  # path of the submission file
target_col = 'Sales'  # name of the target column

# --- Read the test set and the sample submission ---
df_test = pd.read_csv('../Data/test.csv')
df_sample_submission = pd.read_csv('../Data/sample_submission.csv')

# Left-join the sample submission's 'Sales' column onto the test rows by 'Id'
df_test_merge = pd.merge(
    df_test,
    df_sample_submission[['Id', 'Sales']],
    on='Id',
    how='left',
)


In [4]:
# Build the predictor from the training CSV path and the raw test frame,
# then run its random-forest step (implemented in Classical_MLAs).
sales_predictor = SalesPredictor(train_file, df_test, target_col)
rf_predictions, rf_model = sales_predictor.run_random_forest()

Loaded train data shape: (1017209, 9)
Train data types:
 Store             int64
DayOfWeek         int64
Date             object
Sales             int64
Customers         int64
Open              int64
Promo             int64
StateHoliday     object
SchoolHoliday     int64
dtype: object
Missing values in train data:
 Store            0
DayOfWeek        0
Date             0
Sales            0
Customers        0
Open             0
Promo            0
StateHoliday     0
SchoolHoliday    0
dtype: int64
Loaded test data shape: (41088, 8)


In [None]:
# Plot feature importance for the model returned by run_random_forest()
# (SHAP-based, judging by the method name; the implementation lives in Classical_MLAs).
sales_predictor.visualize_shap_feature_importance(rf_model)

In [None]:
# Plot the predictor's stored training targets against the random-forest predictions.
# NOTE(review): confirm rf_predictions covers the same rows as sales_predictor.y_train.
sales_predictor.visualize_predictions(sales_predictor.y_train, rf_predictions)

In [2]:

import pandas as pd  
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import xgboost as xgb
from sklearn.impute import SimpleImputer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Column -> dtype mapping handed to pd.read_csv when the training CSV is loaded.
dtype_dict = dict(
    Store='int64',
    DayOfWeek='int64',
    Date='object',
    Sales='int64',
    Customers='int64',
    Open='int64',
    Promo='int64',
    StateHoliday='str',  # read as text because the column holds mixed types
    SchoolHoliday='int64',
)

def load_and_merge_data(train_path, test_df, target_column):
    """Load the training CSV and split train/test data into features and target.

    No merging is performed here: the training file is read from disk, the
    test data is taken as an already-loaded DataFrame, and the target column
    is separated from (or removed from) each.

    Parameters
    ----------
    train_path : str or path-like
        Location of the training CSV. It is read with the module-level
        ``dtype_dict`` column types.
    test_df : pandas.DataFrame
        Already-loaded test data. It is not modified; a new frame is returned.
    target_column : str
        Name of the column to predict.

    Returns
    -------
    tuple
        ``(X, y, X_test)``: the training frame without the target column,
        the target column itself, and the test frame without the target
        column.

    Raises
    ------
    KeyError
        If ``target_column`` is not present in the training data.
    """
    train_df = pd.read_csv(train_path, dtype=dtype_dict, low_memory=False)

    # Diagnostics: shape, column types and per-column missing-value counts.
    print("Loaded train data shape:", train_df.shape)
    print("Train data types:\n", train_df.dtypes)
    print("Missing values in train data:\n", train_df.isnull().sum())

    # Separate features from the target.
    X = train_df.drop(columns=[target_column])
    y = train_df[target_column]

    # Remove the target from the test frame when it is present. This used to
    # hard-code 'Sales', which silently ignored any other target_column.
    X_test = test_df.drop(columns=[target_column], errors='ignore')
    print("Loaded test data shape:", X_test.shape)

    return X, y, X_test

def preprocess_data(X_train, X_test):
    """Impute, scale and one-hot encode the train and test feature frames.

    Numeric columns (int64/float64 in ``X_train``) are mean-imputed and
    standardised; object columns are imputed with the most frequent value and
    one-hot encoded, with categories unseen during fitting ignored. The
    preprocessor is fitted on ``X_train`` only and then applied to both frames.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; determines which columns are treated as numeric
        and which as categorical.
    X_test : pandas.DataFrame
        Test features. Columns present in ``X_train`` but absent here are
        filled with 0 on an internal copy; the caller's frame is left
        untouched.

    Returns
    -------
    tuple
        ``(X_train_processed, X_test_processed, feature_names)`` where
        ``feature_names`` comes from the fitted preprocessor's
        ``get_feature_names_out()``.
    """
    # Work on a copy so the caller's DataFrame is not mutated as a side effect.
    X_test = X_test.copy()

    # Add any training column that the test set lacks, in training-column
    # order so the result does not depend on set iteration order.
    missing_cols = [col for col in X_train.columns if col not in X_test.columns]
    for col in missing_cols:
        X_test[col] = 0  # placeholder default; the mean or median may be more appropriate

    # Identify numeric and categorical columns from the training frame.
    numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = X_train.select_dtypes(include=['object']).columns

    # Preprocessing for numeric data.
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())])

    # Preprocessing for categorical data.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    # Bundle preprocessing for numeric and categorical data.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_cols),
            ('cat', categorical_transformer, categorical_cols)])

    # Fit on the training data only, then apply the same transform to the test data.
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    return X_train_processed, X_test_processed, preprocessor.get_feature_names_out()
def run_random_forest(X_train, y_train, X_test, n_estimators=100, random_state=42):
    """Fit a random-forest regressor and predict on the test features.

    Parameters
    ----------
    X_train, y_train
        Training features and target passed to ``fit``.
    X_test
        Features passed to ``predict``.
    n_estimators : int, default 100
        Forwarded to ``RandomForestRegressor``.
    random_state : int, default 42
        Forwarded to ``RandomForestRegressor``.

    Returns
    -------
    tuple
        ``(rf_predictions, rf_model)``: predictions for ``X_test`` and the
        fitted model.
    """
    rf_model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    rf_model.fit(X_train, y_train)
    rf_predictions = rf_model.predict(X_test)
    return rf_predictions, rf_model

def run_gradient_boosting(X_train, y_train, X_test, n_estimators=100, random_state=42):
    """Fit a gradient-boosting regressor and predict on the test features.

    Parameters
    ----------
    X_train, y_train
        Training features and target passed to ``fit``.
    X_test
        Features passed to ``predict``.
    n_estimators : int, default 100
        Forwarded to ``GradientBoostingRegressor``.
    random_state : int, default 42
        Forwarded to ``GradientBoostingRegressor``.

    Returns
    -------
    tuple
        ``(gb_predictions, gb_model)``: predictions for ``X_test`` and the
        fitted model.
    """
    gb_model = GradientBoostingRegressor(n_estimators=n_estimators, random_state=random_state)
    gb_model.fit(X_train, y_train)
    gb_predictions = gb_model.predict(X_test)
    return gb_predictions, gb_model

def run_xgboost(X_train, y_train, X_test, n_estimators=100, random_state=42):
    """Fit an XGBoost regressor and predict on the test features.

    Parameters
    ----------
    X_train, y_train
        Training features and target passed to ``fit``.
    X_test
        Features passed to ``predict``.
    n_estimators : int, default 100
        Forwarded to ``xgb.XGBRegressor``.
    random_state : int, default 42
        Forwarded to ``xgb.XGBRegressor``.

    Returns
    -------
    tuple
        ``(xg_predictions, xg_model)``: predictions for ``X_test`` and the
        fitted model.
    """
    xg_model = xgb.XGBRegressor(n_estimators=n_estimators, random_state=random_state)
    xg_model.fit(X_train, y_train)
    xg_predictions = xg_model.predict(X_test)
    return xg_predictions, xg_model

def visualize_feature_importance(model, feature_names, top_n=None):
    """Draw a horizontal bar chart of a model's feature importances.

    Bars are ordered by descending importance (largest at position 0).

    Parameters
    ----------
    model
        Either an object exposing ``feature_importances_`` or one exposing
        ``get_score(importance_type=...)`` that returns a name -> score
        mapping.
    feature_names : sequence of str
        Label for each feature, indexed in the same order as the importances.
    top_n : int or None, default None
        When given, only the ``top_n`` most important features are drawn.
        ``None`` keeps every feature, matching the previous behaviour.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if top_n is not None and top_n < 0:
        raise ValueError("top_n must be non-negative or None")

    if hasattr(model, 'feature_importances_'):
        # Coerce to an array so list-like importances support the index-array lookup below.
        feature_importances = np.asarray(model.feature_importances_)
    else:
        # Fallback: look each feature up in the score mapping; absent names count as 0.
        scores = model.get_score(importance_type='weight')
        feature_importances = np.array([scores.get(name, 0) for name in feature_names])

    # Indices of features from most to least important, optionally truncated.
    indices = np.argsort(feature_importances)[::-1]
    if top_n is not None:
        indices = indices[:top_n]

    positions = range(len(indices))
    plt.figure(figsize=(10, 6))
    plt.title("Feature Importances")
    plt.barh(positions, feature_importances[indices], align="center")
    plt.yticks(positions, [feature_names[i] for i in indices])
    plt.xlabel("Relative Importance")
    plt.show()

def visualize_predictions(y_true, y_pred):
    """Scatter predicted values against actual values with a reference diagonal.

    Parameters
    ----------
    y_true
        Actual target values; must provide ``min()`` and ``max()``.
    y_pred
        Predicted values, plotted on the y-axis against ``y_true``.
    """
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=y_true, y=y_pred)

    # The dashed red line runs along y = x across the range of the actual values.
    diagonal = [y_true.min(), y_true.max()]
    plt.plot(diagonal, diagonal, 'r--')

    plt.title("Actual vs Predicted Sales")
    plt.xlabel("Actual Sales")
    plt.ylabel("Predicted Sales")
    plt.show()

def run_all_models(train_path, test_df, submission_file, target_col):
    """Train the three regressors in turn, plot importances, and write a submission.

    Parameters
    ----------
    train_path
        Path of the training CSV, forwarded to ``load_and_merge_data``.
    test_df : pandas.DataFrame
        Test data; its ``'Id'`` column is copied into the submission.
    submission_file
        Destination path for the submission CSV.
    target_col : str
        Name of the target column in the training data.

    Notes
    -----
    Only the random-forest predictions are written to ``submission_file``;
    the other models are fitted and plotted but their predictions are unused.
    """
    features_train, y_train, features_test = load_and_merge_data(train_path, test_df, target_col)
    features_train, features_test, feature_names = preprocess_data(features_train, features_test)

    # (display label, training function) pairs, executed in this order.
    runners = (
        ("Random Forest", run_random_forest),
        ("Gradient Boosting", run_gradient_boosting),
        ("XGBoost", run_xgboost),
    )

    predictions = {}
    for label, runner in runners:
        print(f"Running {label}...")
        predictions[label], fitted_model = runner(features_train, y_train, features_test)
        visualize_feature_importance(fitted_model, feature_names)

    # Save predictions to the submission file (random forest; swap the key to use another model).
    submission_df = pd.DataFrame({'Id': test_df['Id'], 'Sales': predictions["Random Forest"]})
    submission_df.to_csv(submission_file, index=False)


# --- File locations (adjust these paths as necessary) ---
train_file = '../Data/train.csv'
submission_file = '../Data/submission.csv'  # path of the submission file
target_col = 'Sales'  # Target column is 'Sales'

# --- Read the test set and the sample submission ---
df_test = pd.read_csv('../Data/test.csv')
df_sample_submission = pd.read_csv('../Data/sample_submission.csv')

# Merge test dataset with sample submission to align IDs
df_test_merge = pd.merge(
    df_test,
    df_sample_submission[['Id', 'Sales']],
    on='Id',
    how='left',
)

# Train every model and write the submission file.
run_all_models(train_file, df_test_merge, submission_file, target_col)


Loaded train data shape: (1017209, 9)
Train data types:
 Store             int64
DayOfWeek         int64
Date             object
Sales             int64
Customers         int64
Open              int64
Promo             int64
StateHoliday     object
SchoolHoliday     int64
dtype: object
Missing values in train data:
 Store            0
DayOfWeek        0
Date             0
Sales            0
Customers        0
Open             0
Promo            0
StateHoliday     0
SchoolHoliday    0
dtype: int64
Loaded test data shape: (41088, 8)
Running Random Forest...
