## Using Cross Validation

### 1. Import Libraries

In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

### 2. Comparing XGBoost, Random Forest, Linear Regression

In [39]:
# --- Load Preprocessed Data ---
# We are loading the enhanced preprocessed data.
print("Loading enhanced preprocessed training and test data...")
train_df = pd.read_csv('preprocessed_train_v2.csv')
test_df = pd.read_csv('preprocessed_test_v2.csv')

# Also load the original test identifiers to build the submission file.
original_test_identifiers = pd.read_csv('test_AbJTz2l.csv')[['Item_Identifier', 'Outlet_Identifier']]

# --- Define Features and Target ---
y_train_log = np.log1p(train_df['Item_Outlet_Sales'])
X_train = train_df.drop('Item_Outlet_Sales', axis=1)
X_test = test_df.copy()

# --- Define Models to Compare ---
# We will compare a few different types of regression models.
models = {}

# We will use a try-except block to gracefully handle the ImportError with XGBoost.
try:
    models['XGBoost'] = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=1000, learning_rate=0.05,
                                    max_depth=5, subsample=0.8, colsample_bytree=0.8, random_state=42)
except ImportError:
    print("XGBoost with sklearn wrapper is not available, skipping this model.")
    pass

models['RandomForest'] = RandomForestRegressor(n_estimators=1000, max_depth=5, min_samples_leaf=100,
                                     n_jobs=-1, random_state=42)
models['LinearRegression'] = LinearRegression()


# --- K-Fold Cross-Validation ---
# We will use 5-fold cross-validation to compare the models fairly.
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Dictionary to store the mean RMSE for each model.
rmse_scores = {}
trained_models = {}

print("\nStarting K-Fold Cross-Validation for each model...")

for name, model in models.items():
    print(f"\nTraining and evaluating {name}...")
    fold_scores = []
    
    # Iterate through each fold
    for train_index, val_index in kf.split(X_train):
        # Split the data into training and validation sets for this fold.
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_log_fold, y_val_log_fold = y_train_log.iloc[train_index], y_train_log.iloc[val_index]
        
        # Train the model on the current fold's training data.
        model.fit(X_train_fold, y_train_log_fold)
        
        # Make predictions on the validation data.
        val_preds_log = model.predict(X_val_fold)
        
        # Inverse transform the predictions to the original scale to calculate RMSE.
        val_preds = np.expm1(val_preds_log)
        y_val = np.expm1(y_val_log_fold)
        
        # Calculate the Root Mean Squared Error (RMSE) for this fold.
        rmse = np.sqrt(mean_squared_error(y_val, val_preds))
        fold_scores.append(rmse)
    
    # Store the mean RMSE and the final trained model.
    rmse_scores[name] = np.mean(fold_scores)
    trained_models[name] = model

    print(f"-> {name} completed. Mean RMSE: {rmse_scores[name]:.4f}")

# --- Select the Best Model and Make Predictions ---
# Find the model with the lowest (best) mean RMSE.
best_model_name = min(rmse_scores, key=rmse_scores.get)

print(f"\nBest model: {best_model_name} with a mean RMSE of {rmse_scores[best_model_name]:.4f}")

# Train the best model on the entire dataset one last time.
print("Training the best model on the full training data...")
best_model = trained_models[best_model_name]
best_model.fit(X_train, y_train_log)
test_predictions_log = best_model.predict(X_test)
    
test_predictions = np.expm1(test_predictions_log)
print("Predictions generated.")

# --- Create Submission File ---
print(f"\nCreating submission file 'submission_best_model.csv'...")
submission = pd.DataFrame({
    'Item_Identifier': original_test_identifiers['Item_Identifier'],
    'Outlet_Identifier': original_test_identifiers['Outlet_Identifier'],
    'Item_Outlet_Sales': test_predictions
})

# Save the submission file without the index.
submission.to_csv('submission_best_model.csv', index=False)
print("Submission file 'submission_best_model.csv' created successfully!")


Loading enhanced preprocessed training and test data...
XGBoost with sklearn wrapper is not available, skipping this model.

Starting K-Fold Cross-Validation for each model...

Training and evaluating RandomForest...
-> RandomForest completed. Mean RMSE: 1237.7706

Training and evaluating LinearRegression...
-> LinearRegression completed. Mean RMSE: 1121.5911

Best model: LinearRegression with a mean RMSE of 1121.5911
Training the best model on the full training data...
Predictions generated.

Creating submission file 'submission_best_model.csv'...
Submission file 'submission_best_model.csv' created successfully!


### 3. Improving Linear Regression Model

In [40]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error

# --- Load Preprocessed Data ---
# We are loading the enhanced preprocessed data.
print("Loading enhanced preprocessed training and test data...")
train_df = pd.read_csv('preprocessed_train_v2.csv')
test_df = pd.read_csv('preprocessed_test_v2.csv')

# Also load the original test identifiers to build the submission file.
original_test_identifiers = pd.read_csv('test_AbJTz2l.csv')[['Item_Identifier', 'Outlet_Identifier']]

# --- Define Features and Target ---
y_train_log = np.log1p(train_df['Item_Outlet_Sales'])
X_train = train_df.drop('Item_Outlet_Sales', axis=1)
X_test = test_df.copy()

# --- Define Models to Compare ---
# We will compare three different types of regularized linear models
# and tune their hyperparameters to find the best fit.
models_to_tune = {
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet()
}

# Define a common parameter grid for the regularization strength (alpha).
# We'll test a wide range of values to find the optimal one for each model.
param_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

# --- K-Fold Cross-Validation and Hyperparameter Tuning ---
print("\nStarting K-Fold Cross-Validation and Hyperparameter Tuning for each model...")

# Dictionary to store the best mean RMSE for each model.
best_rmse_scores = {}
best_estimators = {}

for name, model in models_to_tune.items():
    print(f"\nTuning {name} model...")
    
    # Set up GridSearchCV for the current model.
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=5, # Using 5-fold cross-validation
        verbose=1,
        n_jobs=-1
    )
    
    # Fit the grid search to the data to find the best alpha.
    grid_search.fit(X_train, y_train_log)
    
    # Get the best estimator and its cross-validation score.
    best_estimator = grid_search.best_estimator_
    best_score = np.sqrt(abs(grid_search.best_score_))
    
    # Store the best estimator and its corresponding RMSE.
    best_estimators[name] = best_estimator
    best_rmse_scores[name] = best_score
    
    print(f"-> {name} completed. Best Parameters: {grid_search.best_params_}")
    print(f"-> Best cross-validation RMSE: {best_score:.4f}")

# --- Select the Overall Best Model and Make Predictions ---
# Find the model with the lowest (best) mean RMSE from all tuned models.
best_model_name = min(best_rmse_scores, key=best_rmse_scores.get)
best_model = best_estimators[best_model_name]

print(f"\nOverall best model is {best_model_name} with a cross-validation RMSE of {best_rmse_scores[best_model_name]:.4f}")

# Train the overall best model on the entire dataset one last time.
print("Training the best model on the full training data...")
best_model.fit(X_train, y_train_log)

# Make predictions on the test data.
print("Making predictions on the test data...")
test_predictions_log = best_model.predict(X_test)

# Inverse transform the predictions back to the original sales scale.
test_predictions = np.expm1(test_predictions_log)
print("Predictions generated.")

# --- Create Submission File ---
print(f"\nCreating submission file 'submission_linear_tuned.csv'...")
submission = pd.DataFrame({
    'Item_Identifier': original_test_identifiers['Item_Identifier'],
    'Outlet_Identifier': original_test_identifiers['Outlet_Identifier'],
    'Item_Outlet_Sales': test_predictions
})

# Save the submission file without the index.
submission.to_csv('submission_linear_tuned.csv', index=False)
print("Submission file 'submission_linear_tuned.csv' created successfully!")

Loading enhanced preprocessed training and test data...

Starting K-Fold Cross-Validation and Hyperparameter Tuning for each model...

Tuning Ridge model...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
-> Ridge completed. Best Parameters: {'alpha': 0.1}
-> Best cross-validation RMSE: 0.5356

Tuning Lasso model...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
-> Lasso completed. Best Parameters: {'alpha': 0.0001}
-> Best cross-validation RMSE: 0.5356

Tuning ElasticNet model...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
-> ElasticNet completed. Best Parameters: {'alpha': 0.0001}
-> Best cross-validation RMSE: 0.5356

Overall best model is Lasso with a cross-validation RMSE of 0.5356
Training the best model on the full training data...
Making predictions on the test data...
Predictions generated.

Creating submission file 'submission_linear_tuned.csv'...
Submission file 'submission_linear_tuned.csv' created successfully!


### 4. Creating Preprocessed Train & Test (More Detailed + One hot + Normalised)

In [41]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import category_encoders as ce

# --- Load Original Raw Data ---
# We are starting from the original raw data files to apply a full range of preprocessing steps.
print("Loading original raw training and test data...")
train_df = pd.read_csv('train_cleaned.csv')
test_df = pd.read_csv('test_AbJTz2l.csv')

# --- Data Cleaning and Feature Engineering (Common Steps) ---
# These are necessary cleaning steps applied to both datasets.

# Fill missing Item_Weight with the mean of the corresponding Item_Identifier
train_df['Item_Weight'].fillna(train_df.groupby('Item_Identifier')['Item_Weight'].transform('mean'), inplace=True)
test_df['Item_Weight'].fillna(test_df.groupby('Item_Identifier')['Item_Weight'].transform('mean'), inplace=True)

# Replace 'Unknown' Outlet_Size with mode of corresponding Outlet_Type
outlet_size_mode = train_df.groupby('Outlet_Type')['Outlet_Size'].apply(lambda x: x.mode()[0])
for outlet_type, outlet_size in outlet_size_mode.items():
    train_df.loc[(train_df['Outlet_Type'] == outlet_type) & (train_df['Outlet_Size'] == 'Unknown'), 'Outlet_Size'] = outlet_size
    test_df.loc[(test_df['Outlet_Type'] == outlet_type) & (test_df['Outlet_Size'].isnull()), 'Outlet_Size'] = outlet_size

# Fix inconsistent Item_Fat_Content
# Correct the test file's Item_Fat_Content to match the train file's cleaned values.
train_df['Item_Fat_Content'] = train_df['Item_Fat_Content'].replace(['low fat', 'reg'], ['Low Fat', 'Regular'])
test_df['Item_Fat_Content'] = test_df['Item_Fat_Content'].replace(['low fat', 'reg', 'LF'], ['Low Fat', 'Regular', 'Low Fat'])

# Create a new feature for Outlet_Years
train_df['Outlet_Years'] = 2013 - train_df['Outlet_Establishment_Year']
test_df['Outlet_Years'] = 2013 - test_df['Outlet_Establishment_Year']

# Create a new feature for Item_MRP Category
train_df['Item_MRP_Category'] = pd.cut(train_df['Item_MRP'], bins=4, labels=[1, 2, 3, 4])
test_df['Item_MRP_Category'] = pd.cut(test_df['Item_MRP'], bins=4, labels=[1, 2, 3, 4])

# Create a combined Item_Type feature for simplicity
train_df['Item_Type_Combined'] = train_df['Item_Identifier'].apply(lambda x: x[0:2])
test_df['Item_Type_Combined'] = test_df['Item_Identifier'].apply(lambda x: x[0:2])
train_df['Item_Type_Combined'] = train_df['Item_Type_Combined'].map({'FD':'Food', 'NC':'Non-Consumable', 'DR':'Drinks'})
test_df['Item_Type_Combined'] = test_df['Item_Type_Combined'].map({'FD':'Food', 'NC':'Non-Consumable', 'DR':'Drinks'})

# --- Advanced Feature Engineering ---
# These are new features that capture interactions and complex relationships.

# Create an interaction feature between Item_Type and Outlet_Type
train_df['Item_Outlet_Type'] = train_df['Item_Type'].astype(str) + '_' + train_df['Outlet_Type'].astype(str)
test_df['Item_Outlet_Type'] = test_df['Item_Type'].astype(str) + '_' + test_df['Outlet_Type'].astype(str)

# Create a new feature for Item_Visibility category
# We'll handle the zero visibility values by replacing them with the mean before binning.
train_df['Item_Visibility'].replace(0, train_df['Item_Visibility'].mean(), inplace=True)
test_df['Item_Visibility'].replace(0, test_df['Item_Visibility'].mean(), inplace=True)
train_df['Item_Visibility_Category'] = pd.qcut(train_df['Item_Visibility'], 4, labels=['Low', 'Medium', 'High', 'Very_High'])
test_df['Item_Visibility_Category'] = pd.qcut(test_df['Item_Visibility'], 4, labels=['Low', 'Medium', 'High', 'Very_High'])

# --- Apply Multiple Encoding Techniques ---

# One-Hot Encoding for nominal features, applied to each dataset separately.
train_df_encoded = pd.get_dummies(train_df, columns=['Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'Item_Fat_Content', 'Item_Outlet_Type', 'Item_Visibility_Category'], dtype=int)
test_df_encoded = pd.get_dummies(test_df, columns=['Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'Item_Fat_Content', 'Item_Outlet_Type', 'Item_Visibility_Category'], dtype=int)

# Target Encoding for a high-cardinality feature. We fit on the training data and transform both.
encoder = ce.TargetEncoder(cols=['Item_Type'])
train_df_encoded['Item_Type_Encoded'] = encoder.fit_transform(train_df_encoded['Item_Type'], train_df_encoded['Item_Outlet_Sales'])
test_df_encoded['Item_Type_Encoded'] = encoder.transform(test_df_encoded['Item_Type'])
# Fill any missing values created by the target encoder on the test set
test_df_encoded['Item_Type_Encoded'].fillna(train_df_encoded['Item_Type_Encoded'].mean(), inplace=True)

# Label Encoding for ordinal-like features
le = LabelEncoder()
train_df_encoded['Item_MRP_Category_LE'] = le.fit_transform(train_df_encoded['Item_MRP_Category'])
test_df_encoded['Item_MRP_Category_LE'] = le.transform(test_df_encoded['Item_MRP_Category'])
train_df_encoded['Item_Type_Combined_LE'] = le.fit_transform(train_df_encoded['Item_Type_Combined'])
test_df_encoded['Item_Type_Combined_LE'] = le.transform(test_df_encoded['Item_Type_Combined'])

# --- Normalize Numerical Features ---
# We normalize key numerical features to a standard scale (0-1).
# This is crucial for models that are sensitive to feature magnitude.
print("\nNormalizing numerical features...")
scaler = MinMaxScaler()
numerical_cols = ['Item_Weight', 'Item_Visibility', 'Item_MRP']

# Fit the scaler on the training data and transform both datasets.
train_df_encoded[numerical_cols] = scaler.fit_transform(train_df_encoded[numerical_cols])
test_df_encoded[numerical_cols] = scaler.transform(test_df_encoded[numerical_cols])


# --- Select Final Features and Save Files ---
# Define a list of final features to ensure consistency between train and test sets.
base_features = [
    'Item_Weight', 'Item_Visibility', 'Outlet_Years', 'Item_MRP',
    'Item_Type_Encoded', 'Item_MRP_Category_LE', 'Item_Type_Combined_LE'
]
# Get all one-hot encoded columns from the training set.
ohe_cols_train = [col for col in train_df_encoded.columns if any(x in col for x in ['Outlet_Size_', 'Outlet_Location_Type_', 'Outlet_Type_', 'Item_Fat_Content_', 'Item_Outlet_Type_', 'Item_Visibility_Category_'])]

final_features = base_features + ohe_cols_train

# Align columns to ensure both datasets have the same features.
X_train_final = train_df_encoded[final_features]
X_test_final = test_df_encoded.reindex(columns=final_features, fill_value=0)

y_train_final = train_df_encoded['Item_Outlet_Sales']

# Save the new preprocessed files
print("\nSaving new preprocessed datasets...")
X_train_final.to_csv('preprocessed_train_advanced.csv', index=False)
y_train_final.to_csv('preprocessed_train_target.csv', index=False, header=True)
X_test_final.to_csv('preprocessed_test_advanced.csv', index=False)

print("\nAdvanced preprocessing complete! Files 'preprocessed_train_advanced.csv', 'preprocessed_train_target.csv', and 'preprocessed_test_advanced.csv' have been created.")

Loading original raw training and test data...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Item_Weight'].fillna(train_df.groupby('Item_Identifier')['Item_Weight'].transform('mean'), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['Item_Weight'].fillna(test_df.groupby('Item_Identifier')['Item_Weight'].transform('mean'), inplace=True)
The 


Normalizing numerical features...

Saving new preprocessed datasets...

Advanced preprocessing complete! Files 'preprocessed_train_advanced.csv', 'preprocessed_train_target.csv', and 'preprocessed_test_advanced.csv' have been created.


### 5. Using Advanced Preprocessed Data for Model Comparison

In [42]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.base import BaseEstimator, RegressorMixin

# --- Load the Advanced Preprocessed Data ---
# This code loads the training features, training target, and test features
# that were generated by your previous preprocessing script, now with normalized columns.
print("Loading advanced preprocessed training and test data...")
try:
    X_train = pd.read_csv('preprocessed_train_advanced.csv')
    y_train = pd.read_csv('preprocessed_train_target.csv').iloc[:, 0]
    X_test = pd.read_csv('preprocessed_test_advanced.csv')
except FileNotFoundError as e:
    print(f"Error: Required file not found. Please ensure that 'preprocessed_train_advanced.csv', 'preprocessed_train_target.csv', and 'preprocessed_test_advanced.csv' have been generated. Details: {e}")
    exit()

# --- Check for and Handle NaN Values ---
# It's crucial to ensure there are no missing values before training models.
# Let's check the number of NaNs in each DataFrame.
print("\nChecking for missing values...")
print("NaNs in X_train:")
print(X_train.isnull().sum())
print("\nNaNs in X_test:")
print(X_test.isnull().sum())

# Fill any remaining NaNs with a value (e.g., 0) to prevent model errors.
# This is a critical step for models like Linear Regression and Gradient Boosting.
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)
print("\nAll missing values have been filled with 0.")

# --- Split the Training Data ---
# It's good practice to split the training data into a smaller training set
# and a validation set to evaluate model performance on unseen data.
print("Splitting training data into training and validation sets...")
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# --- Initialize and Train the Models ---
# We use a dictionary to hold our models for easy iteration and comparison.
print("\nInitializing and training models...")
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest Regressor': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    'Gradient Boosting Regressor': GradientBoostingRegressor(n_estimators=100, random_state=42)
}

# --- Compare Model Performance ---
# Loop through each model, train it, make predictions, and evaluate its performance.
print("\n--- Model Performance Comparison ---")
for name, model in models.items():
    print(f"\nTraining {name}...")
    try:
        model.fit(X_train_split, y_train_split)
        y_pred = model.predict(X_val_split)
        
        # Evaluate the model's performance using Root Mean Squared Error (RMSE) and R-squared.
        rmse = np.sqrt(mean_squared_error(y_val_split, y_pred))
        r2 = r2_score(y_val_split, y_pred)
        
        print(f"{name} RMSE: {rmse:.2f}")
        print(f"{name} R-squared: {r2:.2f}")
    except Exception as e:
        print(f"An error occurred during training or evaluation of {name}. Details: {e}")

# --- Add Blender Class and Grid Search for Blending ---
# Define a custom scikit-learn compatible estimator for model blending.
# This class needs to be defined within the script to be used with GridSearchCV.
class Blender(BaseEstimator, RegressorMixin):
    """
    A custom scikit-learn compatible regressor for blending predictions from two base models.
    """
    def __init__(self, weight=0.5):
        self.weight = weight
        self.reg1 = LinearRegression()
        # Using the same random state for consistency.
        self.reg2 = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
        
    def fit(self, X, y):
        """Fits the two base regressors on the training data."""
        self.reg1.fit(X, y)
        self.reg2.fit(X, y)
        return self
        
    def predict(self, X):
        """
        Combines predictions from the two base models using a weighted average.
        
        Args:
            X (pd.DataFrame): The feature data to make predictions on.

        Returns:
            np.ndarray: The blended predictions.
        """
        pred1 = self.reg1.predict(X)
        pred2 = self.reg2.predict(X)
        return self.weight * pred1 + (1 - self.weight) * pred2
    
    def get_params(self, deep=True):
        """Required method for scikit-learn compatibility."""
        return {'weight': self.weight}

# Initialize the blending model and find the optimal weight using a grid search.
blender = Blender()
blending_param_grid = {'weight': np.arange(0.0, 1.01, 0.1)}

print("\nFinding optimal blending weight using GridSearchCV...")
blending_grid = GridSearchCV(
    estimator=blender,
    param_grid=blending_param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=-1
)

try:
    # Fit the grid search on the training data.
    blending_grid.fit(X_train_split, y_train_split)
    best_weight = blending_grid.best_params_['weight']
    print(f"Optimal blending weight found: {best_weight:.2f}")
except Exception as e:
    print(f"An error occurred during GridSearchCV. Please check your data and class definition. Details: {e}")
    best_weight = 0.5 # Use a default weight if the grid search fails.
    print(f"Using default blending weight of {best_weight:.2f}.")

# --- Final Training and Prediction ---
# We will use the best-performing model (or the newly optimized blender)
# to make predictions on the final test set.
print("\n--- Final Prediction on Test Data ---")

# For demonstration, we'll use the best model found by the grid search.
# This is a good practice as it has been validated on your training data.
# The try/except block handles the AttributeError if GridSearchCV fails.
try:
    best_model = blending_grid.best_estimator_
except AttributeError:
    print("GridSearchCV failed to find a best estimator. Defaulting to a Gradient Boosting Regressor.")
    best_model = GradientBoostingRegressor(n_estimators=100, random_state=42)

print(f"Training the best model on the full training dataset...")
best_model.fit(X_train, y_train)

# Make the final predictions
test_predictions = best_model.predict(X_test)

# --- Post-processing and Final Output ---
# This section ensures the final output is in the correct format for submission.
print("\nCorrecting final predictions for submission format...")

# First, clip negative values to 0, as sales cannot be negative.
test_predictions[test_predictions < 0] = 0

# Next, load the original test file to get the identifiers.
test_original_df = pd.read_csv('test_AbJTz2l.csv')

# Create the final submission DataFrame.
submission_df = pd.DataFrame({
    'Item_Identifier': test_original_df['Item_Identifier'],
    'Outlet_Identifier': test_original_df['Outlet_Identifier'],
    'Item_Outlet_Sales': test_predictions
})

# Save the predictions to a CSV file.
submission_df.to_csv('final_predictions.csv', index=False)
print("Final predictions saved to 'final_predictions.csv' in the correct format!")

Loading advanced preprocessed training and test data...

Checking for missing values...
NaNs in X_train:
Item_Weight                                         4
Item_Visibility                                     0
Outlet_Years                                        0
Item_MRP                                            0
Item_Type_Encoded                                   0
                                                   ..
Item_Outlet_Type_Starchy Foods_Supermarket Type3    0
Item_Visibility_Category_Low                        0
Item_Visibility_Category_Medium                     0
Item_Visibility_Category_High                       0
Item_Visibility_Category_Very_High                  0
Length: 88, dtype: int64

NaNs in X_test:
Item_Weight                                         20
Item_Visibility                                      0
Outlet_Years                                         0
Item_MRP                                             0
Item_Type_Encoded                      

### 6. Creating Better Preprocessed Train & Test (More Detailed + One hot + Normalised)

In [46]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# --- Load Original Raw Data ---
# We are starting from the original raw data files to apply a full range of preprocessing steps.
print("Loading original raw training and test data...")
train_df = pd.read_csv('train_cleaned.csv')
test_df = pd.read_csv('test_AbJTz2l.csv')

# --- Data Cleaning and Feature Engineering (Common Steps) ---
# These are necessary cleaning steps applied to both datasets.

# Fill missing Item_Weight with the mean of the corresponding Item_Identifier
train_df['Item_Weight'].fillna(train_df.groupby('Item_Identifier')['Item_Weight'].transform('mean'), inplace=True)
test_df['Item_Weight'].fillna(test_df.groupby('Item_Identifier')['Item_Weight'].transform('mean'), inplace=True)

# Drop Outlet_Establishment_Year as it is not needed.
train_df.drop('Outlet_Establishment_Year', axis=1, inplace=True)
test_df.drop('Outlet_Establishment_Year', axis=1, inplace=True)

# Fix inconsistent Item_Fat_Content values
train_df['Item_Fat_Content'] = train_df['Item_Fat_Content'].replace(['low fat', 'reg'], ['Low Fat', 'Regular'])
test_df['Item_Fat_Content'] = test_df['Item_Fat_Content'].replace(['low fat', 'reg', 'LF'], ['Low Fat', 'Regular', 'Low Fat'])

# --- Normalize and Correct Skewness for Numerical Features ---
print("\nLog-transforming 'Item_Visibility' to correct for skewness...")
# We use np.log1p to handle any potential zero values gracefully.
train_df['Item_Visibility'] = np.log1p(train_df['Item_Visibility'])
test_df['Item_Visibility'] = np.log1p(test_df['Item_Visibility'])

# Apply Min-Max scaling to the remaining numerical features: Item_Weight and Item_MRP.
print("Applying Min-Max scaling to 'Item_Weight' and 'Item_MRP'...")
scaler = MinMaxScaler()
numerical_cols = ['Item_Weight', 'Item_MRP']

# Fit the scaler on the training data and transform both datasets.
train_df[numerical_cols] = scaler.fit_transform(train_df[numerical_cols])
test_df[numerical_cols] = scaler.transform(test_df[numerical_cols])

# --- Apply One-Hot Encoding ---
# One-Hot Encoding for all nominal features.
print("Applying one-hot encoding to categorical features...")
one_hot_cols = ['Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'Item_Fat_Content', 'Item_Type']
train_df_encoded = pd.get_dummies(train_df, columns=one_hot_cols, dtype=int)
test_df_encoded = pd.get_dummies(test_df, columns=one_hot_cols, dtype=int)

# Align columns to ensure both datasets have the same features after one-hot encoding.
X_train_final, X_test_final = train_df_encoded.align(test_df_encoded, join='left', axis=1, fill_value=0)

# --- Select Final Features and Save Files ---
# Define the target variable for the training set.
y_train_final = X_train_final['Item_Outlet_Sales']

# Drop the columns from the final feature sets that are not needed for the model.
columns_to_drop = ['Item_Outlet_Sales', 'Item_Identifier', 'Outlet_Identifier']
X_train_final.drop(columns_to_drop, axis=1, errors='ignore', inplace=True)
X_test_final.drop(columns_to_drop, axis=1, errors='ignore', inplace=True)

# Save the new preprocessed files
print("\nSaving new preprocessed datasets...")
X_train_final.to_csv('preprocessed_train_simplified.csv', index=False)
y_train_final.to_csv('preprocessed_train_target_simplified.csv', index=False, header=True)
X_test_final.to_csv('preprocessed_test_simplified.csv', index=False)

print("\nSimplified preprocessing complete! Files 'preprocessed_train_simplified.csv', 'preprocessed_train_target_simplified.csv', and 'preprocessed_test_simplified.csv' have been created.")

Loading original raw training and test data...

Log-transforming 'Item_Visibility' to correct for skewness...
Applying Min-Max scaling to 'Item_Weight' and 'Item_MRP'...
Applying one-hot encoding to categorical features...

Saving new preprocessed datasets...

Simplified preprocessing complete! Files 'preprocessed_train_simplified.csv', 'preprocessed_train_target_simplified.csv', and 'preprocessed_test_simplified.csv' have been created.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Item_Weight'].fillna(train_df.groupby('Item_Identifier')['Item_Weight'].transform('mean'), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['Item_Weight'].fillna(test_df.groupby('Item_Identifier')['Item_Weight'].transform('mean'), inplace=True)


### 7. Model Fitting (Simplified)

#### 7.1. Comparing Linear Regression, Gradient Boost & Random Forest

In [47]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.base import BaseEstimator, RegressorMixin

# --- Load the Advanced Preprocessed Data ---
# This code loads the training features, training target, and test features
# that were generated by your previous preprocessing script, now with normalized columns.
print("Loading advanced preprocessed training and test data...")
try:
    X_train = pd.read_csv('preprocessed_train_simplified.csv')
    y_train = pd.read_csv('preprocessed_train_target_simplified.csv').iloc[:, 0]
    X_test = pd.read_csv('preprocessed_test_simplified.csv')
except FileNotFoundError as e:
    print(f"Error: Required file not found. Please ensure that 'preprocessed_train_advanced.csv', 'preprocessed_train_target.csv', and 'preprocessed_test_advanced.csv' have been generated. Details: {e}")
    exit()

# --- Check for and Handle NaN Values ---
# It's crucial to ensure there are no missing values before training models.
# Let's check the number of NaNs in each DataFrame.
print("\nChecking for missing values...")
print("NaNs in X_train:")
print(X_train.isnull().sum())
print("\nNaNs in X_test:")
print(X_test.isnull().sum())

# Fill any remaining NaNs with a value (e.g., 0) to prevent model errors.
# This is a critical step for models like Linear Regression and Gradient Boosting.
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)
print("\nAll missing values have been filled with 0.")

# --- Split the Training Data ---
# It's good practice to split the training data into a smaller training set
# and a validation set to evaluate model performance on unseen data.
print("Splitting training data into training and validation sets...")
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# --- Initialize and Train the Models ---
# We use a dictionary to hold our models for easy iteration and comparison.
print("\nInitializing and training models...")
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest Regressor': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    'Gradient Boosting Regressor': GradientBoostingRegressor(n_estimators=100, random_state=42)
}

# --- Compare Model Performance ---
# Loop through each model, train it, make predictions, and evaluate its performance.
print("\n--- Model Performance Comparison ---")
for name, model in models.items():
    print(f"\nTraining {name}...")
    try:
        model.fit(X_train_split, y_train_split)
        y_pred = model.predict(X_val_split)
        
        # Evaluate the model's performance using Root Mean Squared Error (RMSE) and R-squared.
        rmse = np.sqrt(mean_squared_error(y_val_split, y_pred))
        r2 = r2_score(y_val_split, y_pred)
        
        print(f"{name} RMSE: {rmse:.2f}")
        print(f"{name} R-squared: {r2:.2f}")
    except Exception as e:
        print(f"An error occurred during training or evaluation of {name}. Details: {e}")

# --- Add Blender Class and Grid Search for Blending ---
# Define a custom scikit-learn compatible estimator for model blending.
# This class needs to be defined within the script to be used with GridSearchCV.
class Blender(BaseEstimator, RegressorMixin):
    """
    A custom scikit-learn compatible regressor for blending predictions from two base models.
    """
    def __init__(self, weight=0.5):
        self.weight = weight
        self.reg1 = LinearRegression()
        # Using the same random state for consistency.
        self.reg2 = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
        
    def fit(self, X, y):
        """Fits the two base regressors on the training data."""
        self.reg1.fit(X, y)
        self.reg2.fit(X, y)
        return self
        
    def predict(self, X):
        """
        Combines predictions from the two base models using a weighted average.
        
        Args:
            X (pd.DataFrame): The feature data to make predictions on.

        Returns:
            np.ndarray: The blended predictions.
        """
        pred1 = self.reg1.predict(X)
        pred2 = self.reg2.predict(X)
        return self.weight * pred1 + (1 - self.weight) * pred2
    
    def get_params(self, deep=True):
        """Required method for scikit-learn compatibility."""
        return {'weight': self.weight}

# Initialize the blending model and find the optimal weight using a grid search.
blender = Blender()
blending_param_grid = {'weight': np.arange(0.0, 1.01, 0.1)}

print("\nFinding optimal blending weight using GridSearchCV...")
blending_grid = GridSearchCV(
    estimator=blender,
    param_grid=blending_param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=-1
)

try:
    # Fit the grid search on the training data.
    blending_grid.fit(X_train_split, y_train_split)
    best_weight = blending_grid.best_params_['weight']
    print(f"Optimal blending weight found: {best_weight:.2f}")
except Exception as e:
    print(f"An error occurred during GridSearchCV. Please check your data and class definition. Details: {e}")
    best_weight = 0.5 # Use a default weight if the grid search fails.
    print(f"Using default blending weight of {best_weight:.2f}.")

# --- Final Training and Prediction ---
# We will use the best-performing model (or the newly optimized blender)
# to make predictions on the final test set.
print("\n--- Final Prediction on Test Data ---")

# For demonstration, we'll use the best model found by the grid search.
# This is a good practice as it has been validated on your training data.
# The try/except block handles the AttributeError if GridSearchCV fails.
try:
    best_model = blending_grid.best_estimator_
except AttributeError:
    print("GridSearchCV failed to find a best estimator. Defaulting to a Gradient Boosting Regressor.")
    best_model = GradientBoostingRegressor(n_estimators=100, random_state=42)

print(f"Training the best model on the full training dataset...")
best_model.fit(X_train, y_train)

# Make the final predictions
test_predictions = best_model.predict(X_test)

# --- Post-processing and Final Output ---
# This section ensures the final output is in the correct format for submission.
print("\nCorrecting final predictions for submission format...")

# First, clip negative values to 0, as sales cannot be negative.
test_predictions[test_predictions < 0] = 0

# Next, load the original test file to get the identifiers.
test_original_df = pd.read_csv('test_AbJTz2l.csv')

# Create the final submission DataFrame.
submission_df = pd.DataFrame({
    'Item_Identifier': test_original_df['Item_Identifier'],
    'Outlet_Identifier': test_original_df['Outlet_Identifier'],
    'Item_Outlet_Sales': test_predictions
})

# Save the predictions to a CSV file.
submission_df.to_csv('final_predictions_simplified.csv', index=False)
print("Final predictions saved to 'final_predictions_simplified.csv' in the correct format!")

Loading advanced preprocessed training and test data...

Checking for missing values...
NaNs in X_train:
Item_Weight                        4
Item_Visibility                    0
Item_MRP                           0
Outlet_Size_High                   0
Outlet_Size_Medium                 0
Outlet_Size_Small                  0
Outlet_Size_Unknown                0
Outlet_Location_Type_Tier 1        0
Outlet_Location_Type_Tier 2        0
Outlet_Location_Type_Tier 3        0
Outlet_Type_Grocery Store          0
Outlet_Type_Supermarket Type1      0
Outlet_Type_Supermarket Type2      0
Outlet_Type_Supermarket Type3      0
Item_Fat_Content_Low Fat           0
Item_Fat_Content_Regular           0
Item_Type_Baking Goods             0
Item_Type_Breads                   0
Item_Type_Breakfast                0
Item_Type_Canned                   0
Item_Type_Dairy                    0
Item_Type_Frozen Foods             0
Item_Type_Fruits and Vegetables    0
Item_Type_Hard Drinks              0
Item_Ty

### 8. Final Hyperparameter Tuning

#### 8.1. Random Forest

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# --- Load the Preprocessed Data ---
# Loading the training features, training target, and test features from the
# files we generated in previous steps.
print("Loading preprocessed training and test data from our shared files...")
try:
    # X_train contains the features
    X_train = pd.read_csv('preprocessed_train_simplified.csv')
    # y_train contains the target variable 'Item_Outlet_Sales'
    y_train = pd.read_csv('preprocessed_train_target_simplified.csv').iloc[:, 0]
    # X_test contains the features for the test set
    X_test = pd.read_csv('preprocessed_test_simplified.csv')
    # We also need the original test file to get the identifiers for the submission
    original_test_identifiers = pd.read_csv('test_AbJTz2l.csv')[['Item_Identifier', 'Outlet_Identifier']]
except FileNotFoundError as e:
    print(f"Error: Required file not found. Please ensure that 'preprocessed_train_simplified.csv', 'preprocessed_train_target_simplified.csv', 'preprocessed_test_simplified.csv', and 'test_AbJTz2l.csv' are in the directory. Details: {e}")
    exit()

# --- Log-Transform the Target Variable ---
# It is a common practice to log-transform skewed target variables to
# improve the performance of regression models. We add 1 before taking the log
# to handle any zero values.
print("\nLog-transforming the target variable 'Item_Outlet_Sales'...")
y_train_log = np.log1p(y_train)

# --- Hyperparameter Tuning with GridSearchCV ---
print("\nStarting hyperparameter tuning using GridSearchCV with RandomForestRegressor...")

# Initialize the RandomForest Regressor model.
# A random forest is an ensemble of decision trees.
rf_reg = RandomForestRegressor(random_state=42, n_jobs=-1)

# Define a parameter grid for the RandomForestRegressor.
# We will tune key parameters that control model complexity and prevent overfitting.
param_grid = {
    'n_estimators': [100, 200, 500],  # Number of trees in the forest
    'max_depth': [10, 20, None],      # Maximum depth of the trees
    'min_samples_leaf': [1, 2, 4],    # Minimum number of samples required to be at a leaf node
}

# Set up GridSearchCV.
# 'scoring'='neg_mean_squared_error' is used to find the model with the lowest MSE.
# 'cv=3' means we will perform 3-fold cross-validation.
# 'n_jobs=-1' uses all available CPU cores to speed up the process.
grid_search = GridSearchCV(
    estimator=rf_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=2,
    n_jobs=-1
)

# Fit the grid search to the data. This will run the cross-validation
# for every combination of parameters in the grid.
grid_search.fit(X_train, y_train_log)

# Print the best parameters and the best score found.
print("\nBest parameters found: ", grid_search.best_params_)
print("Best cross-validation score (negative RMSE): {:.2f}".format(grid_search.best_score_))

# --- Make Predictions with the Best Model ---
print("\nMaking predictions on the test data with the best model...")

# The best model is stored in the `best_estimator_` attribute of the grid search object.
best_rf_model = grid_search.best_estimator_
test_predictions_log = best_rf_model.predict(X_test)

# Inverse transform the predictions from the log scale back to the original scale.
# np.expm1 is the inverse function of np.log1p.
test_predictions = np.expm1(test_predictions_log)
print("Predictions generated.")

# --- Create Submission File ---
# The submission file requires the original item and outlet identifiers.
print("\nCreating new submission file 'submission_tuned_final.csv'...")
submission = pd.DataFrame({
    'Item_Identifier': original_test_identifiers['Item_Identifier'],
    'Outlet_Identifier': original_test_identifiers['Outlet_Identifier'],
    'Item_Outlet_Sales': test_predictions
})

# Save the submission file to a CSV without the index column.
submission.to_csv('submission_tuned_final.csv', index=False)
print("Submission file 'submission_tuned_final.csv' created successfully!")

Loading preprocessed training and test data from our shared files...

Log-transforming the target variable 'Item_Outlet_Sales'...

Starting hyperparameter tuning using GridSearchCV with RandomForestRegressor...
Fitting 3 folds for each of 27 candidates, totalling 81 fits

Best parameters found:  {'max_depth': 10, 'min_samples_leaf': 4, 'n_estimators': 500}
Best cross-validation score (negative RMSE): -0.28

Making predictions on the test data with the best model...
Predictions generated.

Creating new submission file 'submission_tuned_final.csv'...
Submission file 'submission_tuned_final.csv' created successfully!


#### 8.2. Gradient Boost

In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# --- Load the Advanced Preprocessed Data ---
# Loading the preprocessed training and test data. This code assumes
# the files 'preprocessed_train_simplified.csv', 'preprocessed_train_target_simplified.csv',
# 'preprocessed_test_simplified.csv', and 'test_AbJTz2l.csv' exist.
print("Loading preprocessed training and test data...")
try:
    # X_train contains the features for training
    X_train = pd.read_csv('preprocessed_train_simplified.csv')
    # y_train contains the target variable 'Item_Outlet_Sales'
    y_train = pd.read_csv('preprocessed_train_target_simplified.csv').iloc[:, 0]
    # X_test contains the features for the test set
    X_test = pd.read_csv('preprocessed_test_simplified.csv')
    # We also need the original test file to get the identifiers for the submission
    original_test_identifiers = pd.read_csv('test_AbJTz2l.csv')[['Item_Identifier', 'Outlet_Identifier']]
except FileNotFoundError as e:
    print(f"Error: Required file not found. Please ensure that 'preprocessed_train_simplified.csv', 'preprocessed_train_target_simplified.csv', 'preprocessed_test_simplified.csv', and 'test_AbJTz2l.csv' are in the directory. Details: {e}")
    exit()

# --- Handle NaN Values (just in case) ---
# Although preprocessing should handle this, it's a good practice to double-check.
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)
print("Missing values in dataframes filled with 0.")

# --- Log-Transform the Target Variable ---
# Log-transforming the target variable to handle skewness, which can lead to
# better model performance. We add 1 to the sales to handle potential zero values.
print("\nLog-transforming the target variable 'Item_Outlet_Sales'...")
y_train_log = np.log1p(y_train)

# --- Hyperparameter Tuning with GridSearchCV ---
print("\nStarting hyperparameter tuning with GridSearchCV for GradientBoostingRegressor...")

# Initialize the GradientBoostingRegressor model.
gb_reg = GradientBoostingRegressor(random_state=42)

# Define a parameter grid to search.
# We are tuning a few key parameters:
# - n_estimators: The number of boosting stages to perform.
# - max_depth: The maximum depth of the individual regression estimators.
# - learning_rate: Controls the contribution of each tree.
# - subsample: The fraction of samples to be used for fitting the individual base learners.
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.05, 0.1, 0.2],
    'subsample': [0.7, 0.8, 0.9]
}

# Set up GridSearchCV.
# 'scoring'='neg_mean_squared_error' is used to find the model with the lowest MSE.
# 'cv=3' means we will perform 3-fold cross-validation.
# We set n_jobs=1 to avoid the multiprocessing error from the previous conversation.
grid_search = GridSearchCV(
    estimator=gb_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=2,
    n_jobs=1
)

# Fit the grid search to the data. This will train a model for every combination
# of parameters in the grid and find the best one.
grid_search.fit(X_train, y_train_log)

# Print the best parameters and the best score found.
print("\nBest parameters found: ", grid_search.best_params_)
# The score is negative, so we convert it back to a positive Root Mean Squared Error (RMSE)
best_score_rmse = np.sqrt(-grid_search.best_score_)
print(f"Best cross-validation score (RMSE): {best_score_rmse:.2f}")

# --- Make Predictions with the Best Model ---
print("\nMaking predictions on the test data with the best model...")

# The best model is stored in the `best_estimator_` attribute of the grid search object.
best_gb_model = grid_search.best_estimator_
test_predictions_log = best_gb_model.predict(X_test)

# Inverse transform the predictions from the log scale back to the original sales scale.
test_predictions = np.expm1(test_predictions_log)
print("Predictions generated.")

# --- Post-processing and Final Output ---
# This section ensures the final output is in the correct format for submission.
print("\nCorrecting final predictions for submission format...")

# Clip negative values to 0, as sales cannot be negative.
test_predictions[test_predictions < 0] = 0

# Create the final submission DataFrame.
submission_df = pd.DataFrame({
    'Item_Identifier': original_test_identifiers['Item_Identifier'],
    'Outlet_Identifier': original_test_identifiers['Outlet_Identifier'],
    'Item_Outlet_Sales': test_predictions
})

# Save the predictions to a CSV file.
submission_df.to_csv('submission_tuned_v3.csv', index=False)
print("Final predictions saved to 'submission_tuned_v3.csv'!")


Loading preprocessed training and test data...
Missing values in dataframes filled with 0.

Log-transforming the target variable 'Item_Outlet_Sales'...

Starting hyperparameter tuning with GridSearchCV for GradientBoostingRegressor...
Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV] END learning_rate=0.05, max_depth=3, n_estimators=100, subsample=0.7; total time=   0.8s
[CV] END learning_rate=0.05, max_depth=3, n_estimators=100, subsample=0.7; total time=   0.9s
[CV] END learning_rate=0.05, max_depth=3, n_estimators=100, subsample=0.7; total time=   0.8s
[CV] END learning_rate=0.05, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.9s
[CV] END learning_rate=0.05, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.9s
[CV] END learning_rate=0.05, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.9s
[CV] END learning_rate=0.05, max_depth=3, n_estimators=100, subsample=0.9; total time=   1.0s
[CV] END learning_rate=0.05, max_depth=3, n_e

In [6]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# --- Load the Advanced Preprocessed Data ---
# Loading the training features, training target, and test features
# that were generated by your previous preprocessing script.
print("Loading preprocessed training and test data...")
try:
    # X_train contains the features for training
    X_train = pd.read_csv('preprocessed_train_simplified.csv')
    # y_train contains the target variable 'Item_Outlet_Sales'
    y_train = pd.read_csv('preprocessed_train_target_simplified.csv').iloc[:, 0]
    # X_test contains the features for the test set
    X_test = pd.read_csv('preprocessed_test_simplified.csv')
    # We also need the original test file to get the identifiers for the submission
    original_test_identifiers = pd.read_csv('test_AbJTz2l.csv')[['Item_Identifier', 'Outlet_Identifier']]
except FileNotFoundError as e:
    print(f"Error: Required file not found. Please ensure that 'preprocessed_train_simplified.csv', 'preprocessed_train_target_simplified.csv', 'preprocessed_test_simplified.csv', and 'test_AbJTz2l.csv' are in the directory. Details: {e}")
    exit()

# --- Handle NaN Values (just in case) ---
# It's always a good practice to ensure there are no missing values before training.
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)
print("Missing values in dataframes filled with 0.")

# --- Log-Transform the Target Variable ---
# Log-transforming the target variable to handle its right skewness, which
# can help models learn better.
print("\nLog-transforming the target variable 'Item_Outlet_Sales'...")
y_train_log = np.log1p(y_train)

# --- Define Models for Comparison ---
# Here we define a dictionary of models to be compared using cross-validation.
# We are using default parameters for a quick and fair comparison.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(random_state=42),
    'Lasso': Lasso(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
}

# --- Perform Cross-Validation and Compare Models ---
print("\n--- Model Performance Comparison with 5-Fold Cross-Validation ---")
results = {}
for name, model in models.items():
    print(f"\nEvaluating {name}...")
    try:
        # We use cross_val_score with 'neg_mean_squared_error' to get a reliable
        # performance metric. The negative sign is because scikit-learn
        # uses a scoring convention where higher values are better.
        scores = cross_val_score(model, X_train, y_train_log, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
        
        # Convert scores to RMSE (Root Mean Squared Error) for better interpretability.
        # We take the negative of the scores and then the square root.
        rmse_scores = np.sqrt(-scores)
        
        # Store the mean and standard deviation of the RMSE scores.
        results[name] = {'mean_rmse': np.mean(rmse_scores), 'std_rmse': np.std(rmse_scores)}
        print(f"{name} Mean RMSE: {results[name]['mean_rmse']:.2f}")
        print(f"{name} Std Dev RMSE: {results[name]['std_rmse']:.2f}")
    except Exception as e:
        print(f"An error occurred during cross-validation for {name}. Details: {e}")
        # Use a placeholder for failed models to prevent script termination.
        results[name] = {'mean_rmse': np.inf, 'std_rmse': np.inf}

# --- Select the Best Model ---
# Find the model with the lowest mean RMSE.
best_model_name = min(results, key=lambda k: results[k]['mean_rmse'])
best_model_instance = models[best_model_name]
print(f"\n--- Best Model Found ---")
print(f"The best model is {best_model_name} with a mean cross-validation RMSE of {results[best_model_name]['mean_rmse']:.2f}")

# --- Final Training and Prediction ---
# Train the best-performing model on the full training dataset.
print(f"\nTraining the {best_model_name} on the entire training dataset...")
best_model_instance.fit(X_train, y_train_log)

# Make the final predictions on the test set.
test_predictions_log = best_model_instance.predict(X_test)

# Inverse transform the predictions from the log scale back to the original sales scale.
test_predictions = np.expm1(test_predictions_log)

# --- Post-processing and Final Output ---
# Clip any negative predictions to 0, as sales cannot be negative.
test_predictions[test_predictions < 0] = 0

# Create the final submission DataFrame with the correct identifiers.
submission_df = pd.DataFrame({
    'Item_Identifier': original_test_identifiers['Item_Identifier'],
    'Outlet_Identifier': original_test_identifiers['Outlet_Identifier'],
    'Item_Outlet_Sales': test_predictions
})

# Save the predictions to a CSV file.
submission_df.to_csv('submission_tuned_v4.csv', index=False)
print("\nFinal predictions saved to 'submission_tuned_v4.csv' in the correct format!")


Loading preprocessed training and test data...
Missing values in dataframes filled with 0.

Log-transforming the target variable 'Item_Outlet_Sales'...

--- Model Performance Comparison with 5-Fold Cross-Validation ---

Evaluating Linear Regression...
Linear Regression Mean RMSE: 0.54
Linear Regression Std Dev RMSE: 0.01

Evaluating Ridge...
Ridge Mean RMSE: 0.54
Ridge Std Dev RMSE: 0.01

Evaluating Lasso...
Lasso Mean RMSE: 1.02
Lasso Std Dev RMSE: 0.02

Evaluating Random Forest...
Random Forest Mean RMSE: 0.56
Random Forest Std Dev RMSE: 0.01

Evaluating Gradient Boosting...
Gradient Boosting Mean RMSE: 0.52
Gradient Boosting Std Dev RMSE: 0.01

Evaluating XGBoost...
XGBoost Mean RMSE: 0.57
XGBoost Std Dev RMSE: 0.01

--- Best Model Found ---
The best model is Gradient Boosting with a mean cross-validation RMSE of 0.52

Training the Gradient Boosting on the entire training dataset...

Final predictions saved to 'submission_tuned_v4.csv' in the correct format!
