In [24]:
import pandas as pd
import numpy as np
import joblib
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from google.colab import drive
import warnings
import os
from datetime import datetime
import json

# Silence every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' may also hide warnings emitted while the
# pickled models are loaded later in the notebook — consider narrowing it.
warnings.filterwarnings('ignore')

In [25]:
# Mount Google Drive so the model and data files under /content/drive are reachable.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
# Directory on the mounted Drive that holds the persisted training artifacts.
models_path = '/content/drive/MyDrive/models'

# NOTE(security): joblib.load unpickles the files, which can execute arbitrary
# code — only load artifacts that come from a trusted source.
try:
    xgb_model = joblib.load(f'{models_path}/xgboost_model.pkl')
    rf_model = joblib.load(f'{models_path}/random_forest_model.pkl')
    lr_model = joblib.load(f'{models_path}/linear_regression_model.pkl')
    label_encoders = joblib.load(f'{models_path}/label_encoders.pkl')
    feature_cols = joblib.load(f'{models_path}/feature_columns.pkl')
except FileNotFoundError as e:
    print(f"Error loading models: {e}")
    # Every later cell needs all five artifacts; stop here instead of letting
    # the notebook continue and fail later with an unrelated NameError.
    raise
else:
    print("All models loaded successfully from Google Drive")

All models loaded successfully from Google Drive


In [27]:
# Processed test set; the model input columns are selected from this frame later on.
test_full = pd.read_csv('/content/drive/MyDrive/data/test_processed.csv')

In [28]:
# Raw test file; its Store/Dept/Date columns label the predictions when results are assembled.
# NOTE(review): predictions are attached to these rows by position, so this file
# presumably has the same row order as test_processed.csv — confirm.
test = pd.read_csv('/content/drive/MyDrive/data/test.csv')

In [29]:
# Quick visual check of the processed frame (pandas truncates the printed rows and columns).
print(test_full)

        Store  Dept        Date Type    Size  Temperature  Fuel_Price  \
0           1     1  2012-11-02    A  151315        55.32       3.386   
1           1     1  2012-11-09    A  151315        61.24       3.314   
2           1     1  2012-11-16    A  151315        52.92       3.252   
3           1     1  2012-11-23    A  151315        56.23       3.211   
4           1     1  2012-11-30    A  151315        52.34       3.207   
...       ...   ...         ...  ...     ...          ...         ...   
115059     45    98  2013-06-28    B  118221        76.05       3.639   
115060     45    98  2013-07-05    B  118221        77.50       3.614   
115061     45    98  2013-07-12    B  118221        79.37       3.614   
115062     45    98  2013-07-19    B  118221        82.84       3.737   
115063     45    98  2013-07-26    B  118221        76.06       3.804   

        MarkDown1  MarkDown2  MarkDown3  ...  BackToSchool  Store_Sales_Mean  \
0         6766.44    5147.70      50.82  ..

In [30]:
# Quick visual check of the raw test frame (pandas truncates the printed rows).
print(test)

        Store  Dept        Date  IsHoliday
0           1     1  2012-11-02      False
1           1     1  2012-11-09      False
2           1     1  2012-11-16      False
3           1     1  2012-11-23       True
4           1     1  2012-11-30      False
...       ...   ...         ...        ...
115059     45    98  2013-06-28      False
115060     45    98  2013-07-05      False
115061     45    98  2013-07-12      False
115062     45    98  2013-07-19      False
115063     45    98  2013-07-26      False

[115064 rows x 4 columns]


In [31]:
# Blend weights applied to each model's predictions when forming the ensemble.
_ensemble_weights = {
    'xgboost': 0.5,
    'random_forest': 0.3,
    'linear_regression': 0.2
}

# Columns treated as categorical.
_categorical_features = [
    'Type',
    'Season',
    'Type_Size_Interaction',
    'Holiday_Type_Interaction',
    'Temp_Category',
]

# Central settings for the notebook: Drive locations plus ensemble parameters.
CONFIG = {
    'models_path': '/content/drive/MyDrive/models',
    'data_path': '/content/drive/MyDrive/data',
    'output_path': '/content/drive/MyDrive/predictions',
    'ensemble_weights': _ensemble_weights,
    'categorical_features': _categorical_features,
}

In [32]:
# Date, target, and target-derived lag/rolling columns.
# NOTE(review): this list is not referenced in any of the visible cells — the
# selection below is driven entirely by the loaded `feature_cols`. Confirm
# whether it is still needed.
features_to_drop = [
    'Date',
    'Weekly_Sales',
    'Weekly_Sales_Lag1', 'Weekly_Sales_Lag4', 'Weekly_Sales_Lag52',
    'Weekly_Sales_Roll4_Mean', 'Weekly_Sales_Roll12_Mean'
]

# Select the saved feature columns (raises KeyError if any is absent from
# test_full); copy so later edits to X_test do not touch test_full.
X_test = test_full[feature_cols].copy()
print(f"Test features shape: {X_test.shape}")

Test features shape: (115064, 34)


In [33]:
# Reuse the list declared in CONFIG so the categorical columns are defined in
# one place instead of being repeated as a second literal.
categorical_features = CONFIG['categorical_features']

# Apply the saved label encoders to each categorical model input.
for col in categorical_features:
    if col not in X_test.columns:
        # Not one of the selected model inputs; nothing to encode.
        continue
    if col not in label_encoders:
        # Skipping silently would leave raw values in a model input column.
        print(f"Warning: No saved encoder for {col}; column left unencoded")
        continue

    le = label_encoders[col]
    # Values are compared and transformed as strings
    # (presumably the encoders were fitted on strings — confirm).
    col_values = X_test[col].astype(str)

    unknown_mask = ~col_values.isin(le.classes_)
    if unknown_mask.any():
        unknown_values = set(col_values[unknown_mask].unique())
        print(f"Warning: Unknown categories in {col}: {unknown_values}")

        # LabelEncoder.classes_ is sorted, not ordered by frequency, so
        # classes_[0] is not "the most frequent class". Fall back to the most
        # common known category in this column; classes_[0] is only the last
        # resort when no value in the column is known to the encoder.
        known_values = col_values[~unknown_mask]
        if known_values.empty:
            fallback_class = le.classes_[0]
        else:
            fallback_class = known_values.mode().iloc[0]
        col_values = col_values.mask(unknown_mask, fallback_class)

    X_test[col] = le.transform(col_values)
    print(f" Encoded {col}")


 Encoded Type
 Encoded Season
 Encoded Type_Size_Interaction
 Encoded Holiday_Type_Interaction
 Encoded Temp_Category


In [36]:
# Total count of missing cells before imputation.
# NOTE(review): `missing_before` is not used in any of the visible cells.
missing_before = X_test.isnull().sum().sum()
# Replace every remaining missing value with 0.
X_test = X_test.fillna(0)

In [37]:
# Make X_test carry exactly the saved feature columns, in the saved order.
missing_features = set(feature_cols) - set(X_test.columns)
extra_features = set(X_test.columns) - set(feature_cols)

if missing_features:
    print(f"Warning: Missing features in test data: {missing_features}")
    # Absent inputs are added as constant-zero columns.
    for feature in missing_features:
        X_test[feature] = 0
if extra_features:
    print(f"Warning: Extra features in test data: {extra_features}")

# Always reselect: comparing the column *sets* cannot detect a frame that has
# the right columns in the wrong order, and the models receive columns
# positionally in this order.
X_test = X_test[feature_cols]

print("Test data preparation completed")

Test data preparation completed


In [39]:
# Score the prepared test features with the XGBoost model and print summary statistics.
xgb_predictions = xgb_model.predict(X_test)
print(f"XGBoost: min={xgb_predictions.min():.2f}, max={xgb_predictions.max():.2f}, mean={xgb_predictions.mean():.2f}")

XGBoost: min=-8019.81, max=473360.78, mean=15708.55


In [40]:
# Score the prepared test features with the random forest model and print summary statistics.
rf_predictions = rf_model.predict(X_test)
print(f"Random Forest: min={rf_predictions.min():.2f}, max={rf_predictions.max():.2f}, mean={rf_predictions.mean():.2f}")

Random Forest: min=2.59, max=396870.24, mean=15612.31


In [41]:
# Score the prepared test features with the linear regression model and print summary statistics.
lr_predictions = lr_model.predict(X_test)
print(f"Linear Regression: min={lr_predictions.min():.2f}, max={lr_predictions.max():.2f}, mean={lr_predictions.mean():.2f}")

Linear Regression: min=-19947.51, max=83872.74, mean=12535.78


In [42]:
# Blend the three models' predictions using the weights from CONFIG.
weights = CONFIG['ensemble_weights']

xgb_contribution = weights['xgboost'] * xgb_predictions
rf_contribution = weights['random_forest'] * rf_predictions
lr_contribution = weights['linear_regression'] * lr_predictions

# Summed left to right: XGBoost, then random forest, then linear regression.
ensemble_predictions = xgb_contribution + rf_contribution + lr_contribution

print(f"Ensemble prediction: min={ensemble_predictions.min():.2f}, max={ensemble_predictions.max():.2f}, mean={ensemble_predictions.mean():.2f}")

Ensemble prediction: min=-4099.77, max=359142.80, mean=15045.12


In [45]:
# Ensure the output directory exists before anything is written to it.
os.makedirs(CONFIG['output_path'], exist_ok=True)

# Identifier columns come from the raw test frame when present; otherwise a
# placeholder is substituted for each one.
if 'Store' in test.columns:
    store_values = test['Store']
else:
    store_values = range(len(ensemble_predictions))

if 'Dept' in test.columns:
    dept_values = test['Dept']
else:
    dept_values = 1

if 'Date' in test.columns:
    date_values = test['Date']
else:
    date_values = pd.Timestamp.now()

# One row per test sample: identifiers plus every model's prediction.
results_df = pd.DataFrame({
    'Store': store_values,
    'Dept': dept_values,
    'Date': date_values,
    'Weekly_Sales_Pred_XGB': xgb_predictions,
    'Weekly_Sales_Pred_RF': rf_predictions,
    'Weekly_Sales_Pred_LR': lr_predictions,
    'Weekly_Sales_Pred_Ensemble': ensemble_predictions
})

In [47]:
# Build the submission frame: one Id per test row plus the ensemble prediction.
if 'Id' in test.columns:
    submission_ids = test['Id']
elif {'Store', 'Dept', 'Date'}.issubset(test.columns):
    # test.csv has no Id column, so the old fallback wrote the bare row index
    # (0..N-1) as the Id. Build a "<Store>_<Dept>_<Date>" key instead.
    # NOTE(review): this is the Id layout of the Kaggle Walmart store-sales
    # sample submission — confirm it is the intended target format.
    submission_ids = (
        test['Store'].astype(str)
        + '_' + test['Dept'].astype(str)
        + '_' + test['Date'].astype(str)
    )
else:
    # Last resort when no identifying columns are available.
    submission_ids = test.index

submission_df = pd.DataFrame({
    'Id': submission_ids,
    'Weekly_Sales': ensemble_predictions
})

In [48]:
# Timestamp (local time, second resolution) used to give each run's output files distinct names.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
results_file = f"{CONFIG['output_path']}/detailed_predictions_{timestamp}.csv"
submission_file = f"{CONFIG['output_path']}/submission_{timestamp}.csv"

# Write both frames without the DataFrame index column.
results_df.to_csv(results_file, index=False)
submission_df.to_csv(submission_file, index=False)

In [49]:
def _basic_stats(values):
    """Return the min, max and mean of `values` as plain Python floats."""
    return {
        'min': float(values.min()),
        'max': float(values.max()),
        'mean': float(values.mean())
    }


# The ensemble summary additionally records the standard deviation.
_ensemble_stats = _basic_stats(ensemble_predictions)
_ensemble_stats['std'] = float(ensemble_predictions.std())

# Run summary saved next to the prediction files.
metadata = {
    'timestamp': timestamp,
    'num_predictions': len(ensemble_predictions),
    'ensemble_weights': CONFIG['ensemble_weights'],
    'prediction_stats': _ensemble_stats,
    'individual_model_stats': {
        'xgboost': _basic_stats(xgb_predictions),
        'random_forest': _basic_stats(rf_predictions),
        'linear_regression': _basic_stats(lr_predictions)
    }
}

metadata_file = f"{CONFIG['output_path']}/metadata_{timestamp}.json"
with open(metadata_file, 'w') as f:
    json.dump(metadata, f, indent=2)

In [51]:
# Final run summary, assembled first and emitted with a single print call.
summary_lines = [
    f"Test samples processed: {len(ensemble_predictions)}",
    f"Features used: {len(feature_cols)}",
    f"Models used: 3 (XGBoost, Random Forest, Linear Regression)",
    f"Ensemble weights: XGB={weights['xgboost']}, RF={weights['random_forest']}, LR={weights['linear_regression']}",
    f"Ensemble prediction range: [{ensemble_predictions.min():.2f}, {ensemble_predictions.max():.2f}]",
    f"Ensemble prediction mean: {ensemble_predictions.mean():.2f}",
]
print("\n".join(summary_lines))

Test samples processed: 115064
Features used: 34
Models used: 3 (XGBoost, Random Forest, Linear Regression)
Ensemble weights: XGB=0.5, RF=0.3, LR=0.2
Ensemble prediction range: [-4099.77, 359142.80]
Ensemble prediction mean: 15045.12
