In [1]:
# 1. Importing necessary libraries for data preprocessing, modeling, and evaluation.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
import joblib

In [2]:
# Preprocessing function to handle missing values, derive new features, and transform data.
def preprocess_data(data, reference_year=2024):
    """Return a cleaned, feature-enriched copy of ``data``.

    The input frame is never modified; all work happens on a copy, so the
    function is safe to call on slices (e.g. the outputs of
    ``train_test_split``) and is idempotent with respect to its argument.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns read below: ``Item_Type``,
        ``Item_Weight``, ``Outlet_Size``, ``Outlet_Establishment_Year``,
        ``Item_MRP``, ``Item_Fat_Content`` and ``Item_Visibility``.
    reference_year : int, default 2024
        Year from which ``Outlet_Establishment_Year`` is subtracted to obtain
        ``Outlet_Age``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with imputed ``Item_Weight`` / ``Outlet_Size``,
        normalised ``Item_Fat_Content`` labels and the derived columns
        ``Outlet_Age``, ``Price_Per_Unit_Weight``, ``Item_Visibility_Log``
        and ``MRP_Tier``.
    """
    # Copy first: assigning columns on a slice of another frame would
    # otherwise write through to (or warn about) the caller's object.
    data = data.copy()

    # Missing weights: use the mean weight of the row's Item_Type, then fall
    # back to the overall mean for types that have no observed weight at all.
    type_mean_weight = data.groupby('Item_Type')['Item_Weight'].transform('mean')
    data['Item_Weight'] = data['Item_Weight'].fillna(type_mean_weight)
    data['Item_Weight'] = data['Item_Weight'].fillna(data['Item_Weight'].mean())

    # Missing outlet sizes: use the most frequent size. mode() is empty when
    # every value is missing, in which case there is nothing to impute with.
    outlet_size_mode = data['Outlet_Size'].mode()
    if not outlet_size_mode.empty:
        data['Outlet_Size'] = data['Outlet_Size'].fillna(outlet_size_mode.iloc[0])

    # Feature derivation
    data['Outlet_Age'] = reference_year - data['Outlet_Establishment_Year']
    data['Price_Per_Unit_Weight'] = data['Item_MRP'] / data['Item_Weight']

    # Simplifying Item_Fat_Content: collapse spelling variants onto two labels.
    data['Item_Fat_Content'] = data['Item_Fat_Content'].replace(
        {'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'}
    )

    # Log Transformation for Item Visibility (log1p keeps zero visibility at 0).
    data['Item_Visibility_Log'] = np.log1p(data['Item_Visibility'])

    # MRP Categorization: split the observed MRP range at 33% and 66%.
    # NOTE(review): the cut-offs come from the frame passed in, so separately
    # preprocessed splits may use slightly different tier boundaries.
    mrp = data['Item_MRP']
    min_value = mrp.min()
    range_value = mrp.max() - min_value
    low_cutoff = min_value + 0.33 * range_value
    medium_cutoff = min_value + 0.66 * range_value
    # Conditions are checked in order; anything matching neither is 'High'.
    data['MRP_Tier'] = np.select(
        [mrp <= low_cutoff, mrp <= medium_cutoff],
        ['Low', 'Medium'],
        default='High',
    )

    return data

# Step 2: Read the raw CSV, then hold out 20% of the rows for testing (fixed seed).
# NOTE(review): this is an absolute, machine-specific path; adjust it when running elsewhere.
data = pd.read_csv('C:/Users/HP/Desktop/infosys_springboard/final code/Train1.csv')
training_data, testing_data = train_test_split(data, random_state=42, test_size=0.2)

# Step 3: Run the same preprocessing over each split, training first, then testing.
training_data, testing_data = map(preprocess_data, (training_data, testing_data))

In [3]:
# Preview the first row of the frame loaded from the CSV to check its columns.
data.head(1)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138


In [4]:

def encode_data(data, is_training=True, encoders=None):
    """Scale and encode the feature columns of ``data``.

    The caller's frame is left untouched; a transformed copy is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to transform. Only the feature columns listed below that are
        actually present in ``data`` are processed.
    is_training : bool, default True
        When True, new encoders and a scaler are fitted on ``data`` and
        written to disk with joblib. When False, ``encoders`` is used as-is
        and nothing is written.
    encoders : dict or None
        Required when ``is_training`` is False. Must have the keys
        ``'ordinal'``, ``'nominal'``, ``'scaler'`` and ``'label'`` (a dict
        mapping ID column name to a fitted ``LabelEncoder``), as returned by
        a training call.

    Returns
    -------
    (pandas.DataFrame, dict) when ``is_training`` is True, otherwise just the
    transformed ``pandas.DataFrame``.

    Raises
    ------
    ValueError
        If ``is_training`` is False and no ``encoders`` are supplied.
    """
    if not is_training and encoders is None:
        raise ValueError("encoders must be provided when is_training=False")

    # Work on a copy so column assignments below never leak into the caller's frame.
    data = data.copy()

    # feature groups
    numeric_features = ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Age', 'Price_Per_Unit_Weight']
    ordinal_features = ['Outlet_Size', 'MRP_Tier']
    nominal_features = ['Item_Fat_Content', 'Outlet_Location_Type', 'Outlet_Type', 'Item_Type']
    label_features = ['Item_Identifier', 'Outlet_Identifier']

    # Filtering the columns existing in the dataset
    numeric_features = [col for col in numeric_features if col in data.columns]
    ordinal_features = [col for col in ordinal_features if col in data.columns]
    nominal_features = [col for col in nominal_features if col in data.columns]
    label_features = [col for col in label_features if col in data.columns]

    if is_training:
        # Fitting encoders on training data
        # NOTE(review): OrdinalEncoder() is fitted without explicit categories,
        # so the integer codes presumably follow the encoder's default ordering
        # rather than Small < Medium < High / Low < Medium < High - confirm.
        encoders = {
            'ordinal': OrdinalEncoder().fit(data[ordinal_features]) if ordinal_features else None,
            'nominal': OneHotEncoder(sparse_output=False, drop='first').fit(data[nominal_features]) if nominal_features else None,
            'label': {col: LabelEncoder().fit(data[col]) for col in label_features},
            'scaler': StandardScaler().fit(data[numeric_features]) if numeric_features else None
        }

    # Applying transformations
    if numeric_features:
        data[numeric_features] = encoders['scaler'].transform(data[numeric_features])
    if ordinal_features:
        data[ordinal_features] = encoders['ordinal'].transform(data[ordinal_features])
    if nominal_features:
        nominal_encoded = encoders['nominal'].transform(data[nominal_features])
        nominal_cols = encoders['nominal'].get_feature_names_out(nominal_features)
        # The encoded block has a fresh 0..n-1 index, so the source frame is
        # re-indexed the same way before the column-wise concat.
        data = pd.concat(
            [
                data.drop(columns=nominal_features).reset_index(drop=True),
                pd.DataFrame(nominal_encoded, columns=nominal_cols),
            ],
            axis=1,
        )

    # Label encode ID columns; IDs not seen when the encoder was fitted become -1.
    for label_feature in label_features:
        le = encoders['label'][label_feature]
        # One dict lookup per row instead of one le.transform() call per row.
        code_by_class = {cls: code for code, cls in enumerate(le.classes_)}
        data[label_feature] = data[label_feature].map(code_by_class).fillna(-1).astype(int)

    if is_training:
        # Saving encoders to reuse them for new data. Only done when they were
        # just fitted, so inference calls never overwrite the stored artifacts.
        joblib.dump(encoders['ordinal'], 'ordinal_encoder.pkl')
        joblib.dump(encoders['nominal'], 'onehot_encoder.pkl')
        joblib.dump(encoders['scaler'], 'standard_scaler.pkl')

        # Save label encoders for ID features
        for label_feature in label_features:
            joblib.dump(encoders['label'][label_feature], f'{label_feature}_label_encoder.pkl')

    return (data, encoders) if is_training else data

# Step 4: Encode the training data.
# The encoders/scaler are fitted on the training split here; the returned `encoders`
# dict is kept so the testing split can later be transformed with the same fitted objects.
training_data, encoders = encode_data(training_data, is_training=True)  # Encoding on training data

In [5]:
# Loading encoders and encoding testing data.
# The artifacts are the ones written by the training call to encode_data.
# NOTE: joblib.load unpickles arbitrary objects - only load files you produced yourself.
ordinal_encoder = joblib.load('ordinal_encoder.pkl')
onehot_encoder = joblib.load('onehot_encoder.pkl')
scaler = joblib.load('standard_scaler.pkl')

item_identifier_encoder = joblib.load('Item_Identifier_label_encoder.pkl')
outlet_identifier_encoder = joblib.load('Outlet_Identifier_label_encoder.pkl')

# Assemble the reloaded objects into the dict layout encode_data expects, so the
# test split is encoded with the persisted artifacts (this also checks that the
# saved files round-trip) instead of silently ignoring what was just loaded.
loaded_encoders = {
    'ordinal': ordinal_encoder,
    'nominal': onehot_encoder,
    'scaler': scaler,
    'label': {
        'Item_Identifier': item_identifier_encoder,
        'Outlet_Identifier': outlet_identifier_encoder,
    },
}

testing_data = encode_data(testing_data, is_training=False, encoders=loaded_encoders)


In [6]:
# Inspect the column dtypes of the encoded training frame.
training_data.dtypes

Item_Identifier                      int32
Item_Weight                        float64
Item_Visibility                    float64
Item_MRP                           float64
Outlet_Identifier                    int32
Outlet_Establishment_Year            int64
Outlet_Size                        float64
Item_Outlet_Sales                  float64
Outlet_Age                         float64
Price_Per_Unit_Weight              float64
Item_Visibility_Log                float64
MRP_Tier                           float64
Item_Fat_Content_Regular           float64
Outlet_Location_Type_Tier 2        float64
Outlet_Location_Type_Tier 3        float64
Outlet_Type_Supermarket Type1      float64
Outlet_Type_Supermarket Type2      float64
Outlet_Type_Supermarket Type3      float64
Item_Type_Breads                   float64
Item_Type_Breakfast                float64
Item_Type_Canned                   float64
Item_Type_Dairy                    float64
Item_Type_Frozen Foods             float64
Item_Type_F

In [7]:
# Splitting datasets into features (X) and target variable (y).

X_train = training_data.drop('Item_Outlet_Sales', axis=1)
y_train = training_data['Item_Outlet_Sales']
X_test = testing_data.drop('Item_Outlet_Sales', axis=1)
y_test = testing_data['Item_Outlet_Sales']

# Training models and evaluating performance.
# Estimators with internal randomness get a fixed seed (the same value used for
# the train/test split) so the R² table is reproducible across kernel restarts.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'AdaBoost': AdaBoostRegressor(random_state=42)
}

# Fit every model with default hyper-parameters and record R² on both splits;
# a large train/test gap indicates overfitting.
results = []
for model_name, model in models.items():
    model.fit(X_train, y_train)
    train_r2 = r2_score(y_train, model.predict(X_train))
    test_r2 = r2_score(y_test, model.predict(X_test))
    results.append({'Model': model_name, 'Train R²': train_r2, 'Test R²': test_r2})

results_df = pd.DataFrame(results)
print(results_df)

               Model  Train R²   Test R²
0  Linear Regression  0.559156  0.577484
1      Random Forest  0.937691  0.567058
2            XGBoost  0.884042  0.507696
3              Lasso  0.558769  0.578327
4              Ridge  0.559019  0.577875
5  Gradient Boosting  0.632934  0.605812
6           AdaBoost  0.447007  0.430920


In [9]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score
import pandas as pd

# Define a reduced parameter grid
param_grids = {
    'Random Forest': {
        'n_estimators': [50, 100],
        'max_depth': [5, 10],
        'min_samples_split': [2, 5]
    },
}

# Perform RandomizedSearchCV for Random Forest
model_name = 'Random Forest'
model = models[model_name]
param_grid = param_grids[model_name]

# Number of distinct combinations in the (all-list) grid. Asking for more
# iterations than this only triggers a warning, so n_iter is capped below.
n_candidates = 1
for candidate_values in param_grid.values():
    n_candidates *= len(candidate_values)

grid_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=min(10, n_candidates),
    scoring='r2',
    cv=3,
    n_jobs=-1,
    verbose=1,
    random_state=42  # fixed seed so the sampled candidates are reproducible
)

grid_search.fit(X_train, y_train)

# Get results: the best estimator is refitted on the full training set by the
# search, then scored once on the held-out test split.
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
test_r2 = r2_score(y_test, best_model.predict(X_test))

print(f"Best Parameters: {best_params}")
print(f"Test R²: {test_r2}")




Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best Parameters: {'n_estimators': 100, 'min_samples_split': 2, 'max_depth': 5}
Test R²: 0.6196403281174567


In [10]:
# Best hyperparameters per model.
# NOTE(review): described as GridSearchCV results, but the search that produced
# these values is not part of this notebook - confirm their provenance.
best_params = {
    'Linear Regression': {},
    'Random Forest': {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2},
    'XGBoost': {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100},
    'Lasso': {'alpha': 1, 'max_iter': 1000},
    'Ridge': {'alpha': 0.01},
    'Gradient Boosting': {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100},
    'AdaBoost': {'learning_rate': 0.1, 'n_estimators': 50}
}

# Estimator class for each entry of best_params.
estimator_classes = {
    'Linear Regression': LinearRegression,
    'Random Forest': RandomForestRegressor,
    'XGBoost': XGBRegressor,
    'Lasso': Lasso,
    'Ridge': Ridge,
    'Gradient Boosting': GradientBoostingRegressor,
    'AdaBoost': AdaBoostRegressor
}

# Models with internal randomness; seeded so the results are reproducible.
stochastic_models = {'Random Forest', 'XGBoost', 'Gradient Boosting', 'AdaBoost'}

# Initialize models with the best hyperparameters. They are built directly from
# best_params so the dict above is the single source of truth.
models_with_best_params = {}
for model_name, estimator_class in estimator_classes.items():
    init_params = dict(best_params[model_name])
    if model_name in stochastic_models:
        init_params['random_state'] = 42
    models_with_best_params[model_name] = estimator_class(**init_params)

# Train and evaluate the models
r2_results = []

for model_name, model in models_with_best_params.items():
    model.fit(X_train, y_train)
    train_r2 = r2_score(y_train, model.predict(X_train))
    test_r2 = r2_score(y_test, model.predict(X_test))
    r2_results.append({'Model': model_name, 'Train R²': train_r2, 'Test R²': test_r2})

# Convert results to a DataFrame
r2_results_df = pd.DataFrame(r2_results)

# Display the results
print(r2_results_df)

               Model  Train R²   Test R²
0  Linear Regression  0.559156  0.577484
1      Random Forest  0.715427  0.607136
2            XGBoost  0.625660  0.606636
3              Lasso  0.558769  0.578327
4              Ridge  0.559138  0.577704
5  Gradient Boosting  0.632934  0.606135
6           AdaBoost  0.507759  0.503853


In [12]:
import joblib

# Save the trained XGBoost model: take the 'XGBoost' entry of models_with_best_params
# and write it to 'xgboost_model.pkl' in the working directory with joblib.
xgb_model = models_with_best_params['XGBoost']
joblib.dump(xgb_model, 'xgboost_model.pkl')

# Confirmation message printed after the dump call returns.
print("XGBoost model saved successfully!")


XGBoost model saved successfully!
