In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
# Folder holding the raw train.csv / test.csv files. Set the DATA_DIR environment
# variable to point somewhere else so the notebook is not tied to one machine;
# the default is the folder the notebook was originally written against.
DATA_DIR = Path(os.environ.get("DATA_DIR", r"C:\Users\sjyot\Downloads"))

train_data = pd.read_csv(DATA_DIR / "train.csv")
test_data = pd.read_csv(DATA_DIR / "test.csv")


In [None]:
# Tag every row with the file it came from so the two frames can be separated
# again after they have been concatenated and processed together.
train_data = train_data.assign(source='train')
test_data = test_data.assign(source='test')

In [None]:
# Stack the train rows on top of the test rows under a fresh 0..n-1 index.
combined_data = pd.concat(
    objs=[train_data, test_data],
    axis=0,
    ignore_index=True,
)

In [None]:
# Preview the first rows of the training frame. A bare last expression uses the
# notebook's rich table rendering instead of print()'s plain-text dump.
train_data.head(5)

In [None]:
# Preview the first rows of the test frame. A bare last expression uses the
# notebook's rich table rendering instead of print()'s plain-text dump.
test_data.head()

In [None]:
# (rows, columns) of the test, train and combined frames, in that order.
frame_shapes = [frame.shape for frame in (test_data, train_data, combined_data)]
print(*frame_shapes)

In [None]:
# Display pandas' default describe() summary of the combined frame.
combined_data.describe()

In [None]:
# Number of distinct values in every column. dropna=False counts NaN as a value
# of its own, which is what len(Series.unique()) does as well.
combined_data.nunique(dropna=False)


In [None]:
# Object-typed columns, minus the identifier columns and the train/test marker.
excluded_columns = ('Item_Identifier', 'Outlet_Identifier', 'source')
categorical_columns = [
    name
    for name, dtype in combined_data.dtypes.items()
    if dtype == 'object' and name not in excluded_columns
]
print(categorical_columns)

In [None]:
# Print the frequency table of each categorical feature, one after another.
for feature in categorical_columns:
    feature_counts = combined_data[feature].value_counts()
    print(feature_counts)

In [None]:
# Names of the columns that contain at least one missing value.
has_missing = combined_data.isna().any(axis=0)
columns_with_nulls = combined_data.columns[has_missing]
columns_with_nulls

In [None]:

# Average recorded weight per item. pivot_table aggregates with the mean by
# default; it is spelled out here for readability.
weight_by_item = combined_data.pivot_table(
    index='Item_Identifier',
    values='Item_Weight',
    aggfunc='mean',
)
item_avg_weight = weight_by_item['Item_Weight']
item_avg_weight

In [None]:
# Flag the rows whose Item_Weight is missing and report how many there are
# before any imputation takes place.
miss_bool = combined_data['Item_Weight'].isna()
print('Orignal #missing: %d' % miss_bool.sum())
miss_bool


In [None]:
# Fill each missing weight with the average weight recorded for the same item.
ids_missing_weight = combined_data.loc[miss_bool, 'Item_Identifier']
combined_data.loc[miss_bool, 'Item_Weight'] = ids_missing_weight.map(item_avg_weight)


In [None]:
# Spot-check the frame after the weight imputation; head() keeps the output to
# a handful of rows instead of rendering the whole table.
combined_data.head()

In [None]:
# Re-count missing weights AFTER imputation to confirm it worked. Any row still
# flagged belongs to an item for which no average weight could be looked up.
miss_bool = combined_data['Item_Weight'].isnull()
print('Final #missing: %d' % miss_bool.sum())
miss_bool

In [None]:
# Mean Item_Outlet_Sales per Outlet_Type (mean is pivot_table's default aggregation).
combined_data.pivot_table(
    index='Outlet_Type',
    values='Item_Outlet_Sales',
    aggfunc='mean',
)


In [None]:
# Mean visibility of each product across all of its rows.
visibility_by_item = combined_data.pivot_table(
    index='Item_Identifier',
    values='Item_Visibility',
    aggfunc='mean',
)
visibility_avg = visibility_by_item['Item_Visibility']

# Rows with a visibility of exactly 0 are the ones to be imputed; flag and count them.
miss_bool = combined_data['Item_Visibility'].eq(0)
print ('Number of 0 values initially: %d' % miss_bool.sum())


In [None]:
# Replace the flagged zeros with the product's mean visibility, then recount.
ids_zero_visibility = combined_data.loc[miss_bool, 'Item_Identifier']
combined_data.loc[miss_bool, 'Item_Visibility'] = ids_zero_visibility.map(visibility_avg)
remaining_zeros = combined_data['Item_Visibility'].eq(0).sum()
print ('Number of 0 values after modification: %d' % remaining_zeros)

In [None]:
# The first two characters of Item_Identifier are used as a broad category code.
# Slice them with the vectorised .str accessor (instead of a per-row Python
# lambda) and translate the code to a readable label in the same step.
# Any code missing from the mapping becomes NaN.
item_prefix_labels = {'FD': 'Food',
                      'NC': 'Non-Consumable',
                      'DR': 'Drinks'}
combined_data['Item_Type_Redefined'] = (
    combined_data['Item_Identifier'].str[:2].map(item_prefix_labels)
)
combined_data['Item_Type_Redefined'].value_counts()


In [None]:
# Non-consumable items get the dedicated "Non-Edible" fat-content label; all
# other rows keep their current value.
is_non_consumable = combined_data['Item_Type_Redefined'] == "Non-Consumable"
fat_content = np.where(is_non_consumable, "Non-Edible", combined_data['Item_Fat_Content'])
combined_data['Item_Fat_Content'] = fat_content
combined_data['Item_Fat_Content'].value_counts()

In [None]:
# Collapse the alternative spellings of the fat-content labels onto one
# canonical spelling each, printing the frequency table before and after.
fat_label_aliases = {'LF': 'Low Fat',
                     'reg': 'Regular',
                     'low fat': 'Low Fat'}

print(combined_data['Item_Fat_Content'].value_counts())
print ('\nModified Categories:')
combined_data['Item_Fat_Content'] = combined_data['Item_Fat_Content'].replace(fat_label_aliases)
print (combined_data['Item_Fat_Content'].value_counts())

In [None]:
# Years each outlet has been operating, measured from a fixed reference year.
# NOTE(review): 2013 is presumably the year the sales data was collected — confirm.
REFERENCE_YEAR = 2013
combined_data['Outlet_Years_Operation'] = REFERENCE_YEAR - combined_data['Outlet_Establishment_Year']
combined_data['Outlet_Years_Operation'].describe()


In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-code the outlet identifier into a new 'Outlet' feature.
outlet_encoder = LabelEncoder()
combined_data['Outlet'] = outlet_encoder.fit_transform(combined_data['Outlet_Identifier'])

# Categorical features to convert to integer codes.
var_mod = [
    'Item_Fat_Content',
    'Outlet_Location_Type',
    'Outlet_Size',
    'Item_Type_Redefined',
    'Outlet_Type',
    'Outlet',
]

# One independent encoder per column, stored under the column name so the
# fitted encoders remain available after this cell.
label_encoders = {name: LabelEncoder() for name in var_mod}
for name, encoder in label_encoders.items():
    combined_data[name] = encoder.fit_transform(combined_data[name])


In [None]:
# Features to expand into indicator columns.
cols_to_encode = [
    'Item_Fat_Content', 'Outlet_Location_Type', 'Outlet_Size',
    'Outlet_Type', 'Item_Type_Redefined', 'Outlet',
]

# drop_first=True omits the first level of every feature, so k levels produce
# k-1 indicator columns.
combined_data = pd.get_dummies(
    data=combined_data,
    columns=cols_to_encode,
    drop_first=True,
)


In [None]:
# Display the dtype of every column after encoding.
combined_data.dtypes

In [None]:
# Display the first five rows of the encoded frame.
combined_data.head(5)

In [None]:
# Display the column labels of the encoded frame.
combined_data.columns

In [None]:
# Remove the raw columns that are not passed on to the exported files.
columns_to_drop = ['Item_Type','Outlet_Establishment_Year', 'Item_Identifier','Outlet_Identifier']
combined_data = combined_data.drop(columns=columns_to_drop)

# Separate the rows again using the 'source' marker, which is then discarded.
train = (
    combined_data[combined_data['source'] == "train"]
    .drop(columns=['source'])
)

# The test export carries no target column; errors='ignore' keeps the drop from
# failing if either column is already absent.
test = (
    combined_data[combined_data['source'] == "test"]
    .drop(columns=['Item_Outlet_Sales', 'source'], errors='ignore')
)

# Write both cleaned tables without the index column.
train.to_csv("train_modified.csv", index=False)
test.to_csv("test_modified.csv", index=False)


In [None]:
# Display the number of missing values per column of the processed frame.
combined_data.isna().sum()

In [None]:

from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from itertools import product


In [None]:
# Reload the cleaned train and test tables from the CSV files on disk.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ("train_modified.csv", "test_modified.csv")
)
# Independent copy of the raw test frame loaded earlier in the notebook.
test_copy = test_data.copy()

In [None]:
# Separate features from the target.
# The target has to come from the TRAINING frame. The test frame has no
# 'Item_Outlet_Sales' column (it is dropped before test_modified.csv is written),
# so indexing it raised KeyError and the train/test targets were swapped.
# y_test is None unless a labelled test file is supplied.
TARGET = "Item_Outlet_Sales"

X_train = train.drop(columns=[TARGET])
y_train = train[TARGET]

X_test = test.drop(columns=[TARGET], errors="ignore")
y_test = test[TARGET] if TARGET in test.columns else None

print("Train Shape:", X_train.shape)
print("Test Shape :", X_test.shape)


In [None]:
# Candidate values for the three tuned hyper-parameters
# (learning rate, tree depth and number of iterations).
param_grid = dict(
    learning_rate=[0.1, 0.05, 0.03, 0.02],
    depth=[4, 6, 8, 9],
    iterations=[800, 500, 1500, 700],
)

In [None]:
def tune_catboost(X, y, param_grid, n_splits=5, random_state=42):
    """Grid-search CatBoost hyper-parameters with shuffled K-fold cross-validation.

    Every ``learning_rate`` x ``depth`` x ``iterations`` combination in
    ``param_grid`` is scored by its mean validation RMSE over the folds. The
    combination with the lowest mean RMSE wins; on a tie the combination that
    was evaluated first is kept.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target values, positionally aligned with ``X``.
    param_grid : dict
        Must contain the keys ``"learning_rate"``, ``"depth"`` and
        ``"iterations"``, each mapping to an iterable of candidate values.
    n_splits : int, default 5
        Number of cross-validation folds.
    random_state : int, default 42
        Seed used for both the fold shuffling and every CatBoost model.

    Returns
    -------
    tuple
        ``(best_learning_rate, best_depth, best_iterations)``.

    Raises
    ------
    ValueError
        If the grid contains no combination at all (one of the lists is empty).
    """
    combos = list(product(
        param_grid["learning_rate"],
        param_grid["depth"],
        param_grid["iterations"],
    ))
    if not combos:
        raise ValueError(
            "param_grid needs at least one value for each of "
            "'learning_rate', 'depth' and 'iterations'"
        )

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    # The splits do not depend on the hyper-parameters: compute them once so
    # every combination is scored on exactly the same folds.
    folds = list(kf.split(X))

    results = []
    for lr, depth, iters in combos:
        print(f"Testing: LR={lr}, Depth={depth}, Iter={iters}")

        rmse_scores = []
        for train_idx, val_idx in folds:
            X_fit, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_fit, y_val = y.iloc[train_idx], y.iloc[val_idx]

            model = CatBoostRegressor(
                learning_rate=lr,
                depth=depth,
                iterations=iters,
                loss_function='RMSE',
                verbose=False,
                random_state=random_state
            )
            model.fit(X_fit, y_fit)
            preds = model.predict(X_val)

            rmse_scores.append(np.sqrt(mean_squared_error(y_val, preds)))

        avg_rmse = np.mean(rmse_scores)
        results.append((avg_rmse, lr, depth, iters))

        print(f"   → Avg RMSE: {avg_rmse:.2f}\n")

    # min() returns the first entry with the lowest RMSE, i.e. the same winner a
    # stable sort on the RMSE would put first.
    best_rmse, best_lr, best_depth, best_iters = min(results, key=lambda r: r[0])

    print("\n🏆 BEST PARAMETERS")
    print("----------------------------")
    print(f"Learning Rate : {best_lr}")
    print(f"Depth         : {best_depth}")
    print(f"Iterations    : {best_iters}")
    print(f"CV RMSE       : {best_rmse:.2f}")

    return best_lr, best_depth, best_iters

In [None]:

# Pick the hyper-parameters with the lowest cross-validated RMSE.
best_lr, best_depth, best_iters = tune_catboost(X_train, y_train, param_grid)

# Refit a single model with the winning settings on all of X_train / y_train.
final_model = CatBoostRegressor(
    learning_rate=best_lr,
    depth=best_depth,
    iterations=best_iters,
    loss_function='RMSE',
    verbose=True,
    random_state=42
)

# Status line (the emoji had been stored as mis-decoded bytes and is repaired here).
print("\n🚀 Training final CatBoost model on FULL DATA...\n")
final_model.fit(X_train, y_train)

In [None]:
 # Model Testing
# Predict on the test features and floor the predictions at 0.
test_predictions = np.maximum(final_model.predict(test), 0)

# One prediction per raw test row is required to attach the identifiers.
# NOTE(review): assumes the rows of `test` are still in the raw test file's order — confirm.
assert len(test_predictions) == len(test_copy), "prediction count does not match test rows"

# Submission = identifier columns from the raw test frame + predicted sales.
final_submission = pd.DataFrame({
    "Item_Identifier": test_copy["Item_Identifier"],
    "Outlet_Identifier": test_copy["Outlet_Identifier"],
    "Item_Outlet_Sales": test_predictions,
})

# Use one variable for the file name so the message always reports the file that
# was actually written (it used to name 'final_submission.csv').
submission_path = "final_result_submission.csv"
final_submission.to_csv(submission_path, index=False)
print(f"\n📁 Saved: {submission_path}")

In [None]:
# Display (rows, columns) of the raw test copy.
test_copy.shape

In [None]:
# ================================================================
# Training-set metrics of the final model.
# These are in-sample: the model was fitted on this very data.
# ================================================================
train_preds = final_model.predict(X_train)

rmse = np.sqrt(mean_squared_error(y_train, train_preds))
r2 = r2_score(y_train, train_preds)
mae = mean_absolute_error(y_train, train_preds)
# MAPE divides by the true value, so it is only meaningful when no target is 0.
mape = np.mean(np.abs((y_train - train_preds) / y_train)) * 100

print("\n========== FINAL MODEL METRICS (Train Data) ==========")
print(f"RMSE : {rmse:.2f}")
# The superscript two had been stored as mis-decoded bytes; repaired here.
print(f"R²   : {r2:.4f}")
print(f"MAE  : {mae:.2f}")
print(f"MAPE : {mape:.2f}%")
print("======================================================")

In [None]:
# ==========================================
# Evaluate the final model on train and test
# ==========================================
def _regression_metrics(y_true, y_pred):
    """Return ``(rmse, r2, mae, mape_percent)`` for one set of predictions."""
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    # MAPE divides by the true value, so it is only meaningful when no target is 0.
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    return rmse, r2, mae, mape


def _print_metrics(title, rmse, r2, mae, mape):
    """Print one metrics block under the given title."""
    print(f"\n========== {title} ==========")
    print(f"RMSE : {rmse:.2f}")
    print(f"R²   : {r2:.4f}")
    print(f"MAE  : {mae:.2f}")
    print(f"MAPE : {mape:.2f}%")
    print("======================================================")


# Train metrics were computed here but never shown; report them so they can be
# read next to the test metrics.
train_preds = final_model.predict(X_train)
train_rmse, train_r2, train_mae, train_mape = _regression_metrics(y_train, train_preds)
_print_metrics("FINAL MODEL METRICS (TRAIN DATA)", train_rmse, train_r2, train_mae, train_mape)

# Test metrics need one true target per prediction. Skip them, instead of
# crashing, when the test labels are missing or do not line up with X_test.
test_preds = final_model.predict(X_test)
if y_test is None or len(y_test) != len(test_preds):
    print("\nTest metrics skipped: no ground-truth 'Item_Outlet_Sales' aligned with X_test.")
else:
    test_rmse, test_r2, test_mae, test_mape = _regression_metrics(y_test, test_preds)
    _print_metrics("FINAL MODEL METRICS (TEST DATA)", test_rmse, test_r2, test_mae, test_mape)