In [1]:
import pandas as pd

# Upload the dataset using Colab's built-in function
from google.colab import files
uploaded = files.upload()


Saving test_big_mart.csv to test_big_mart.csv
Saving train_big_mart.csv to train_big_mart.csv


In [23]:
train_df = pd.read_csv('train_big_mart.csv')
test_df = pd.read_csv('test_big_mart.csv')

In [24]:
train_df.head()


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [25]:
print(train_df.isna().sum())
print(test_df.isna().sum())


Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64
Item_Identifier                 0
Item_Weight                   976
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  1606
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64


In [26]:
import numpy as np
import pandas as pd  # Don't forget to import pandas
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

# Imputation

# Exclude the target column 'Item_Outlet_Sales' when listing numerical columns
numerical_cols = [col for col in train_df.columns if (train_df[col].dtype == 'float64' or train_df[col].dtype == 'int64') and col != 'Item_Outlet_Sales']
skewness = train_df[numerical_cols].skew()
# Save the original Item_Identifier and Outlet_Identifier
original_test_identifiers = test_df[['Item_Identifier', 'Outlet_Identifier']].copy()

# Impute numerical columns
for col in numerical_cols:
    if skewness[col] < -0.5 or skewness[col] > 0.5:
        train_df[col].fillna(train_df[col].median(), inplace=True)
        test_df[col].fillna(train_df[col].median(), inplace=True)  # Use the median from the training set
    else:
        train_df[col].fillna(train_df[col].mean(), inplace=True)
        test_df[col].fillna(train_df[col].mean(), inplace=True)  # Use the mean from the training set

# Exclude the target column 'Item_Outlet_Sales' when listing categorical columns
categorical_cols = [col for col in train_df.columns if train_df[col].dtype == 'object' and col != 'Item_Outlet_Sales']

# Impute categorical columns
for col in categorical_cols:
    train_df[col].fillna(train_df[col].mode()[0], inplace=True)
    test_df[col].fillna(train_df[col].mode()[0], inplace=True)  # Use the mode from the training set

# Outlier Handling
def cap_outliers_using_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[column] = df[column].apply(lambda x: upper_bound if x > upper_bound else x)
    df[column] = df[column].apply(lambda x: lower_bound if x < lower_bound else x)
    return df

for col in numerical_cols:
    train_df = cap_outliers_using_iqr(train_df, col)
    test_df[col] = test_df[col].apply(lambda x: train_df[col].quantile(0.25) - 1.5 * (train_df[col].quantile(0.75) - train_df[col].quantile(0.25)) if x < train_df[col].quantile(0.25) - 1.5 * (train_df[col].quantile(0.75) - train_df[col].quantile(0.25)) else x)
    test_df[col] = test_df[col].apply(lambda x: train_df[col].quantile(0.75) + 1.5 * (train_df[col].quantile(0.75) - train_df[col].quantile(0.25)) if x > train_df[col].quantile(0.75) + 1.5 * (train_df[col].quantile(0.75) - train_df[col].quantile(0.25)) else x)

# Encoding
label_encoders = {}
one_hot_encoders = {}

label_encode_cols = [col for col in train_df.columns if train_df[col].dtype == 'object' and train_df[col].nunique() <= 3]
one_hot_encode_cols = [col for col in train_df.columns if train_df[col].dtype == 'object' and train_df[col].nunique() > 3]

for col in label_encode_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    if col in test_df.columns:
        test_df[col] = test_df[col].map(lambda s: s if s in le.classes_ else np.nan)
        test_df[col].fillna(test_df[col].mode()[0], inplace=True)
        test_df[col] = le.transform(test_df[col])
    label_encoders[col] = le

ohe = OneHotEncoder(drop='first', sparse=False)
if one_hot_encode_cols:
    train_encoded = ohe.fit_transform(train_df[one_hot_encode_cols])
    train_encoded_df = pd.DataFrame(train_encoded, columns=ohe.get_feature_names_out(one_hot_encode_cols))
    test_encoded = ohe.transform(test_df[one_hot_encode_cols])
    test_encoded_df = pd.DataFrame(test_encoded, columns=ohe.get_feature_names_out(one_hot_encode_cols))

    train_df.drop(one_hot_encode_cols, axis=1, inplace=True)
    train_df = pd.concat([train_df, train_encoded_df], axis=1)

    test_df.drop(one_hot_encode_cols, axis=1, inplace=True)
    test_df = pd.concat([test_df, test_encoded_df], axis=1)

    one_hot_encoders = ohe

# Scaling
scaler = MinMaxScaler()

feature_columns = [col for col in train_df.columns if col != 'Item_Outlet_Sales']

train_df_scaled = train_df.copy()
train_df_scaled[feature_columns] = scaler.fit_transform(train_df[feature_columns])

test_df_scaled = test_df.copy()
test_df_scaled[feature_columns] = scaler.transform(test_df[feature_columns])




In [27]:
test_df_scaled.head()


Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Item_Identifier_DRA24,Item_Identifier_DRA59,Item_Identifier_DRB01,Item_Identifier_DRB13,...,Outlet_Identifier_OUT018,Outlet_Identifier_OUT019,Outlet_Identifier_OUT027,Outlet_Identifier_OUT035,Outlet_Identifier_OUT045,Outlet_Identifier_OUT046,Outlet_Identifier_OUT049,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,0.964275,0.0386,0.325012,0.583333,0.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,0.222983,0.196081,0.237819,0.916667,0.5,0.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.598095,0.50809,0.893316,0.541667,0.5,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.164335,0.078521,0.525233,0.916667,0.5,0.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.494352,0.605163,0.861381,0.0,0.5,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [29]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

# Assume train_df_scaled and test_df_scaled are your scaled training and test data
X = train_df_scaled.drop('Item_Outlet_Sales', axis=1)
y = train_df_scaled['Item_Outlet_Sales']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize tracking variables
best_model = None
best_rmse = float('inf')
best_params = None

# List to store model information
model_info = []


In [43]:
# Random Forest
for n in [50, 100, 200]:
    for depth in [3, 5, 7]:
        model_name = f"RandomForest, n_estimators={n}, max_depth={depth}"
        model = RandomForestRegressor(n_estimators=n, max_depth=depth)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        rmse = mean_squared_error(y_test, y_pred, squared=False)

        model_info.append({
                'Model': model_name,
                'RMSE': rmse,
                'Parameters': model.get_params()
            })

        if rmse < best_rmse:
            best_model = model_name
            best_rmse = rmse
            best_params = model.get_params()

print(f"Best Model: {best_model}, Best RMSE: {best_rmse}, Best Parameters: {best_params}")

Best Model: RandomForest, n_estimators=50, max_depth=5, Best RMSE: 1017.3208050397591, Best Parameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 5, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 50, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


Best Model: Random Forest, n_estimators=450, max_depth=3, Best RMSE: 1022.9723429928218, Best Parameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 7, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 200, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [44]:

# Gradient Boosting
for depth in [3, 5, 7]:
    model_name = f"GradientBoosting, max_depth={depth}"  # Correcting model name
    model = GradientBoostingRegressor(max_depth=depth)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    print(f"RMSE for {model_name}: {rmse}")  # Print RMSE for each depth value

    model_info.append({
        'Model': model_name,
        'RMSE': rmse,
        'Parameters': model.get_params()
    })

    if rmse < best_rmse:
        best_model = model_name
        best_rmse = rmse
        best_params = model.get_params()

print(f"Best Model: {best_model}, Best RMSE: {best_rmse}, Best Parameters: {best_params}")

RMSE for GradientBoosting, max_depth=3: 1046.0803543305815
RMSE for GradientBoosting, max_depth=5: 1051.8950557637243
RMSE for GradientBoosting, max_depth=7: 1065.2716816589204
Best Model: RandomForest, n_estimators=50, max_depth=5, Best RMSE: 1017.3208050397591, Best Parameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 5, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 50, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [45]:
# AdaBoost
for n in [50, 100, 200]:
    model_name = f"AdaBoost, n_estimators={n}"  # Set the model name
    model = AdaBoostRegressor(n_estimators=n)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    print(f"RMSE for {model_name}: {rmse}")  # Print RMSE for each 'n'

    model_info.append({
        'Model': model_name,
        'RMSE': rmse,
        'Parameters': model.get_params()
    })

    if rmse < best_rmse:
        best_model = model_name
        best_rmse = rmse
        best_params = model.get_params()

print(f"Best Model: {best_model}, Best RMSE: {best_rmse}, Best Parameters: {best_params}")


RMSE for AdaBoost, n_estimators=50: 1240.0870865634479
RMSE for AdaBoost, n_estimators=100: 1229.766838062141
RMSE for AdaBoost, n_estimators=200: 1223.3517075066716
Best Model: RandomForest, n_estimators=50, max_depth=5, Best RMSE: 1017.3208050397591, Best Parameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 5, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 50, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [46]:
# Bagging
for n in [50, 100, 200]:
    model_name = f"Bagging, n_estimators={n}"  # Define the model_name
    model = BaggingRegressor(n_estimators=n)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)

    print(f"RMSE for {model_name}: {rmse}")  # Print RMSE for each 'n'

    model_info.append({
        'Model': model_name,
        'RMSE': rmse,
        'Parameters': model.get_params()
    })

    if rmse < best_rmse:
        best_model = model_name
        best_rmse = rmse
        best_params = model.get_params()

print(f"Best Model: {best_model}, Best RMSE: {best_rmse}, Best Parameters: {best_params}")


RMSE for Bagging, n_estimators=50: 1105.6149166277835
RMSE for Bagging, n_estimators=100: 1093.9467618008339
RMSE for Bagging, n_estimators=200: 1094.7063840830936
Best Model: RandomForest, n_estimators=50, max_depth=5, Best RMSE: 1017.3208050397591, Best Parameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 5, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 50, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [47]:
# Extra Trees
for n in [50, 100, 200]:
    for depth in [3, 5, 7]:
        model_name = f"Extra Trees, n_estimators={n}, max_depth={depth}"  # Define model_name
        model = ExtraTreesRegressor(n_estimators=n, max_depth=depth)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        rmse = mean_squared_error(y_test, y_pred, squared=False)

        print(f"RMSE for {model_name}: {rmse}")  # Print RMSE for each 'n' and 'depth'

        model_info.append({
            'Model': model_name,
            'RMSE': rmse,
            'Parameters': model.get_params()
        })

        if rmse < best_rmse:
            best_model = model_name
            best_rmse = rmse
            best_params = model.get_params()

print(f"Best Model: {best_model}, Best RMSE: {best_rmse}, Best Parameters: {best_params}")


RMSE for Extra Trees, n_estimators=50, max_depth=3: 1147.401757356192
RMSE for Extra Trees, n_estimators=50, max_depth=5: 1021.6750665146596
RMSE for Extra Trees, n_estimators=50, max_depth=7: 1029.3749869521944
RMSE for Extra Trees, n_estimators=100, max_depth=3: 1148.3496782130453
RMSE for Extra Trees, n_estimators=100, max_depth=5: 1020.9308993258999
RMSE for Extra Trees, n_estimators=100, max_depth=7: 1025.518918793572
RMSE for Extra Trees, n_estimators=200, max_depth=3: 1145.8973951725875
RMSE for Extra Trees, n_estimators=200, max_depth=5: 1020.3133763963663
RMSE for Extra Trees, n_estimators=200, max_depth=7: 1024.8181223520585
Best Model: RandomForest, n_estimators=50, max_depth=5, Best RMSE: 1017.3208050397591, Best Parameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 5, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf'

In [48]:
# Linear Regression
model_name = "Linear Regression"
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)

model_info.append({
    'Model': model_name,
    'RMSE': rmse,
    'Parameters': model.get_params()
})

if rmse < best_rmse:
    best_model = model_name
    best_rmse = rmse
    best_params = model.get_params()

print(f"Best Model: {best_model}, Best RMSE: {best_rmse}, Best Parameters: {best_params}")


Best Model: RandomForest, n_estimators=50, max_depth=5, Best RMSE: 1017.3208050397591, Best Parameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 5, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 50, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [49]:

# KNN
for k in [3, 5, 7]:
    model_name = f"KNN, n_neighbors={k}"
    model = KNeighborsRegressor(n_neighbors=k)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)

    model_info.append({
        'Model': model_name,
        'RMSE': rmse,
        'Parameters': model.get_params()
    })

    if rmse < best_rmse:
        best_model = model_name
        best_rmse = rmse
        best_params = model.get_params()

print(f"Best Model: {best_model}, Best RMSE: {best_rmse}, Best Parameters: {best_params}")


Best Model: RandomForest, n_estimators=50, max_depth=5, Best RMSE: 1017.3208050397591, Best Parameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 5, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 50, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [50]:

# SVM
for C in [0.1, 1, 10]:
    model_name = f"SVM, C={C}"
    model = SVR(C=C)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)

    model_info.append({
        'Model': model_name,
        'RMSE': rmse,
        'Parameters': model.get_params()
    })

    if rmse < best_rmse:
        best_model = model_name
        best_rmse = rmse
        best_params = model.get_params()

# Print the best model's information
print(f"Best Model: {best_model}, Best RMSE: {best_rmse}, Best Parameters: {best_params}")

Best Model: RandomForest, n_estimators=50, max_depth=5, Best RMSE: 1017.3208050397591, Best Parameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 5, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 50, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [54]:
best_model_info = min(model_info, key=lambda x: x['RMSE'])
best_model_info

{'Model': 'RandomForest, n_estimators=50, max_depth=5',
 'RMSE': 1017.3208050397591,
 'Parameters': {'bootstrap': True,
  'ccp_alpha': 0.0,
  'criterion': 'squared_error',
  'max_depth': 5,
  'max_features': 1.0,
  'max_leaf_nodes': None,
  'max_samples': None,
  'min_impurity_decrease': 0.0,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'min_weight_fraction_leaf': 0.0,
  'n_estimators': 50,
  'n_jobs': None,
  'oob_score': False,
  'random_state': None,
  'verbose': 0,
  'warm_start': False}}

In [55]:
# Extract best parameters from best_model_info
best_params = best_model_info['Parameters']

# Create and fit the model with the best parameters
best_model = RandomForestRegressor(**best_params)
best_model.fit(X_train, y_train)

In [56]:
# Make predictions on the test data
test_predictions = best_model.predict(test_df_scaled)


In [57]:
# Add the predictions to the original identifiers DataFrame
original_test_identifiers['Item_Outlet_Sales'] = test_predictions

# Create the output CSV
output = original_test_identifiers[['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales']]
output.to_csv('predictions.csv', index=False)


In [58]:
from google.colab import files
files.download('predictions.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>