# Introduction

A retail company, “ABC Private Limited”, wants to understand customer purchase behaviour (specifically, the purchase amount) across products of different categories. They have shared a purchase summary of various customers for selected high-volume products from the last month.
<br>The data set also contains customer demographics (age, gender, marital status, city_type, stay_in_current_city), product details (product_id and product category) and the total purchase_amount from the last month.<br>

Now, they want to build a model that predicts the purchase amount of a customer for various products, which will help them create personalized offers for customers on different products.

# Import Packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

# Loading Dataset

In [None]:
# Load the training and test CSV files from the Kaggle input directory
df = pd.read_csv("/kaggle/input/black-friday-sales-prediction/train_oSwQCTC (1)/train.csv")
df_test = pd.read_csv("/kaggle/input/black-friday-sales-prediction/test_HujdGe7 (1)/test.csv")

# Data Info

In [None]:
# Display the first rows of the training data
df.head()

In [None]:
# Print the column names, dtypes and non-null counts of the training data
df.info()

In [None]:
# Summary statistics of the training data
df.describe()

In [None]:
# Summary statistics of the object-dtype columns, transposed
df.describe(include='object').T

In [None]:
# Dimensions of the training data as (rows, columns)
df.shape

In [None]:
# Count the missing values in each column of the training data
df.isnull().sum()

In [None]:
# Count the missing values in each column of the test data
df_test.isnull().sum()

In [None]:
# Count the duplicated rows in the training data
df.duplicated().sum()

# Data Preprocessing

In [None]:
# Split the training data by dtype: object (categorical) columns go to
# df_cat, every other column goes to df_num
df_cat = df.select_dtypes(include='object')
df_num = df.select_dtypes(exclude='object')

In [None]:
# Collect and display the names of the categorical (object-dtype) columns
cat_names = df_cat.columns
cat_names

In [None]:
# For every categorical column, print how often each value occurs,
# followed by a separator line
separator = '-' * 100
for column in cat_names:
    print(df[column].value_counts())
    print(separator)

In [None]:
# Product_Category_3 has a very high number of NaNs, so the whole column
# is removed from both the training and the test frame (in place)
for frame in (df, df_test):
    frame.drop(columns='Product_Category_3', inplace=True)

In [None]:
# Display the distribution of Product_Category_2 (15 bins, with a KDE curve)
sns.displot(df['Product_Category_2'], kde=True, bins=15)

In [None]:
# Fill the NaNs in Product_Category_2 with the median of the TRAINING data.
# The same value is reused for the test set so that both frames are imputed
# identically and no statistic is derived from the test data.
pc2_median = df['Product_Category_2'].median()
df['Product_Category_2'] = df['Product_Category_2'].fillna(pc2_median)
df_test['Product_Category_2'] = df_test['Product_Category_2'].fillna(pc2_median)

In [None]:
# Check whether any null value remains in the training data
# (True means at least one null is still present)
df.isnull().values.any()

In [None]:
# Check whether any null value remains in the test data
df_test.isnull().values.any()

# Data Visualization

In [None]:
# Bar chart of the number of rows per Gender value
sns.countplot(x= df['Gender'])

In [None]:
# Bar chart of the number of rows per City_Category value
sns.countplot(x= df['City_Category'])

In [None]:
# Bar chart of the number of rows per Stay_In_Current_City_Years value
sns.countplot(x= df['Stay_In_Current_City_Years'])

In [None]:
# Bar chart of the number of rows per Age bracket
sns.countplot(x= df['Age'])

In [None]:
# Drop the identifier columns from both frames; they are not used as features
id_columns = ['User_ID', 'Product_ID']
df = df.drop(columns=id_columns)
df_test = df_test.drop(columns=id_columns)

In [None]:
# Ordinal encodings for the bracketed categorical columns.
# Age brackets are numbered from 1 in ascending order of age.
Age_mapping = {
    bracket: code
    for code, bracket in enumerate(
        ['0-17', '18-25', '26-35', '36-45', '46-50', '51-55', '55+'],
        start=1,
    )
}
# Years in the current city are numbered from 0; '4+' is encoded as 4.
Stay_In_Current_City_Years_mapping = {
    years: code
    for code, years in enumerate(['0', '1', '2', '3', '4+'])
}

In [None]:
# Encode Age, Stay_In_Current_City_Years and Gender as integers,
# applying the same mappings to the training and the test frame
gender_codes = {'F':0, 'M':1}
for frame in (df, df_test):
    frame['Age'] = frame['Age'].map(Age_mapping)
    frame['Stay_In_Current_City_Years'] = frame['Stay_In_Current_City_Years'].map(
        Stay_In_Current_City_Years_mapping
    )
    frame['Gender'] = frame['Gender'].map(gender_codes)

In [None]:
# One-hot encode City_Category; the first category is dropped on the
# training data to avoid a redundant column.
CC_df = pd.get_dummies(df['City_Category'], prefix='City_Category', drop_first=True).astype(int)
# The test data is encoded against the TRAINING columns: all dummies are
# generated first and then reindexed, so the test frame always ends up with
# exactly the same dummy columns (in the same order) as the training frame,
# even if a category is missing from, or only present in, the test data.
CC_df_test = (
    pd.get_dummies(df_test['City_Category'], prefix='City_Category')
    .reindex(columns=CC_df.columns, fill_value=0)
    .astype(int)
)

In [None]:
# Preview the dummy columns created from City_Category for the training data
CC_df.head()

In [None]:
# Append the City_Category dummy columns, remove the original text column
# and renumber the rows from 0 -- first for the training frame ...
df = (
    pd.concat([df, CC_df], axis=1)
    .drop(columns='City_Category')
    .reset_index(drop=True)
)

# ... then for the test frame
df_test = (
    pd.concat([df_test, CC_df_test], axis=1)
    .drop(columns='City_Category')
    .reset_index(drop=True)
)

In [None]:
# Preview the training data after the dummy columns were added
df.head()

In [None]:
# Verify that no object-dtype column remains in the training data
df.info()

In [None]:
# Derive a new feature in both frames: the encoded years in the current
# city divided by the encoded age bracket
for frame in (df, df_test):
    frame['Stay_In_Current_City_Years_Per_Age'] = (
        frame['Stay_In_Current_City_Years'] / frame['Age']
    )

In [None]:
# Scatter plots of Purchase against the two product-category columns.
# `height` is the current name of seaborn's former `size` argument.
sns.pairplot(df, x_vars=['Product_Category_1', 'Product_Category_2'], y_vars='Purchase', height=4, aspect=0.7, kind='scatter')

In [None]:
# View the pairwise correlations between the columns of the training data
df.corr()

In [None]:
# Draw the correlation matrix as a heatmap with the value written in each cell
plt.figure(figsize=(10,10))
sns.heatmap(df.corr(), annot=True)

In [None]:
# Rank the columns by the absolute value of their correlation with Purchase
df_corr = df.corr()
target = df_corr['Purchase'].abs()
ranked = target.sort_values(ascending=False)
print(ranked, "\n")

In [None]:
# Remove the Stay_In_Current_City_Years column from both frames
stay_column = ['Stay_In_Current_City_Years']
df = df.drop(columns=stay_column)
df_test = df_test.drop(columns=stay_column)

# View Outliers

In [None]:
# Box plot of Occupation to inspect outliers
df.boxplot(column='Occupation')

In [None]:
# Box plot of Product_Category_1 to inspect outliers
df.boxplot(column='Product_Category_1')

In [None]:
# Box plot of Product_Category_2 to inspect outliers
df.boxplot(column='Product_Category_2')

# Data Split

In [None]:
# Separate the target (Purchase) from the feature columns
target_column = 'Purchase'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Hold out 20% of the rows for evaluation; random_state is fixed so the
# split is the same on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalization

In [None]:
def normalize(x, x_min=None, x_max=None):
    """Min-max scale ``x`` column-wise.

    Parameters
    ----------
    x : the data to scale (any object supporting ``.min()``, ``.max()``
        and element-wise arithmetic, e.g. a DataFrame).
    x_min, x_max : optional reference minimum / maximum. When omitted they
        are computed from ``x`` itself. Pass the statistics of the training
        data to scale other data sets with exactly the same transformation.

    Returns
    -------
    ``(x - x_min) / (x_max - x_min)``
    """
    if x_min is None:
        x_min = x.min()
    if x_max is None:
        x_max = x.max()
    x_norm = (x - x_min) / (x_max - x_min)
    return x_norm


# The scaling statistics are taken from the training split only and reused
# for the validation split and the submission data. Scaling each frame with
# its own min/max would map the same raw value to different scaled values,
# so the model would see inconsistently encoded features.
train_min = X_train.min()
train_max = X_train.max()

X_train = normalize(X_train, train_min, train_max)
X_test = normalize(X_test, train_min, train_max)

df_test = normalize(df_test, train_min, train_max)

# Models Tuning 

### Tuning RandomForestRegressor

In [None]:
# Fit a random forest with default hyper-parameters on the training split
# and report its error and R-squared on the held-out split
rf = RandomForestRegressor().fit(X_train, y_train)
y_pred = rf.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
for label, score in (("Mean Squared Error:", mse), ("R-squared:", r2)):
    print(label, score)

### Tuning XGBRegressor

In [None]:
# Fit an XGBoost regressor with default hyper-parameters on the training
# split and report its error and R-squared on the held-out split
xgb = XGBRegressor().fit(X_train, y_train)
y_pred = xgb.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
for label, score in (("Mean Squared Error:", mse), ("R-squared:", r2)):
    print(label, score)

### Tuning VotingRegressor

In [None]:
# Combine the random forest and the XGBoost regressor in a voting ensemble,
# fit it on the training split and report its scores on the held-out split
vr = VotingRegressor([('rf', rf), ('xgb', xgb)]).fit(X_train, y_train)
y_pred = vr.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
for label, score in (("Mean Squared Error:", mse), ("R-squared:", r2)):
    print(label, score)

# Saving the Best Model

In [None]:
# Persist the voting ensemble to disk with joblib.
# vr is assigned as the best model here; this cell does not compare the
# models itself.
best_model = vr
joblib.dump(best_model, 'vr_BlackFridaySales.pkl')

# Predict test data for submission

In [None]:
# Make predictions on the normalized test data.
# Only the feature columns used for training are passed to the model, in
# training order. Selecting them explicitly keeps this cell re-runnable:
# after the first run df_test also contains the Purchase column, which must
# not be fed back into predict().
df_test['Purchase'] = vr.predict(df_test[X_train.columns])

In [None]:
# Load the sample submission file from the Kaggle input directory
sample_submission = pd.read_csv('/kaggle/input/black-friday-sales-prediction/sample_submission_V9Inaty.csv')

In [None]:
# Copy the predictions into the submission frame.
# NOTE(review): the assignment aligns on the row index, so this presumably
# relies on both frames sharing the same default 0..n-1 index -- confirm.
sample_submission['Purchase'] = df_test['Purchase'] 

In [None]:
# Preview the submission frame
sample_submission.head()

In [None]:
# Write the submission file without the row index; by default pandas would
# add the index as an extra, unnamed first column.
sample_submission.to_csv('Submission.csv', index=False)