# Black Friday Sales Prediction

In [1]:
# Import Libraries
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression

In [2]:
# Load the training and test CSV files into DataFrames
train, test = (
    pd.read_csv('dataset/train.csv'),
    pd.read_csv('dataset/test.csv'),
)

In [3]:
# Preview the first 20 rows of the training data
train.head(n=20)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969
5,1000003,P00193542,M,26-35,15,A,3,0,1,2.0,,15227
6,1000004,P00184942,M,46-50,7,B,2,1,1,8.0,17.0,19215
7,1000004,P00346142,M,46-50,7,B,2,1,1,15.0,,15854
8,1000004,P0097242,M,46-50,7,B,2,1,1,16.0,,15686
9,1000005,P00274942,M,26-35,20,A,1,1,8,,,7871


In [4]:
# Dimensions of the training frame as (rows, columns)
train.shape

(550068, 12)

In [5]:
# Inspect the Python type of the first "Age" value in the training data
type(train['Age'].iloc[0])

str

## Feature Engineering

In [6]:
# Preview the first 10 rows of the test data
test.head(n=10)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,1000004,P00128942,M,46-50,7,B,2,1,1,11.0,
1,1000009,P00113442,M,26-35,17,C,0,0,3,5.0,
2,1000010,P00288442,F,36-45,1,B,4+,1,5,14.0,
3,1000010,P00145342,F,36-45,1,B,4+,1,4,9.0,
4,1000011,P00053842,F,26-35,1,C,1,0,4,5.0,12.0
5,1000013,P00350442,M,46-50,1,C,3,1,2,3.0,15.0
6,1000013,P00155442,M,46-50,1,C,3,1,1,11.0,15.0
7,1000013,P0094542,M,46-50,1,C,3,1,2,4.0,9.0
8,1000015,P00161842,M,26-35,7,A,1,0,10,13.0,16.0
9,1000022,P00067942,M,18-25,15,A,4+,0,5,14.0,


In [7]:
# Encode the "Age" bracket labels as integer codes.
# The encoder learns its label set from the training column only and the same
# mapping is then applied to both frames.
le = LabelEncoder()
le.fit(train['Age'])

# Cast the codes to int explicitly so "Age" is a plain integer feature -
# the age bracket may correlate with purchasing pattern.
train['Age'] = le.transform(train['Age']).astype(int)
test['Age'] = le.transform(test['Age']).astype(int)

In [8]:
# Encode the "City_Category" labels as integer codes:
# fit on the training column, then reuse the fitted mapping for the test column.
le = LabelEncoder()
train['City_Category'] = train['City_Category'].pipe(le.fit_transform)
test['City_Category'] = test['City_Category'].pipe(le.transform)

In [9]:
# Preview the training data after encoding "Age" and "City_Category"
train.head(n=5)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0,10,0,2,0,3,,,8370
1,1000001,P00248942,F,0,10,0,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0,10,0,2,0,12,,,1422
3,1000001,P00085442,F,0,10,0,2,0,12,14.0,,1057
4,1000002,P00285442,M,6,16,2,4+,0,8,,,7969


In [10]:
# Preview the test data after encoding "Age" and "City_Category"
test.head(n=5)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,1000004,P00128942,M,4,7,1,2,1,1,11.0,
1,1000009,P00113442,M,2,17,2,0,0,3,5.0,
2,1000010,P00288442,F,3,1,1,4+,1,5,14.0,
3,1000010,P00145342,F,3,1,1,4+,1,4,9.0,
4,1000011,P00053842,F,2,1,2,1,0,4,5.0,12.0


In [11]:
# Handling "Stay_In_Current_City_Years" feature
#
# The raw column holds strings, and the open-ended bucket is written as '4+'.
# Strip the trailing '+' and cast to int so the feature is numeric, the same
# treatment as the "Age" feature.
#
# astype(str) comes first so the cell can be re-run safely after the column
# has already been converted to int.
train['Stay_In_Current_City_Years'] = (
    train['Stay_In_Current_City_Years'].astype(str).str.rstrip('+').astype(int)
)
test['Stay_In_Current_City_Years'] = (
    test['Stay_In_Current_City_Years'].astype(str).str.rstrip('+').astype(int)
)

In [12]:
# Preview the first 10 training rows after cleaning "Stay_In_Current_City_Years"
train.head(n=10)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0,10,0,2,0,3,,,8370
1,1000001,P00248942,F,0,10,0,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0,10,0,2,0,12,,,1422
3,1000001,P00085442,F,0,10,0,2,0,12,14.0,,1057
4,1000002,P00285442,M,6,16,2,4,0,8,,,7969
5,1000003,P00193542,M,2,15,0,3,0,1,2.0,,15227
6,1000004,P00184942,M,4,7,1,2,1,1,8.0,17.0,19215
7,1000004,P00346142,M,4,7,1,2,1,1,15.0,,15854
8,1000004,P0097242,M,4,7,1,2,1,1,16.0,,15686
9,1000005,P00274942,M,2,20,0,1,1,8,,,7871


In [13]:
# Spot-check the element type of "Stay_In_Current_City_Years" after the int cast
type(train['Stay_In_Current_City_Years'].iloc[8])

numpy.int64

In [14]:
# Handling missing values
#
# Features Product_Category_2 and Product_Category_3 have missing values.
# Fill them with the numeric sentinel 999 rather than the string '999', so the
# columns stay numeric instead of becoming object columns that mix floats and
# strings.
train = train.fillna(999)
test = test.fillna(999)

# The test frame must be free of missing values as well
assert not test.isnull().any().any(), "test still contains missing values"

# Per-column confirmation that no missing values remain in the training frame
train.isnull().any()

User_ID                       False
Product_ID                    False
Gender                        False
Age                           False
Occupation                    False
City_Category                 False
Stay_In_Current_City_Years    False
Marital_Status                False
Product_Category_1            False
Product_Category_2            False
Product_Category_3            False
Purchase                      False
dtype: bool

In [15]:
# Product_Category_1 holds int values whereas Product_Category_2 and
# Product_Category_3 hold float values; cast the latter two to int so that
# all three category columns share one type.
train = train.astype({'Product_Category_2': int, 'Product_Category_3': int})
test = test.astype({'Product_Category_2': int, 'Product_Category_3': int})

# Spot-check the resulting element type
type(train['Product_Category_2'].iloc[4])

numpy.int64

In [16]:
# Encode the 'Gender' labels as integer codes:
# fit on the training column, then apply the same mapping to the test column.
le = LabelEncoder()
train['Gender'], test['Gender'] = (
    le.fit_transform(train['Gender']),
    le.transform(test['Gender']),
)

In [17]:
# Converting 'Product_ID' values to integer
# le = LabelEncoder()
# uniq_train_Product_ID = train.Product_ID.unique()
# uniq_test_Product_ID = test.Product_ID.unique()

# train['Product_ID'] = le.fit_transform(train['Product_ID'])
# test['Product_ID'] = le.transform(test['Product_ID'])

In [18]:
# train.head()
# # test.head()

In [19]:
# # Creating Train and Test sets for modeling
# X = train.iloc[:, :-1]
# y = train.iloc[:, -1]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## Modeling

In [22]:
# Model: Linear Regression
# lr_model = LinearRegression()
# lr_model.fit(X_train, y_train)
# lr_y_pred = lr_model.predict(X_test)

# score = mean_squared_error(y_test, lr_y_pred, squared=False)
# score