**Question:** Predicting Shopping Mall Sales. You will have to create a model to predict
revenue. Identify the model with the best params. Target Column -
Revenue. Please note: Visualisation is mandatory. You will receive 0 marks
if you do not add visualisation.

Data Link: https://github.com/edyoda/data-science-complete-tutorial/blob/master/Data/Shopping_Revenue.csv

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder,MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, cross_val_score,KFold
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Load the shopping-revenue dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path — adjust it
# to wherever your copy of the CSV lives.
csv_path = r'C:\Users\User\Desktop\Data Science Project\shopping_revenue.csv'
df = pd.read_csv(csv_path)

# Preview the first rows.
df.head()

In [None]:
# "Id" is treated as a plain row index (per the original author), so it is
# removed from the frame in place.
df.drop(columns="Id", inplace=True)

In [None]:
# Preview the first rows of the current frame.
df.head()

In [None]:
# (rows, columns) of the current frame.
df.shape

In [None]:
# Column names, dtypes and non-null counts.
df.info()

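From the 'Open Date' column we can extract the opening year; the new 'Established Year' column stores the number of years between that opening year and 2023.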

In [None]:
# Split 'Open Date' on '/' and take the third component as the opening year.
date_parts = df['Open Date'].str.split('/', expand=True)
opening_year = date_parts[2].map(int)

# Despite its name, the new column holds the number of years between the
# opening year and 2023 (the reference year is hard-coded).
df['Established Year'] = 2023 - opening_year

# The raw date string is no longer needed.
df.drop(columns='Open Date', inplace=True)

In [None]:
# Re-check column names, dtypes and non-null counts of the current frame.
df.info()

In [None]:
# Missing-value count per column; show the five columns with the most gaps.
missing_per_column = df.isnull().sum()
missing_per_column.sort_values(ascending=False).head(5)

Columns P6 and P7 have missing values. As the dataset is already small and neither column is that important for our predictions, we can drop both columns.

In [None]:
# Remove the two columns that contain missing values.
df.drop(columns=['P6', 'P7'], inplace=True)

Let's look at the correlations between the columns.

In [None]:
# Pairwise correlation between the numeric columns.
# At this point the frame still contains the categorical columns ('City',
# 'City Group', 'Type' — presumably text), which are only dropped or encoded
# in later cells. pandas >= 2.0 raises on non-numeric columns in corr()
# instead of silently skipping them, so restrict to numeric dtypes explicitly.
correlation = df.select_dtypes(include='number').corr()
correlation

In [None]:
# Large canvas so the many column labels stay readable.
plt.figure(figsize=(16, 10))

# Hide the strict upper triangle so each pair of columns is drawn only once.
mask = np.triu(np.ones_like(correlation, dtype=bool), k=1)

# NOTE(review): `fmt` only formats cell annotations, and `annot` is not
# enabled here, so no values are printed on the cells.
sns.heatmap(correlation, mask=mask, fmt='.2f', cmap='coolwarm')

In [None]:
## Categorical data analysis
# Number of distinct values in each categorical column.
distinct_counts = {
    name: df[name].nunique() for name in ('City', 'City Group', 'Type')
}
city_count = distinct_counts['City']
city_group_count = distinct_counts['City Group']
type_count = distinct_counts['Type']

print(f'No of distinct cities: {city_count}')
print(f'No of distinct city groups: {city_group_count}')
print(f'No of distinct types : {type_count}')

Based on the above information, we drop the 'City' column, as many of the cities have only one data point, which doesn't really help us at all. Then we analyse sales revenue based on the 'City Group' and 'Type' columns.

In [None]:
# Remove the 'City' column in place.
df.drop(columns=['City'], inplace=True)

In [None]:
# Bar chart of the mean revenue for each city group.
df.groupby('City Group')[['revenue']].mean().plot.bar()

As we can see here, 'Big Cities' have a higher mean revenue.

In [None]:
# Bar chart of the mean revenue for each type.
df.groupby(['Type'])[['revenue']].mean().plot.bar()

As we can see here, 'FC' has the highest mean revenue, followed by 'IL' and then 'DT'.

In [None]:
# Bar chart of the mean revenue for every (city group, type) combination.
df.groupby(['City Group', 'Type'])[['revenue']].mean().plot.bar()

First of all, there is no type 'DT' in the case of 'other' cities. Also, all types earn more revenue in 'big cities'. Therefore, we must include these categorical variables as part of our model.

In [None]:
# One-hot encode 'Type'; drop_first=True omits the first category's column,
# which is implied when all remaining indicators are zero.
# NOTE(review): the name `type` shadows the Python builtin. It is kept here
# because a later cell concatenates this variable by that name.
type = pd.get_dummies(df['Type'],drop_first=True)
type

In [None]:
# One-hot encode 'City Group' and keep only the 'Big Cities' indicator column.
city_group_dummies = pd.get_dummies(df['City Group'])
city = city_group_dummies['Big Cities']
city

In [None]:
# Append the encoded indicator columns, then remove the raw categorical
# columns they were derived from.
df = pd.concat([df, city, type], axis='columns')
df.drop(columns=['Type', 'City Group'], inplace=True)

In [None]:
# Show five randomly chosen rows (no seed is passed, so the selection
# differs between runs).
df.sample(5)

In [None]:
# Separate the prediction target from the feature matrix.
target = 'revenue'

X = df.drop(columns=target)   # every column except the target
features = X.columns
y = df[target]

In [None]:
# Preview the first rows of the feature matrix.
X.head()

In [None]:
# Preview the first values of the target.
y.head()

Train Test Split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report the shapes of the four pieces produced by the split, one per line.
print(
    f'feature training dataset shape: {X_train.shape}',
    f'feature test dataset shape: {X_test.shape}',
    f'output training dataset shape: {y_train.shape}',
    f'output test dataset shape: {y_test.shape}',
    sep='\n',
)

In [None]:
## Baseline
# Naive model: always predict the mean training revenue. The printed score is
# its mean absolute error measured on the training set.
y_baseline = y_train.mean()
baseline_predictions = np.full(len(y_train), y_baseline)
baseline_mae = mean_absolute_error(y_train, baseline_predictions)
print('Baseline score = {}'.format(baseline_mae))

Models

1. LinearRegression

In [None]:
# Ordinary least-squares linear regression with default settings.
LR_model = LinearRegression()

In [None]:
# Sweep the number of features kept by recursive feature elimination (RFE)
# and record the train/test MAE of a linear regression for each subset size.
scores = []

# Derive the upper bound from the data instead of hard-coding it, so the sweep
# always covers every possible subset size (1 .. number of feature columns).
n_total_features = X_train.shape[1]

for n_selected in range(1, n_total_features + 1):
    rfe = RFE(LR_model, n_features_to_select=n_selected)
    rfe.fit(X_train, y_train)

    # Restrict both splits to the columns RFE kept.
    cols = X_train.columns[rfe.support_]
    X_train_temp = X_train[cols]
    X_test_temp = X_test[cols]

    LR_model.fit(X_train_temp, y_train)

    # Metric called as (y_true, y_pred); MAE itself is symmetric.
    train_mae = mean_absolute_error(y_train, LR_model.predict(X_train_temp))
    test_mae = mean_absolute_error(y_test, LR_model.predict(X_test_temp))
    scores.append((n_selected, train_mae, test_mae))

# Entries are (n_features, train MAE, test MAE), ordered by train MAE.
# NOTE(review): ranking by training error tends to favour larger subsets;
# compare against the test-MAE column before settling on a size.
scores.sort(key=lambda x: x[1])
scores

In [None]:
# Refit using a 25-feature subset (presumably chosen from the sweep results).
rfe = RFE(LR_model, n_features_to_select=25).fit(X_train, y_train)
cols = X_train.columns[rfe.support_]
X_train_temp, X_test_temp = X_train[cols], X_test[cols]
LR_model.fit(X_train_temp, y_train)

In [None]:
# Test-set MAE of the linear regression on the selected features.
mean_absolute_error(y_test, LR_model.predict(X_test_temp))

2. Ridge Regression

In [None]:
# Report the shapes of the train/test pieces again, one per line.
print(
    f'feature training dataset shape: {X_train.shape}',
    f'feature test dataset shape: {X_test.shape}',
    f'output training dataset shape: {y_train.shape}',
    f'output test dataset shape: {y_test.shape}',
    sep='\n',
)

In [None]:
# Ridge regression with default settings, evaluated over every RFE subset size.
m2 = Ridge()
scores = []

# Derive the upper bound from the data instead of hard-coding it, so the sweep
# always covers every possible subset size (1 .. number of feature columns).
n_total_features = X_train.shape[1]

for n_selected in range(1, n_total_features + 1):
    rfe = RFE(m2, n_features_to_select=n_selected)
    rfe.fit(X_train, y_train)

    # Restrict both splits to the columns RFE kept.
    cols = X_train.columns[rfe.support_]
    X_train_temp = X_train[cols]
    X_test_temp = X_test[cols]

    m2.fit(X_train_temp, y_train)

    # Metric called as (y_true, y_pred); MAE itself is symmetric.
    train_mae = mean_absolute_error(y_train, m2.predict(X_train_temp))
    test_mae = mean_absolute_error(y_test, m2.predict(X_test_temp))
    scores.append((n_selected, train_mae, test_mae))

# Entries are (n_features, train MAE, test MAE), ordered by train MAE.
# NOTE(review): ranking by training error tends to favour larger subsets;
# compare against the test-MAE column before settling on a size.
scores.sort(key=lambda x: x[1])
scores

In [None]:
# Select 26 features with RFE driven by a fresh Ridge estimator, then refit
# m2 on that subset.
rfe = RFE(Ridge(), n_features_to_select=26).fit(X_train, y_train)
cols = X_train.columns[rfe.support_]
X_train_temp, X_test_temp = X_train[cols], X_test[cols]
m2.fit(X_train_temp, y_train)

In [None]:
# Test-set MAE of the Ridge model on the selected features.
mean_absolute_error(y_test, m2.predict(X_test_temp))

3. Decision tree

In [None]:
# Decision-tree regressor; the fixed seed makes the fitted tree reproducible.
DT_model = DecisionTreeRegressor(random_state=42)

In [None]:
# Search grid for the decision tree: maximum depths 1 through 24.
# NOTE: `params` is reassigned by later cells for the other models.
params = {'max_depth': range(1, 25)}
params

In [None]:
# 5-fold grid search over the tree depth.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score rather than by the MAE reported below — confirm
# this is intended.
tree = GridSearchCV(estimator=DT_model, param_grid=params, cv=5, verbose=True)
tree.fit(X_train, y_train)

In [None]:
# Training-set MAE of the tuned decision tree.
mean_absolute_error(y_train, tree.predict(X_train))

In [None]:
# Test-set MAE of the tuned decision tree.
mean_absolute_error(y_test, tree.predict(X_test))

4. Random Forest Regressor

In [None]:
# Random-forest regressor. A fixed seed (the same one used for the decision
# tree and the train/test split) makes the bootstrap samples and feature
# draws, and therefore the reported scores, reproducible between runs.
RFR_model = RandomForestRegressor(random_state=42)
RFR_model

In [None]:
# Search grid for the random forest: 10..90 trees in steps of 10, and maximum
# depths 10 through 20.
params={'n_estimators':range(10,100,10),'max_depth':range(10,21)}

In [None]:
# 5-fold grid search over the random-forest grid.
rf = GridSearchCV(estimator=RFR_model, param_grid=params, cv=5, verbose=True)
rf.fit(X_train, y_train)

In [None]:
# Training-set MAE of the tuned random forest.
mean_absolute_error(y_train, rf.predict(X_train))

In [None]:
# Test-set MAE of the tuned random forest.
mean_absolute_error(y_test, rf.predict(X_test))

5. Gradient Boosting Regressor

In [None]:

# Gradient-boosting regressor. A fixed seed (the same one used for the
# decision tree and the train/test split) makes the fitted trees, and
# therefore the reported scores, reproducible between runs.
GDR_model = GradientBoostingRegressor(random_state=42)
GDR_model

In [None]:
# Search grid for gradient boosting: 9 tree counts x 11 depths x 7 learning
# rates = 693 candidate combinations.
params={'n_estimators':range(10,100,10),'max_depth':range(10,21),'learning_rate': [0.01,0.05,0.1,0.2,0.3,0.4,0.5]}

In [None]:
# 5-fold grid search over the gradient-boosting grid.
gb = GridSearchCV(estimator=GDR_model, param_grid=params, cv=5, verbose=True)
gb.fit(X_train, y_train)

In [None]:

# Training-set MAE of the tuned gradient-boosting model.
mean_absolute_error(y_train, gb.predict(X_train))

In [None]:
# Test-set MAE of the tuned gradient-boosting model.
mean_absolute_error(y_test, gb.predict(X_test))

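As we can see, out of all the models used, Random Forest gives the best result. (Note: the cells below nevertheless inspect and refit the Gradient Boosting model; check the test MAE values above to confirm which model is actually best.)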

In [None]:
# Show the gradient-boosting estimator that the grid search selected.
gb.best_estimator_

In [None]:

# Final model with hard-coded hyper-parameters.
# NOTE(review): these values are presumably copied from an earlier
# `gb.best_estimator_` output — re-check them whenever the grid search is
# re-run. The markdown above names Random Forest as the best model, yet a
# gradient-boosting model is built here; confirm which one is intended.
# A fixed seed keeps the fitted model and its feature importances stable
# between runs.
best_model = GradientBoostingRegressor(
    learning_rate=0.01,
    max_depth=20,
    n_estimators=20,
    random_state=42,
)

In [None]:
# Train the final model on the full training split.
best_model.fit(X_train,y_train)

In [None]:
# Pair every feature name with its importance from the fitted model and sort
# ascending (least important first).
features = X_train.columns
feat_imp = best_model.feature_importances_
importances = pd.Series(feat_imp, index=features).sort_values()

In [None]:

## best features
# After the ascending sort, the last ten entries are the ten most important
# features; draw them as horizontal bars.
importances.tail(10).plot.barh()