In [None]:
# Data handling and visualisation libraries
import pandas as pd
import warnings 
warnings.filterwarnings('ignore')
import numpy as np
import seaborn as sns 
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder 

# Data splitting
from sklearn.model_selection import train_test_split


# Reading data 


In [None]:
# Load the supermarket sales sheet from the Kaggle input directory.
data = pd.read_csv('/kaggle/input/supermarket-sales/supermarket_sales - Sheet1.csv')

# Preview the first five rows with a brown background and light text.
preview_style = {'background-color': '#873600', 'color': '#E2EEF3'}
data.head().style.set_properties(**preview_style)

<a id="5"></a>
# Statistical information 

In [None]:
# (number of rows, number of columns) of the loaded frame
data.shape

In [None]:
# total number of cells in the frame (rows x columns)
data.size

In [None]:
data.info() # per-column dtype and non-null count, to spot missing values and wrong types

In [None]:
# Summary statistics of the numeric columns, shaded with an orange gradient.
numeric_summary = data.describe()
numeric_summary.style.background_gradient(cmap='Oranges')

In [None]:
# Summary statistics of the object (string) columns, rendered with the
# notebook's brown-on-light table colours.
text_summary = data.describe(include='O')
text_summary_style = {'background-color': '#873600', 'color': '#E2EEF3'}
text_summary.style.set_properties(**text_summary_style)

In [None]:
data.columns # column labels of the frame

In [None]:
data.nunique() # number of distinct values in each column

# Exploratory Data Analysis

In [None]:
# Average quantity per product line, highest first, shaded by magnitude.
mean_quantity = (
    data[['Product line', 'Quantity']]
    .groupby(['Product line'])
    .mean()
    .sort_values(by='Quantity', ascending=False)
)
mean_quantity.style.background_gradient(cmap='Oranges')

In [None]:
# Quantity per product line as a single-colour bar chart with value labels on the bars.
fig = px.histogram(
    data,
    x='Product line',
    y='Quantity',
    color_discrete_sequence=['#6E2C00'],
    text_auto=True,
)

# Plotly's update_* methods return the figure, so styling and display are chained.
(
    fig.update_layout(
        title='<b>The best selling product </b>..',
        title_font={'size': 35, 'family': 'Serif'},
        paper_bgcolor='#F6DDCC',
        plot_bgcolor='#F6DDCC',
    )
    .update_yaxes(showgrid=False)
    .show()
)

In [None]:
# Share of the total purchased quantity attributed to each gender.
fig = px.pie(data, values='Quantity', names='Gender',
             hover_data=['Quantity', 'Gender'],
             labels={'Gender': 'Gender'},
             color_discrete_sequence=px.colors.sequential.OrRd_r)

# Show the percentage and the gender label inside each slice.
fig.update_traces(textposition='inside',
                  textinfo='percent+label')

# `title_font` is the spelling used by the other charts in this notebook;
# `titlefont` is a deprecated alias that newer Plotly releases reject.
# The title's bold tag is now closed with </b> (it was opened twice before).
fig.update_layout(title='<b> Who buys more : Men or Women?</b>',
                  title_font={'size': 35, 'family': 'Serif'},
                  showlegend=True,
                  paper_bgcolor='#F6DDCC',
                  plot_bgcolor='#F6DDCC')

fig.show()

In [None]:
# Number of purchases per product line made by male customers.
# Only 'Product line' is counted, matching the women's and members' charts:
# 'Gender' is constant after the filter and only added a redundant 'Male'
# entry to every bar label.
is_male = data['Gender'] == 'Male'
male_counts = data.loc[is_male, ['Product line']].value_counts()
male_counts.plot(kind='bar', title='Interests of men')
plt.show()

In [None]:
# Number of purchases per product line made by female customers.
is_female = data['Gender'] == 'Female'
female_counts = data.loc[is_female, ['Product line']].value_counts()
female_counts.plot(kind='bar', color='pink', title='Interests of women')
plt.show()

In [None]:
# Distribution of gross income for each product line, one violin per line.
fig = px.violin(data, x='Product line', y='gross income', color='Product line')

# The legend is hidden because the x axis already names every product line.
violin_layout = {
    'title': '<b>The lowest selling product </b>..',
    'title_font': {'size': 35, 'family': 'Serif'},
    'showlegend': False,
    'paper_bgcolor': '#F6DDCC',
    'plot_bgcolor': '#F6DDCC',
}
fig.update_layout(**violin_layout)

# Drop the grid lines on both axes before rendering.
fig.update_xaxes(showgrid=False).update_yaxes(showgrid=False).show()

In [None]:
# Unit price aggregated per product line, one colour per line, with value labels.
fig = px.histogram(
    data,
    x='Product line',
    y='Unit price',
    color='Product line',
    nbins=5,
    text_auto=True,
    opacity=.8,
)

(
    fig.update_layout(
        title='<b>Distribution of product line according to unit price </b>..',
        title_font={'size': 35, 'family': 'Serif'},
        paper_bgcolor='#F6DDCC',
        plot_bgcolor='#F6DDCC',
    )
    .update_xaxes(showgrid=False)
    .update_yaxes(showgrid=False)
    .show()
)

In [None]:
# One point per sale: gross income grouped and coloured by branch.
fig = px.strip(data, x='Branch', y='gross income', color='Branch')

strip_layout = dict(
    title='<b>The most profitable branch </b>..',
    title_font={'size': 35, 'family': 'Serif'},
    paper_bgcolor='#F6DDCC',
    plot_bgcolor='#F6DDCC',
)
fig.update_layout(**strip_layout)
fig.update_yaxes(showgrid=False)
fig.show()

In [None]:
# Count of `cogs` entries for every branch/gender pair, shaded by magnitude.
branch_gender_counts = data.pivot_table(
    index='Branch',
    columns='Gender',
    values='cogs',
    aggfunc='count',
)
branch_gender_counts.style.background_gradient(cmap='Oranges')

In [None]:
# Kernel-density plot of the per-branch `cogs` counts, one curve per gender.
counts_by_branch = data.pivot_table(
    index='Branch',
    columns='Gender',
    values='cogs',
    aggfunc='count',
)
counts_by_branch.plot(kind='kde', title=' Distribution of men and women according to cogs')
plt.show()

In [None]:
# Largest `cogs` value for every product line/branch pair, shaded by magnitude.
max_cogs = data.pivot_table(
    index='Product line',
    columns='Branch',
    values='cogs',
    aggfunc='max',
)
max_cogs.style.background_gradient(cmap='Oranges')

In [None]:
# Treemap of `cogs`, nested as: all branches -> branch -> product line.
fig = px.treemap(data, path=[px.Constant('Branches'), 'Branch', 'Product line'],
                 values='cogs',
                 color_discrete_sequence=['#6E2C00', '#DC7633', '#EDBB99'])

# `title_font` is the spelling used by the other charts in this notebook;
# `titlefont` is a deprecated alias that newer Plotly releases reject.
fig.update_layout(title='<b>The best selling products  in every branch?</b> ..',
                  title_font={'size': 35, 'family': 'Serif'})

fig.show()

In [None]:
# Number of purchases per product line made by customers of type 'Member'.
is_member = data['Customer type'] == 'Member'
member_counts = data.loc[is_member, ['Product line']].value_counts()
member_counts.plot(kind='bar', color='#EDBB99', title='Interests of VIP')
plt.show()

In [None]:
# Number of sales per payment method, one colour per method.
fig = px.histogram(data, x='Payment', color='Payment')

(
    fig.update_layout(
        title='<b>The way of payment </b>..',
        title_font={'size': 35, 'family': 'Serif'},
        paper_bgcolor='#F6DDCC',
        plot_bgcolor='#F6DDCC',
    )
    .update_yaxes(showgrid=False)
    .show()
)

In [None]:
# Convert the 'Date' column to datetime so the .dt accessors can be used on it.
parsed_dates = pd.to_datetime(data['Date'])
data['Date'] = parsed_dates

In [None]:
# Add the calendar month and day of each sale as columns at positions 10 and 11.
# DataFrame.insert raises ValueError when the column already exists, so each
# insert is guarded to keep the cell safe to re-run.
if 'month' not in data.columns:
    data.insert(10, 'month', data['Date'].dt.month)
if 'day' not in data.columns:
    data.insert(11, 'day', data['Date'].dt.day)

In [None]:
# 'Total' aggregated per month, animated over the day of the month;
# product line and gender are added to the hover tooltip.
fig = px.histogram(
    data,
    x='month',
    y='Total',
    color='month',
    animation_frame='day',
    hover_data=['Product line', 'Gender'],
)

animation_layout = {
    'title': '<b>Prices changes during the days and months of 2019 </b>..',
    'title_font': {'size': 35, 'family': 'Serif'},
    'paper_bgcolor': '#F6DDCC',
    'plot_bgcolor': '#F6DDCC',
}
fig.update_layout(**animation_layout)

fig.show()

<a id="7"></a>
# Data preprocessing 

In [None]:
# number of missing values in each column
data.isna().sum()

 **<p style="color:red">Observations 📋</p>**

🔘 There are no null values

In [None]:
# Visual check for missing values: every cell of the boolean mask is drawn.
missing_mask = data.isna()
sns.heatmap(missing_mask, cmap='copper')

In [None]:
# Remove the invoice identifier and the raw time/date columns, then show
# three randomly chosen rows of the reduced frame.
data = data.drop(columns=['Invoice ID', 'Time', 'Date'])

sample_style = {'background-color': '#873600', 'color': '#E2EEF3'}
data.sample(3).style.set_properties(**sample_style)

In [None]:
# Replace every categorical column with integer codes. The same encoder
# object is re-fitted on each column in turn.
LE = LabelEncoder()

categories = [
    'Branch',
    'City',
    'Customer type',
    'Gender',
    'Product line',
    'Payment',
]
for label in categories:
    encoded = LE.fit_transform(data[label])
    data[label] = encoded

In [None]:
# Pairwise correlation between the columns, drawn as an annotated heatmap.
correlation = data.corr()

plt.figure(figsize=(15, 7))
heatmap_options = dict(
    annot=True,
    fmt='.2f',
    annot_kws={'size': 10},
    linewidths=0.5,
    cmap='copper',
)
sns.heatmap(correlation, **heatmap_options)
plt.title("Data correlations")
plt.show()

# Data Splitting

In [None]:
# Features are every column except the target; the target is 'Total'.
# NOTE(review): the features still include columns such as 'cogs' and
# 'gross income'; if 'Total' is derived from them the models see the answer
# (target leakage) - confirm against the dataset description.
y = data['Total']
x = data.drop(columns='Total')

print('The dimensions of x is : ', x.shape)
print('The dimensions of y is : ', y.shape)

In [None]:
# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
    shuffle=True,
)

In [None]:
# Report the shape of each of the four split parts.
for caption, split_part in (
    ("x train dimensions :", x_train),
    ("x test dimensions: ", x_test),
    ("y train dimensions :", y_train),
    ("y test dimensions :", y_test),
):
    print(caption, split_part.shape)

In [None]:
!pip install pycaret

In [None]:
!pip install --upgrade scipy


# Modelling

**LinearRegression**

In [None]:
# Import only the name that is used: a star import would dump every PyCaret
# regression helper into the notebook namespace and hide where names come from.
from pycaret.regression import setup

# Initialise a PyCaret regression experiment on the frame with 'Total' as target.
s = setup(data, target='Total')

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares model fitted on the training split; fit() returns
# the estimator itself, which is what LR refers to.
linear_estimator = LinearRegression()
LR = linear_estimator.fit(x_train, y_train)

In [None]:
# Model score on the train and test splits, shown as a percentage.
# Scale before rounding: round(score, 4) * 100 reintroduces binary
# floating-point noise (e.g. 89.05999999999999) into the printed value.
# The f-prefix was dropped because the string has no placeholders.
print("LR training score :", round(LR.score(x_train, y_train) * 100, 2))
print("LR testing score :", round(LR.score(x_test, y_test) * 100, 2))

In [None]:
# Linear-regression predictions for the held-out rows; reused in the evaluation cell.
LR_y_pred=LR.predict(x_test)

**XGBRegressor**

In [None]:
from xgboost import XGBRegressor

# XGBoost regressor with default hyper-parameters, fitted on the training
# split; fit() returns the estimator itself, which is what xgb refers to.
xgb_estimator = XGBRegressor()
xgb = xgb_estimator.fit(x_train, y_train)

In [None]:
# Model score on the train and test splits, shown as a percentage.
# Scale before rounding: round(score, 4) * 100 reintroduces binary
# floating-point noise (e.g. 99.99000000000001) into the printed value.
print("xgb training score :", round(xgb.score(x_train, y_train) * 100, 2))
print("xgb testing score :", round(xgb.score(x_test, y_test) * 100, 2))

In [None]:
# XGBoost predictions for the held-out rows; reused in the evaluation cell.
xgb_y_pred=xgb.predict(x_test)

**CatBoostRegressor Model**

In [None]:
from catboost import CatBoostRegressor

# CatBoost regressor with training logs silenced, fitted on the training
# split; fit() returns the estimator itself, which is what CBR refers to.
catboost_estimator = CatBoostRegressor(verbose=False)
CBR = catboost_estimator.fit(x_train, y_train)

In [None]:
# Model score on the train and test splits, shown as a percentage.
# Scale before rounding: round(score, 4) * 100 reintroduces binary
# floating-point noise (e.g. 99.99000000000001) into the printed value.
print("CBR training score :", round(CBR.score(x_train, y_train) * 100, 2))
print("CBR testing score :", round(CBR.score(x_test, y_test) * 100, 2))

In [None]:
# CatBoost predictions for the held-out rows; reused in the evaluation cell.
CBR_y_pred=CBR.predict(x_test)

**GradientBoostingRegressor Model**

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosting regressor with default hyper-parameters, fitted on the
# training split; fit() returns the estimator itself, which is what GBR refers to.
gbr_estimator = GradientBoostingRegressor()
GBR = gbr_estimator.fit(x_train, y_train)


In [None]:
# Model score on the train and test splits, shown as a percentage.
# Both lines now use the same precision (the training score was rounded to
# 3 digits, the testing score to 4), and scaling happens before rounding so
# binary floating-point noise is not printed.
print("GBR training score :", round(GBR.score(x_train, y_train) * 100, 2))
print("GBR testing score :", round(GBR.score(x_test, y_test) * 100, 2))

In [None]:
# Gradient-boosting predictions for the held-out rows; reused in the evaluation cell.
GBR_y_pred=GBR.predict(x_test)

# Model evaluation

In [None]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,median_absolute_error

In [None]:
# Test-set predictions of every fitted model, keyed by the model's short name.
models_predictions = {
    'LR': LR_y_pred,
    'xgb': xgb_y_pred,
    'CBR': CBR_y_pred,
    'GBR': GBR_y_pred,
}
# Error metrics to report, keyed by the label that is printed.
metrics = {
    'mean_absolute_error': mean_absolute_error,
    'mean_squared_error': mean_squared_error,
    'median_absolute_error': median_absolute_error,
}

# Print every metric (rounded to two decimals) for every model.
for model_name, predicted in models_predictions.items():
    print(model_name, 'Model :-', '\n')
    for metric_name, metric_fn in metrics.items():
        print(metric_name, ' Value is : ', round(metric_fn(y_test, predicted), 2), '\n\n')
        
    


In [None]:
import shap

# Explain the gradient-boosting model on the held-out rows and plot the
# SHAP bar chart, limited to 15 entries.
gbr_explainer = shap.Explainer(GBR)
shap_values = gbr_explainer(x_test)
shap.plots.bar(shap_values, max_display=15)