# Big Mart Sales Prediction

### The supermarket chain wants us to predict item sales using data from several of its stores

## Importing the Dependencies

In [None]:
pip install xgboost

In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn import metrics

In [None]:
# Loading Dataset
df=pd.read_csv('Bigmartdata.csv')

In [None]:
# Display first 5 rows
df.head()

In [None]:
# shape of the datset
df.shape

#### There are 5681 rows and 12 columns in the dataset

In [None]:
# Summary of the dataset: column names, non-null counts and dtypes
df.info()

### Categorical features are
  Item_Identifier,  
  Item_Fat_Content,      
  Item_Type,          
  Outlet_Identifier,        
  Outlet_Size ,       
  Outlet_Location_Type,          
  Outlet_Type

In [None]:
# Count the missing values in each column
df.isnull().sum()

##### Item_Weight has 1463 missing values 
##### Outlet_Size has 2410 missing values

#### Replace the float type missing values with mean and object type missing values with mode

In [None]:
df['Item_Weight'].mean()

In [None]:
df['Item_Weight'].fillna(df['Item_Weight'].mean(), inplace=True)

In [None]:
df['Outlet_Size'].mode()

In [None]:
df['Outlet_Size'].fillna(df['Outlet_Size'].mode()[0], inplace=True)

In [None]:
## Re-check the per-column missing-value counts after imputation
df.isnull().sum()

#### There are no missing values now

## Data Analysis

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numerical columns
df.describe()

## Data visualization

In [None]:
# For numerical features

In [None]:
# Apply seaborn's default theme to the plots that follow
sns.set()

In [None]:
## Item_Weight distribution
# distplot is deprecated in seaborn; histplot with a KDE overlay on a density
# scale draws the equivalent histogram + density curve.
sns.histplot(df['Item_Weight'], kde=True, stat='density', color='blue')

#### The mean weight is 12.85, and the largest number of items have a weight around 12.80

In [None]:
## Item_Visibility distribution
# distplot is deprecated in seaborn; histplot with a KDE overlay on a density
# scale draws the equivalent histogram + density curve.
sns.histplot(df['Item_Visibility'], kde=True, stat='density', color='blue')

#### Item_Visibility is positively skewed

In [None]:
## Item_MRP distribution
# distplot is deprecated in seaborn; histplot with a KDE overlay on a density
# scale draws the equivalent histogram + density curve.
sns.histplot(df['Item_MRP'], kde=True, stat='density', color='blue')

#### Most items have an MRP around 50, around 100, or between 150 and 200

In [None]:
## Item_Outlet_Sales distribution
# distplot is deprecated in seaborn; histplot with a KDE overlay on a density
# scale draws the equivalent histogram + density curve.
sns.histplot(df['Item_Outlet_Sales'], kde=True, stat='density', color='blue')

#### Item_Outlet_Sales is positively skewed; most items have sales of less than 2000

In [None]:
## Outlet_Establishment_Year distribution
# Pass the column as x=...: newer seaborn releases take `data` as the only
# positional parameter, so a positional column name raises a TypeError.
sns.countplot(x='Outlet_Establishment_Year', data=df)

#### The largest number of outlets were established in 1985 and the fewest in 1998; for the other years, the same number of outlets were established.

### Visualization of categorical features

In [None]:
## Item_Fat_Content
# Pass the column as x=...: newer seaborn releases take `data` as the only
# positional parameter, so a positional column name raises a TypeError.
sns.countplot(x='Item_Fat_Content', data=df)

In [None]:
# Item_Type
# Wide figure so the x-axis labels of the many item types have room
plt.figure(figsize=(20, 6))
# Pass the column as x=...: newer seaborn releases take `data` as the only
# positional parameter, so a positional column name raises a TypeError.
sns.countplot(x='Item_Type', data=df)

#### There are many different item types; the most common are fruits and vegetables and snack foods, and the least common is seafood

In [None]:
# Outlet_Size
# Pass the column as x=...: newer seaborn releases take `data` as the only
# positional parameter, so a positional column name raises a TypeError.
sns.countplot(x='Outlet_Size', data=df)

#### There are three outlet sizes; most outlets are Medium

## Data Preprocessing

In [None]:
df.head()

In [None]:
df['Item_Fat_Content'].value_counts()

In [None]:
## LF , low fat and Low Fat are same type and Regular and reg is same type
df.replace({'Item_Fat_Content':{'low fat':'Low Fat','LF':'Low Fat', 'reg':'Regular'}}, inplace=True)

In [None]:
df['Item_Fat_Content'].value_counts()

## Label Encoder

In [None]:
encoder=LabelEncoder()

In [None]:
df['Item_Identifier']=encoder.fit_transform(df['Item_Identifier'])
df['Item_Fat_Content']=encoder.fit_transform(df['Item_Fat_Content'])
df['Item_Type']=encoder.fit_transform(df['Item_Type'])
df['Outlet_Identifier']=encoder.fit_transform(df['Outlet_Identifier'])
df['Outlet_Size']=encoder.fit_transform(df['Outlet_Size'])
df['Outlet_Location_Type']=encoder.fit_transform(df['Outlet_Location_Type'])
df['Outlet_Type']=encoder.fit_transform(df['Outlet_Type'])

In [None]:
df.head()

In [None]:
df.shape

## Separating Independent features and target values

In [None]:
X=df.drop('Item_Outlet_Sales', axis=1)

In [None]:
y=df['Item_Outlet_Sales']

In [None]:
print(X)
print(y)

## Train_test_split

In [None]:
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
print(X.shape, X_train.shape, X_test.shape)

## Training  the ML  model

In [None]:
model=XGBRegressor()

In [None]:
## fit the model
model.fit(X_train, y_train)

In [None]:
## Train data prediction
y_train_predict=model.predict(X_train)

In [None]:
r2_score=metrics.r2_score(y_train,y_train_predict) 

In [None]:
r2_score

In [None]:
## Test data prediction
y_test_predict=model.predict(X_test)

In [None]:
r2_score_test=metrics.r2_score(y_test, y_test_predict)

In [None]:
r2_score_test