# Big Mart Sales Prediction

# Importing all essentials

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Apply seaborn's white-grid theme to all figures and silence every warning
# so that library notices do not clutter the notebook output.
sns.set_style(style='whitegrid')
warnings.filterwarnings(action='ignore')

# Data Analysis and Collection

In [None]:
# Load Train.csv from the bigmart-sales-data input folder and preview the
# first five rows.
df = pd.read_csv(filepath_or_buffer='../input/bigmart-sales-data/Train.csv')
df.head(n=5)

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple.
df.shape

In [None]:
# Print a summary of the columns: names, non-null counts and dtypes.
df.info()

Categorical Features:
- Item_Identifier
- Item_Fat_Content
- Item_Type
- Outlet_Identifier
- Outlet_Size
- Outlet_Location_Type
- Outlet_Type

In [None]:
# Count the missing entries in every column.
df.isna().sum()

Handling Missing Values

- MEAN -> AVERAGE
- MODE -> MOST FREQUENT VALUE

In [None]:
# Average of the "Item_Weight" column (missing entries are skipped by default).
df.loc[:, 'Item_Weight'].mean()

In [None]:
# Fill the missing values in "Item_Weight" with the column mean.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df['Item_Weight']: the in-place form operates on an
# intermediate object (chained assignment), which pandas deprecates and which
# leaves df unchanged when Copy-on-Write is enabled.
df['Item_Weight'] = df['Item_Weight'].fillna(df['Item_Weight'].mean())

In [None]:
# Most frequent value(s) of the "Outlet_Size" column over the whole dataset.
df.loc[:, 'Outlet_Size'].mode()

In [None]:
# Build a one-row table holding the most frequent "Outlet_Size" for each
# "Outlet_Type" (columns are the outlet types).
mode_of_Outlet_size = df.pivot_table(
    values='Outlet_Size',
    columns='Outlet_Type',
    aggfunc=lambda sizes: sizes.mode()[0],
)

In [None]:
# Show the most frequent "Outlet_Size" per "Outlet_Type".
print(mode_of_Outlet_size)

In [None]:
# Boolean mask that is True on the rows whose "Outlet_Size" is missing.
miss_values = df['Outlet_Size'].isna()

In [None]:
# Show the missing-value mask for "Outlet_Size".
print(miss_values)

In [None]:
# Fill each missing "Outlet_Size" with the most frequent size for that row's
# "Outlet_Type". The pivot table has a single "Outlet_Size" row with one
# column per outlet type, so selecting that row gives a Series that maps an
# outlet type to a scalar size. Indexing the table with [x] inside apply()
# returned a one-element Series per row instead, which made apply() build a
# DataFrame and left the assignment dependent on column-label alignment.
df.loc[miss_values, 'Outlet_Size'] = df.loc[miss_values, 'Outlet_Type'].map(
    mode_of_Outlet_size.loc['Outlet_Size']
)

In [None]:
# Re-count the missing entries per column after the imputation steps.
df.isna().sum()

# Data Analysis

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Item_Weight distribution
# sns.distplot is deprecated and scheduled for removal from seaborn; the
# replacement recommended by seaborn is histplot with a KDE overlay on a
# density axis (kde_kws cut=3 follows seaborn's migration guide).
plt.figure(figsize=(6, 6))
sns.histplot(df['Item_Weight'], kde=True, stat='density', kde_kws=dict(cut=3))
plt.show()

In [None]:
# Item_Visibility distribution
# sns.distplot is deprecated and scheduled for removal from seaborn; the
# replacement recommended by seaborn is histplot with a KDE overlay on a
# density axis (kde_kws cut=3 follows seaborn's migration guide).
plt.figure(figsize=(6, 6))
sns.histplot(df['Item_Visibility'], kde=True, stat='density', kde_kws=dict(cut=3))
plt.show()

In [None]:
# Item_MRP distribution
# sns.distplot is deprecated and scheduled for removal from seaborn; the
# replacement recommended by seaborn is histplot with a KDE overlay on a
# density axis (kde_kws cut=3 follows seaborn's migration guide).
plt.figure(figsize=(6, 6))
sns.histplot(df['Item_MRP'], kde=True, stat='density', kde_kws=dict(cut=3))
plt.show()

In [None]:
# Item_Outlet_Sales distribution
# sns.distplot is deprecated and scheduled for removal from seaborn; the
# replacement recommended by seaborn is histplot with a KDE overlay on a
# density axis (kde_kws cut=3 follows seaborn's migration guide).
plt.figure(figsize=(6, 6))
sns.histplot(df['Item_Outlet_Sales'], kde=True, stat='density', kde_kws=dict(cut=3))
plt.show()

In [None]:
# Bar chart of the number of rows per "Outlet_Establishment_Year".
plt.figure(figsize=(6, 6))
sns.countplot(data=df, x='Outlet_Establishment_Year')
plt.show()

In [None]:
# Bar chart of the number of rows per "Item_Fat_Content" label.
plt.figure(figsize=(6, 6))
sns.countplot(data=df, x='Item_Fat_Content')
plt.show()

In [None]:
# Bar chart of the number of rows per "Item_Type"; the figure is extra wide
# so that all category labels fit on the x axis.
plt.figure(figsize=(30, 6))
sns.countplot(data=df, x='Item_Type')
plt.show()

In [None]:
# Bar chart of the number of rows per "Outlet_Size".
plt.figure(figsize=(6, 6))
sns.countplot(data=df, x='Outlet_Size')
plt.show()

# Data Pre-Processing

In [None]:
# Preview the first five rows before cleaning the categorical labels.
df.head(n=5)

In [None]:
# Frequency of each distinct label in "Item_Fat_Content".
df['Item_Fat_Content'].value_counts()

In [None]:
# Normalise the alternative spellings in "Item_Fat_Content" to 'Low Fat' and
# 'Regular', modifying df in place.
df.replace(
    to_replace={
        'Item_Fat_Content': {
            'low fat': 'Low Fat',
            'LF': 'Low Fat',
            'reg': 'Regular',
        },
    },
    inplace=True,
)

In [None]:
# Frequency of each distinct label in "Item_Fat_Content" (re-checked here).
df['Item_Fat_Content'].value_counts()

### Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encoder used to turn category labels into integer codes.
encoder = LabelEncoder()

In [None]:
# Replace every categorical column with integer codes. The same encoder
# object is refitted on each column in turn, so afterwards it only holds the
# classes of the last column encoded.
for column_name in (
    'Item_Identifier',
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
):
    df[column_name] = encoder.fit_transform(df[column_name])

In [None]:
# Preview the first five rows after label encoding.
df.head(n=5)

# Splitting features and Target

In [None]:
# Features are every column except the last; the target is the last column.
# Both are extracted as NumPy arrays.
X = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

In [None]:
# Show the feature matrix.
print(X)

In [None]:
# Show the target vector.
print(y)

# Splitting into Training and Test Sets

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Shapes of the full feature matrix and of its training and test parts.
print(X.shape, X_train.shape, X_test.shape)

# Training Algorithm

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [None]:
# Fully connected regression network: three ReLU hidden layers (20, 10 and 5
# units) followed by a single output unit with no activation, compiled with
# the RMSprop optimizer and mean-squared-error loss.
model = Sequential([
    Dense(20, activation='relu'),
    Dense(10, activation='relu'),
    Dense(5, activation='relu'),
    Dense(1),  # output layer
])

model.compile(loss='mse', optimizer='rmsprop')

In [None]:
# Train the network on the training split for 50 epochs.
model.fit(X_train, y_train, epochs=50)

In [None]:
# Print the layer-by-layer summary of the network.
model.summary()

In [None]:
# Collect the metrics recorded during training into a DataFrame.
loss_df = pd.DataFrame(data=model.history.history)

In [None]:
# Display the recorded training history.
loss_df

In [None]:
# Line plot of the recorded training history.
loss_df.plot()

From the graph above, we can see that the model's loss decreases significantly over the course of training.

# Model Evaluation

In [None]:
# Loss of the trained model on the held-out test split (progress output off).
test_eval = model.evaluate(x=X_test, y=y_test, verbose=0)
print(test_eval)

In [None]:
# Loss of the trained model on the training split (progress output off).
train_eval = model.evaluate(x=X_train, y=y_train, verbose=0)
print(train_eval)

In [None]:
# Difference between the training loss and the test loss. A negative value
# means the loss on the training data is lower than on the test data.
model_diff = train_eval - test_eval
print(model_diff)

In [None]:
# Predictions of the model on the training data
train_prediction = model.predict(X_train)

In [None]:
# Predictions of the model on the test data
test_prediction = model.predict(X_test)

In [None]:
# Show the predictions for the training data.
print(train_prediction)

In [None]:
# Show the predictions for the test data.
print(test_prediction)

In [None]:
# R squared (coefficient of determination): the share of the variance in the
# target that the model explains. The best possible score is 1.0, and the
# score can be negative when the model fits worse than always predicting the
# mean of the target.
from sklearn.metrics import r2_score
r2_train = r2_score(y_train, train_prediction)

In [None]:
# Report the R squared score on the training data.
print('R squared value of train data:', r2_train)

In [None]:
# R squared score of the predictions on the test data.
r2_test = r2_score(y_true=y_test, y_pred=test_prediction)

In [None]:
# Report the R squared score on the test data.
print('R squared value of test data:',r2_test)