<a href="https://colab.research.google.com/github/hegame1998/Supermarket-store/blob/main/machine_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats
import tensorflow as tf
import seaborn as sns
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


np.random.seed(42)
tf.random.set_seed(42)

In [2]:
df = pd.read_csv('https://github.com/hegame1998/Supermarket-store/main/Stores.csv')

HTTPError: HTTP Error 404: Not Found

In [None]:
df.head()

In [None]:
df.info()

#Exploratory data analysis

In [None]:
unique_values = df.nunique()

print(unique_values)


In [None]:
store_ids =df['Store ID']
df.drop(columns=['Store ID'], inplace=True)

In [None]:
df.describe()

In [None]:
for column_name in ["Store_Area", "Items_Available", "Daily_Customer_Count" ,"Store_Sales"]:

    # Compute statistics
    mean_value = df[column_name].mean()

    # Create a histogram trace
    hist_trace = go.Histogram(
        x=df[column_name],
        histnorm='probability density',
        name="Histogram",
        marker=dict(
            color='blue',
            line=dict(
                color='black',
                width=1
            )
        ),
        opacity=0.7
    )

    # Create a KDE trace
    kde_x_values = np.linspace(df[column_name].min(), df[column_name].max(), 200)
    kde_y_values = stats.gaussian_kde(df[column_name].values)(kde_x_values)
    kde_trace = go.Scatter(
        x=kde_x_values,
        y=kde_y_values,
        mode='lines',
        name="KDE",
        line=dict(
            color='black',
            width=1.5
        )
    )

    # Create a vertical line trace for the mean
    mean_trace = go.Scatter(
        x=[mean_value, mean_value],
        y=[0, max(kde_y_values)],
        mode='lines',
        name="Mean",
        line=dict(
            color='red',
            width=1,
            dash='dash'
        ),
        hovertemplate=f"Mean (μ): {mean_value:.2f}<extra></extra>"
    )

    # Create a figure and add traces
    fig = go.Figure([hist_trace, kde_trace, mean_trace])

    # Update layout
    fig.update_layout(
        title=f"{column_name} Distribution",
        xaxis_title="Score",
        yaxis_title="Density",
        bargap=0.01,  # gap between bars of histogram
        bargroupgap=0.1  # gap between bars of different groups
    )

    fig.show()


In [None]:
for column_name in ["Store_Area", "Items_Available", "Daily_Customer_Count"]:
    # Create a box trace
    box_trace = go.Box(
        y=df[column_name],
        name=column_name,
        marker_color='blue'
    )

    # Create a figure and add trace
    fig = go.Figure(box_trace)

    # Update layout
    fig.update_layout(
        title=f"{column_name} Box Plot",
        yaxis_title=column_name
    )

    fig.show()


In [None]:
correlation_matrix = df.corr()

# generate a heatmap
fig = ff.create_annotated_heatmap(
    z=correlation_matrix.values,
    x=list(correlation_matrix.columns),
    y=list(correlation_matrix.index),
    annotation_text=correlation_matrix.round(2).values,
    showscale=True,
    colorscale="Viridis"
)

# Update layout
fig.update_layout(
    title='Correlation Matrix',
    xaxis=dict(title='Variable', side='bottom'),
    yaxis=dict(title='Variable'),
    width=800,
    height=800
)

fig.show()


#Data Preprocessing and Machine Learning

In [None]:
# Select the columns to be used for LOF analysis
columns = ["Store_Area", "Items_Available", "Daily_Customer_Count"]

# Create a subset DataFrame with the selected columns
subset_df = df[columns]

# Create an instance of the LocalOutlierFactor model
lof = LocalOutlierFactor(n_neighbors=5)


outlier_scores = lof.fit_predict(subset_df)

# Create a mask to identify the outliers
outlier_mask = outlier_scores == -1

# Get the outliers from the original DataFrame
outliers = df[outlier_mask]

print("Outliers:")
print(outliers)


In [None]:
# Drop the outliers from the original DataFrame
df_no_outliers = df.drop(outliers.index)
df_no_outliers.head()


In [None]:
df_no_outliers.info()

In [None]:
# Create a scaler object
scaler = StandardScaler()

# Fit the scaler to the features and transform
scaled_data = scaler.fit_transform(df_no_outliers)

df_scaled = pd.DataFrame(scaled_data, columns=df.columns)
df_scaled.head()

In [None]:
# 'Store_Sales' is our target/dependent variable
X = df_scaled.drop('Store_Sales', axis=1)
y = df_scaled['Store_Sales']

In [None]:
# Split the scaled data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear Regression

In [None]:
lr_model = LinearRegression()

lr_model.fit(X_train, y_train)

lr_pred =lr_model.predict(X_test)

lr_mse = mean_squared_error(y_test, lr_pred)
lr_mae = mean_absolute_error(y_test, lr_pred)

score_lr = lr_model.score(X_train, y_train)
r2_lr = r2_score(y_test, lr_pred)



# Print the evaluation metrics
print('Linear Regression')
print("MSE: %.2f" % lr_mse)
print("MAE: %.2f" % lr_mae)
print("Training score: ", score_lr)
print("R2_score : %.2f" % r2_lr)

#Neural Network

In [None]:
# Define model
model = Sequential()
model.add(Dense(10, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(1))

# Compile model
model.compile(loss='mean_squared_error', optimizer='adam')

# Train model
model.fit(X_train, y_train, epochs=1500, batch_size=32, verbose=2)

# Predict the 'Store_Sales' for the test set
nn_predictions = model.predict(X_test)

# Compute evaluation metrics
mse_nn = mean_squared_error(y_test, nn_predictions)
mae_nn  = mean_absolute_error(y_test, nn_predictions)
score_nn  = model.evaluate(X_train, y_train)
r2_nn  = r2_score(y_test, nn_predictions)

# Print the evaluation metrics
print('Neural Network')
print("MSE: %.2f" % mse_nn )
print("MAE: %.2f" % mae_nn )
print("Training loss: %.2f" % score_nn )
print("R2_score : %.2f" % r2_nn)


#Decision Tree Regressor

In [None]:
# Initialize the model
dt_model = DecisionTreeRegressor(random_state=42)

# Train the model
dt_model.fit(X_train, y_train)

# Predict the 'Store_Sales' for the test set
dt_predictions = dt_model.predict(X_test)

# Compute evaluation metrics
mse_dt = mean_squared_error(y_test, dt_predictions)
mae_dt = mean_absolute_error(y_test, dt_predictions)
score_dt = dt_model.score(X_train, y_train)

# Compute R-squared score
r2_dt = r2_score(y_test, dt_predictions)

# Print the evaluation metrics
print('Decision Tree')
print("MSE: %.2f" % mse_dt)
print("MAE: %.2f" % mae_dt)
print("Training score: %.2f" % score_dt)
print("R2_score : %.2f" % r2_dt)


#Comparison

In [None]:
# Define the models and their respective evaluation metrics
models = ['Linear Regression', 'Neural Network','Decision Tree' ]
mse = [lr_mse, mse_nn, mse_dt]
mae = [lr_mae, mae_nn, mae_dt]
r2 = [r2_lr, r2_nn, r2_dt]

# Create bar traces for each metric
trace_mse = go.Bar(x=models, y=mse, name='MSE')
trace_mae = go.Bar(x=models, y=mae, name='MAE')
trace_r2 = go.Bar(x=models, y=r2, name='R2 Score')

# Create a layout for the plot
layout = go.Layout(
    title='Comparison of Evaluation Metrics',
    xaxis_title='Models',
    yaxis_title='Metric Value'
)

# Create a figure and add the traces to it
fig = go.Figure(data=[trace_mse,trace_mae, trace_r2], layout=layout)

# Show the plot
fig.show()


# Data Mining

In [None]:
df_avgs = df.copy()
df_avgs.head()

In [None]:
df_avgs['ratio_items/cutomers'] = df_avgs['Items_Available']/df_avgs['Daily_Customer_Count']
df_avgs['ratio_size/customers'] = df_avgs['Store_Area']/df_avgs['Daily_Customer_Count']
df_avgs['ratio_size/items']     = df_avgs['Store_Area']/df_avgs['Items_Available']
df_avgs['target_sales'] = df['Store_Sales']

df_avgs.head()


In [None]:
df_avgs.drop(['Store_Area','Items_Available','Daily_Customer_Count','Store_Sales'],axis = 1, inplace = True)

In [None]:
df_avgs.head()

# Clustering Data

In [None]:
# Determine the optimal number of clusters using the KElbowVisualizer

x= df_avgs.drop('target_sales',axis= 1)
kmeans_model = KMeans(n_init=10)
visualizer = KElbowVisualizer(kmeans_model, k=(2, 10),n_init =10)  # Try different values of k
visualizer.fit(x)
visualizer.show()


In [None]:
kmeans_model = KMeans(n_clusters=4,n_init=10, random_state=42)
x['Cluster'] = kmeans_model.fit_predict(x)
x.head()

In [None]:
y = df_avgs['target_sales']

sales_join = x.join(y)
sales_join.columns

In [None]:
sales_join.head()

In [None]:
sales_x = sales_join[['Cluster','target_sales']]


kmeans_model = KMeans(n_init=10)
visualizer = KElbowVisualizer(kmeans_model, k=(2, 10),n_init =10)  # Try different values of k
visualizer.fit(sales_x)
visualizer.show()



In [None]:
kmeans_model = KMeans(n_clusters=4,n_init=10, random_state=42)
x['Target_Groups'] = kmeans_model.fit_predict(sales_x)
x.head()

In [None]:
combined_df = pd.concat([x, y], axis=1)
combined_df.head()

In [None]:
# Select the columns you want to scale
columns_to_scale = ['ratio_items/cutomers', 'ratio_size/customers', 'ratio_size/items', 'target_sales']

# Extract the columns you want to scale into a separate DataFrame
X = combined_df[columns_to_scale]

# Instantiate the StandardScaler
scaler = StandardScaler()

# Fit and transform the selected columns
X_scaled = scaler.fit_transform(X)

# Replace the original columns in the DataFrame with the scaled values
combined_df[columns_to_scale] = X_scaled


x= combined_df.drop('target_sales',axis= 1)
y = combined_df['target_sales']

In [None]:
combined_df.head()

# Now Doing the Models Prediction on the clustered Data

## Second Linear Regression

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 42)

In [None]:
new_lr_model = LinearRegression()
new_lr_model.fit(X_train, y_train)
new_lr_pred =new_lr_model.predict(X_test)


new_lr_mse = mean_squared_error(y_test, new_lr_pred)
new_lr_mae = mean_absolute_error(y_test, new_lr_pred)

new_score_lr = new_lr_model.score(X_train, y_train)
new_r2_lr = r2_score(y_test, new_lr_pred)


# Print the evaluation metrics
print('Linear Regression')
print("MSE: %.2f" % new_lr_mse)
print("MAE: %.2f" % new_lr_mae)
print("Training score: ", new_score_lr)
print("R2_score : %.2f" % new_r2_lr)

In [None]:
dtr = DecisionTreeRegressor()
dtr.fit(X_train, y_train)
dtr_pred =dtr.predict(X_test)

new_mse_dt = mean_squared_error(y_test, dtr_pred)
new_mae_dt = mean_absolute_error(y_test, dtr_pred)

new_score_dt = dtr.score(X_train, y_train)
new_r2_dt =r2_score(y_test,  dtr_pred)

print('DecisionTree Regressor')
print("MSE: %.2f" % new_mse_dt)
print("MAE: %.2f" % (new_mae_dt))
print("Training score: ", new_score_dt)
print("R2_score : %.2f"%(new_r2_dt))