### Notebook Overview

In this notebook, the tasks are summarized below:

1: **[Sales Analytics](#section-1-sales-analytics)**

2: **[Store Analysis](#section-2-store-analysis)**

3: **[Geo-location Analysis](#section-3-geo-location-analysis)**

4: **[Demographic Analysis](#section-4-demographic-analysis)**

5: **[Operational Analysis](#section-5-operational-analysis)**

6: **[Review and Rating Analysis](#section-6-review-and-rating-analysis)**


---

In [1]:
import pandas as pd 
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv('customers_df.csv')
df = data.copy()

In [3]:
pd.set_option('display.max_columns', None)

df.head()

Unnamed: 0,id,store_category,store_sub_category,description,city,latitude,longitude,distance,distance_to_city_center,store_size,opening_hour,closing_hour,opening_duration,date_opening,opening_timeofday,closing_timeofday,parking,population,young_population,young_pop_percentage,gdb_per_capita,unemployment_rate,population_density,young_population_density,number_of_reviews,rating,rating_round,number_of_employees,sales_representative_id,sales_rep_pop,sales_amount,sales_per_employee,monthly_avg_sales,sales_per_capita
0,1.0,restaurant,mexican restaurant,Vibrant hues of turquoise and red adorn the ex...,burgos,42.352143,-3.677571,35.820157,2.339378,1547.041667,13.0,1.0,12.0,2019-07-10,Afternoon,Night,False,355045,32414,9.129547,28942,0.0624,229.499313,20.952248,58,3.101,3.1,33.0,25,0.064781,29759.508604,901.803291,2479.95905,0.083819
1,2.0,bar,pub,"Warm wooden accents, rich Guinness aromas, and...",a coruña,43.351256,-8.410301,31.68167,2.510863,1315.0,23.0,6.0,7.0,2020-02-17,Evening,Morning,False,246047,29526,12.000146,21898,0.1092,187.107985,22.453232,13,4.14,4.1,29.0,22,0.125179,38363.039529,1322.863432,3196.919961,0.155918
2,3.0,restaurant,greek restaurant,"Warmly lit, rustic tables and vintage amphorae...",bilbao,43.255756,-2.939133,29.656813,1.360483,766.0,13.0,1.0,12.0,2019-04-15,Afternoon,Night,True,353187,27195,7.699887,28618,0.0928,461.079634,35.502611,22,3.916,3.9,36.0,10,0.0957,26352.77712,732.021587,2196.06476,0.074614
3,4.0,restaurant,spanish restaurant,Vibrant eatery serving a medley of small plate...,alicante,38.354208,-0.505718,31.277988,2.207102,815.0,11.0,23.0,12.0,2019-04-17,Morning,Evening,False,33441,6153,18.399569,17405,0.1413,41.031902,7.549693,33,3.803,3.8,26.0,15,0.684788,27910.264616,1073.471716,2325.855385,0.834612
4,5.0,hotel,luxury hotel,Luxurious haven where luck and indulgence conv...,madrid,40.463496,-3.635538,38.071841,7.714787,2317.0,6.0,23.0,17.0,2019-04-05,Morning,Evening,True,3273049,209475,6.399996,29576,0.1018,1412.623651,90.407855,47,3.958,4.0,31.0,29,0.010388,33478.441029,1079.949711,2789.870086,0.010229


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9637 entries, 0 to 9636
Data columns (total 34 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        9637 non-null   float64
 1   store_category            9637 non-null   object 
 2   store_sub_category        9637 non-null   object 
 3   description               9637 non-null   object 
 4   city                      9637 non-null   object 
 5   latitude                  9637 non-null   float64
 6   longitude                 9637 non-null   float64
 7   distance                  9637 non-null   float64
 8   distance_to_city_center   9637 non-null   float64
 9   store_size                9637 non-null   float64
 10  opening_hour              9637 non-null   float64
 11  closing_hour              9637 non-null   float64
 12  opening_duration          9637 non-null   float64
 13  date_opening              9637 non-null   object 
 14  opening_

In [5]:
# change id from float64 to object 

df['id'] = df['id'].astype('object')
df['sales_representative_id'] = df['sales_representative_id'].astype('object')
df['date_opening'] = pd.to_datetime(df['date_opening'])

<a id="section-1-sales-analytics"></a>
### 1: Sales Analytics

#### Identifying key areas with high sales activity, which are potential hotspots for business focus

In [6]:
heatmap_fig = px.density_mapbox(
    df, 
    lat="latitude", 
    lon="longitude", 
    z='sales_amount', 
    radius=10,  # Adjust radius as needed
    height=500,
    title="Heatmap of Sales Amount"
)

heatmap_fig.update_layout(mapbox_style="open-street-map")

heatmap_fig.show()

#### Comparison of Total and Scaled Mean Sales by Store Category

In [7]:
mean_sales_by_category = df.groupby('store_category')['sales_amount'].mean().reset_index()
mean_sales_by_category.rename(columns={'sales_amount': 'mean_sales_amount'}, inplace=True)

sum_sales_by_category = df.groupby('store_category')['sales_amount'].sum().reset_index()
sum_sales_by_category.rename(columns={'sales_amount': 'total_sales_amount'}, inplace=True)

# Scale mean_sales_amount for visualization purposes
scaling_factor = 1000  # Adjust this factor as needed for better visualization
mean_sales_by_category['mean_sales_amount_scaled'] = mean_sales_by_category['mean_sales_amount'] * scaling_factor

# Merge the DataFrames for easier plotting
sales_by_category = pd.merge(sum_sales_by_category, mean_sales_by_category[['store_category', 'mean_sales_amount_scaled']], on='store_category')

# Melt the DataFrame to long format for plotting
sales_by_category_melted = sales_by_category.melt(id_vars='store_category', 
                                                  value_vars=['total_sales_amount', 'mean_sales_amount_scaled'], 
                                                  var_name='sales_type', 
                                                  value_name='sales_amount')

# Create the bar plot
fig = px.bar(sales_by_category_melted, 
             x='store_category', 
             y='sales_amount', 
             color='sales_type', 
             barmode='group',
             title='Comparison of Total and Scaled Mean Sales by Store Category',
             labels={'sales_amount': 'Sales Amount', 'store_category': 'Store Category', 'sales_type': 'Sales Type'})

# Add a note about the scaling factor in the title
fig.update_layout(
    title='Comparison of Total and Scaled Mean Sales by Store Category (Mean Sales Scaled by 1000)'
)

# Show the figure
fig.show()


In [36]:
sales_by_category = df.groupby('store_sub_category')['sales_amount'].sum().reset_index()
fig = px.bar(sales_by_category, x='store_sub_category', y='sales_amount', title='Sales by Store Sub Category (Sum)')
fig.show()

In [37]:
sales_by_category = df.groupby('store_sub_category')['sales_amount'].mean().reset_index()
fig = px.bar(sales_by_category, x='store_sub_category', y='sales_amount', title='Sales by Store Sub Category (Mean)')
fig.show()

#### Sales per Employee

Evaluating sales per employee helps in assessing productivity and optimizing workforce allocation

In [9]:
sales_per_employee = df.groupby('store_category')['sales_per_employee'].mean().reset_index()
fig = px.bar(sales_per_employee, x='store_category', y='sales_per_employee', title='Sales per Employee by Store Category')
fig.show()

#### Sales Amount vs Population

In [10]:
# Group by city and sum the population and sales amount
city_pop_sales = df.groupby('city')[['population', 'sales_amount']].sum().reset_index()

# Plotting
fig = px.scatter(city_pop_sales, x='population', y='sales_amount', 
                 title='Population vs Sales Amount', labels={'population':'Population', 'sales_amount':'Sales Amount'})
fig.show()

#### Sales Amount vs GDP per Capita

In [11]:
# Group by city and calculate mean GDP per capita and sum sales amount
city_gdp_sales = df.groupby('city')[['gdb_per_capita', 'sales_amount']].agg({'gdb_per_capita': 'mean', 'sales_amount': 'sum'}).reset_index()

# Plotting
fig = px.scatter(city_gdp_sales, x='gdb_per_capita', y='sales_amount', 
                 title='GDP per Capita vs Sales Amount', labels={'gdb_per_capita':'GDP per Capita', 'sales_amount':'Sales Amount'})
fig.show()


#### Sales Amount vs Unemployment Rate

In [12]:
# Group by city and calculate mean unemployment rate and sum sales amount
city_unemployment_sales = df.groupby('city')[['unemployment_rate', 'sales_amount']].agg({'unemployment_rate': 'mean', 'sales_amount': 'sum'}).reset_index()

# Plotting
fig = px.scatter(city_unemployment_sales, x='unemployment_rate', y='sales_amount', 
                 title='Unemployment Rate vs Sales Amount', labels={'unemployment_rate':'Unemployment Rate', 'sales_amount':'Sales Amount'})
fig.show()

<a id="section-2-store-analysis"></a>
### 2: Store Analysis

#### Sales Performance by Store Size

In [13]:
sales_by_size = df.groupby('store_size')['sales_amount'].sum().reset_index()
fig = px.bar(sales_by_size, x='store_size', y='sales_amount', title='Sales by Store Size')
fig.update_layout(template='none')
fig.show()


#### Store Performance Metrics (Opening Hours, Sales, etc.)

Analyzing performance metrics like opening duration helps in optimizing operational hours for maximum sales

In [14]:
store_metrics = df.groupby(['store_category', 'opening_duration'])['sales_amount'].mean().reset_index()
fig = px.density_heatmap(store_metrics, x='store_category', y='opening_duration', z='sales_amount', title='Store Performance Metrics')
fig.show()

#### Top vs Bottom Performers

Identifying top and bottom performers helps in understanding best practices and areas needing improvement

In [15]:
top_stores = df.nlargest(10, 'sales_amount')
bottom_stores = df.nsmallest(10, 'sales_amount')

fig = px.bar(top_stores, x='store_category', y='sales_amount', title='Top 10 Performing Stores')
fig.show()

fig = px.bar(bottom_stores, x='store_category', y='sales_amount', title='Bottom 10 Performing Stores')
fig.show()

#### Average Sales Amount by Store Sub Category

In [16]:
average_sales = df.groupby('store_sub_category')['sales_amount'].mean().reset_index()
average_sales = average_sales.sort_values(by='sales_amount', ascending=False)

fig = px.bar(
    average_sales, 
    x='store_sub_category', 
    y='sales_amount', 
    color='sales_amount',
    title='Average Sales Amount by Store Sub Category'
)

# add template
fig.update_layout(template='none')
fig.update_xaxes(title_text='')
fig.update_yaxes(title_text='Average Sales Amount')

fig.show()


In [17]:
fig = px.box(df, x='store_sub_category', y='sales_amount', title='Sales Amount Distribution by store_sub_category')
fig.update_xaxes(title_text='store_sub_category')
fig.update_yaxes(title_text='Sales Amount')
fig.show()

<a id="section-3-geo-location-analysis"></a>
### 3: Geo-location Analysis

#### Which city has the higest sales?

In [18]:
sales_by_city = df.groupby('city')['sales_amount'].sum().reset_index()

# Create bar chart using Plotly
fig = px.bar(sales_by_city, x='city', y='sales_amount', title='Total Sales Amount by City', labels={'sales_amount': 'Total Sales Amount', 'city': 'City'})

# Customize layout
fig.update_layout(xaxis_tickangle=-45)

# Show the plot
fig.show()

In [19]:
fig = px.box(df, x='city', y='sales_amount', title='Sales Amount Distribution by City')
fig.update_xaxes(title_text='City')
fig.update_yaxes(title_text='Sales Amount')
fig.show()

#### Geospatial Distribution of Store Categories and Sales Amounts

In [34]:
df_filtered = df[df['sales_amount'] > 0]

heatmap_fig = px.scatter_mapbox(
    df_filtered, 
    lat="latitude", 
    lon="longitude", 
    size="sales_amount",   
    color="store_category",   
    height=500,
    hover_name='store_category',
    hover_data=["id"],  
    title="Heatmap of all store categories"
)

heatmap_fig.update_layout(
    mapbox_style="open-street-map"
)

heatmap_fig.show()

#### heapmap - higher sales area

In the north, hotels contribute to higher sales, while in the south, restaurants are the primary contributors.

In [38]:
top_sales_amount = df[df['sales_amount'] > 33855]

heatmap_fig = px.scatter_mapbox(
    top_sales_amount, 
    lat="latitude", 
    lon="longitude", 
    color="store_category", 
    height=500,
    hover_name="store_sub_category",
    hover_data=["id"],  
    title="Heatmap of Sales Amount (higher-end)"
)

heatmap_fig.update_layout(
    mapbox_style="open-street-map",
)

heatmap_fig.show()

#### heapmap - lower sales area

from this graph we can see that bars contribute more to lower average sales 

In [39]:
top_sales_amount = df[df['sales_amount'] < 28421]

heatmap_fig = px.scatter_mapbox(
    top_sales_amount, 
    lat="latitude", 
    lon="longitude", 
    color="store_category", 
    height=500,
    hover_name="store_sub_category",
    hover_data=["id"],  
    title="Heatmap of Sales Amount (lower-end)"
)

heatmap_fig.update_layout(
    mapbox_style="open-street-map",
)

heatmap_fig.show()

<a id="section-4-demographic-analysis"></a>
### 4: Demographic Analysis

#### Impact of Population on Sales

In [35]:
pop_sales_corr = df[['population', 'sales_amount']].corr()
fig = px.scatter(df, x='population', y='sales_amount', title='Population vs Sales')
fig.show()

In [40]:
df.columns

Index(['id', 'store_category', 'store_sub_category', 'description', 'city',
       'latitude', 'longitude', 'distance', 'distance_to_city_center',
       'store_size', 'opening_hour', 'closing_hour', 'opening_duration',
       'date_opening', 'opening_timeofday', 'closing_timeofday', 'parking',
       'population', 'young_population', 'young_pop_percentage',
       'gdb_per_capita', 'unemployment_rate', 'population_density',
       'young_population_density', 'number_of_reviews', 'rating',
       'rating_round', 'number_of_employees', 'sales_representative_id',
       'sales_rep_pop', 'sales_amount', 'sales_per_employee',
       'monthly_avg_sales', 'sales_per_capita', 'sales_per_hour'],
      dtype='object')

In [42]:
pop_dense_corr = df[['population', 'sales_amount']].corr()
fig = px.scatter(df, x='young_population_density', y='sales_amount', title='Young Population Density vs Sales')
fig.show()

In [50]:
# which city has the highest population density with sales amount avegae as line 

city_pop_density = df.groupby('city')[['population_density', 'sales_amount']].sum().reset_index()
city_pop_density = city_pop_density.sort_values(by='population_density', ascending=False)

fig = px.bar(city_pop_density, x='city', y='population_density', title='Population Density by City')
fig.add_trace(go.Scatter(x=city_pop_density['city'], y=city_pop_density['sales_amount'], mode='lines+markers', name='Sales Amount'))
fig.update_layout(template='none')
fig.show()

<a id="section-5-operational-analysis"></a>
### 5: Operational Analysis

#### Sales per Opening Hour

In [24]:
df['sales_per_hour'] = df['sales_amount'] / df['opening_duration']
sales_per_hour_analysis = df.groupby('store_category')['sales_per_hour'].mean().reset_index()
fig = px.bar(sales_per_hour_analysis, x='store_category', y='sales_per_hour', title='Sales per Opening Duration by Store Category')
fig.show()

#### Impact of Parking Availability on Sales 

dont see an impact of parking to the sales

In [25]:
parking_sales_corr = df.groupby('parking')['sales_amount'].mean().reset_index()
fig = px.bar(parking_sales_corr, x='parking', y='sales_amount', title='Sales by Parking Availability')
fig.show()

In [26]:
city_pop = df.groupby('city')[['population', 'gdb_per_capita', 'unemployment_rate', 'sales_amount']].sum().reset_index()
# sort from high to low
city_pop = city_pop.sort_values(by='sales_amount', ascending=False)
city_pop

Unnamed: 0,city,population,gdb_per_capita,unemployment_rate,sales_amount
7,madrid,6958502174,62878576,216.4268,67997810.0
2,barcelona,2874323175,47092525,162.945,56602480.0
11,sevilla,871092926,22541851,221.2993,39248100.0
12,valencia,2319129960,21045830,122.642,28418460.0
1,alicante,27354738,14237290,115.5834,25920770.0
3,bilbao,230277924,18658936,60.5056,20745180.0
0,a coruña,159192409,14168006,70.6524,20620370.0
14,zaragoza,375367276,13983400,47.26,17814430.0
13,valladolid,83928852,6430816,24.871,8622732.0
5,caceres,97726860,5129208,37.2456,8033543.0


#### Impact of nemployee number on Sales 

In [27]:
fig_employees_sales = px.scatter(df, x='number_of_employees', y='sales_amount', title='Impact of Number of Employees on Sales')

fig_employees_sales.show()

#### Sales rep's store coverage

In [28]:
sales_rep_performance = df.groupby('sales_representative_id')['sales_amount'].sum().reset_index()

sales_rep_performance = sales_rep_performance.sort_values(by='sales_amount', ascending=False)

stores_managed = df.groupby('sales_representative_id')['id'].nunique().reset_index()
stores_managed.columns = ['sales_representative_id', 'number_of_stores']

sales_rep_performance = sales_rep_performance.merge(stores_managed, on='sales_representative_id')

sales_rep_performance

Unnamed: 0,sales_representative_id,sales_amount,number_of_stores
0,29,10951930.0,340
1,31,10930030.0,340
2,14,10907930.0,340
3,6,10841460.0,339
4,30,10838970.0,340
5,26,10797410.0,338
6,10,10789180.0,338
7,0,10786560.0,339
8,20,10785920.0,340
9,23,10769790.0,339


In [54]:
df_sales_rep_pop = df.groupby('sales_representative_id')['sales_rep_pop'].mean().reset_index()
fig = px.bar(df_sales_rep_pop, x='sales_representative_id', y='sales_rep_pop', title='Average Sales Representative Population Percentage by Representative')
fig.update_layout(xaxis_title='Sales Representative ID', yaxis_title='Average Sales Representative Population Percentage')
fig.show()

<a id="section-6-review-and-rating-analysis"></a>
### 6: Review and Rating Analysis

#### Store Ratings and Sales Correlation

In [29]:
rating_sales_corr = df[['rating', 'sales_amount']].corr()
fig = px.scatter(df, x='rating', y='sales_amount', title='Rating vs Sales')
fig.show()

#### Rating Distribution

In [30]:
fig = px.histogram(df, x='rating_round', title='Rating Distribution')
fig.show()

#### Number of Reviews and Sales Amount

In [31]:
data = df.copy()
data.drop(data[data['number_of_reviews'] == 12000].index, inplace=True)
data.drop(data[data['number_of_reviews'] == 9000].index, inplace=True)

In [32]:
# Relationship between number of reviews, average rating, and sales amount
fig_reviews_sales = px.scatter(data, x='number_of_reviews', y='sales_amount', size='rating', color='rating', 
                               title='Number of Reviews and Sales Amount')
fig_reviews_sales.show()

# Distribution of ratings across different store categories
fig_rating_distribution = px.box(df, x='store_sub_category', y='rating', title='Rating Distribution by Store Category')
fig_rating_distribution.show()

### Top cities by number of reviews

In [33]:
top_cities_reviews = df.groupby('city')['number_of_reviews'].sum().nlargest(10).reset_index()

fig = px.bar(top_cities_reviews, 
             x='city', 
             y='number_of_reviews', 
             title='Top Cities by Number of Reviews', 
             labels={'number_of_reviews': 'Number of Reviews', 'city': 'City'}, 
             color='number_of_reviews',
             color_continuous_scale=px.colors.sequential.Blues)

fig.update_layout(xaxis_tickangle=-45)

fig.show()