## Approach 1: Cautious Expansion with Focus on Efficiency

This approach focuses on sustainable, profitable growth. It's more suitable for businesses in stable markets or those prioritizing profitability over rapid expansion. It also allows for more careful resource allocation and risk management.

1. Data preparation and customer clustering

2. Optimal cluster selection

3. Assignment of potential customers to current reps

4. Analysis of new hire needs

5. Threshold-based optimization for new hires

6. Visualizations of clustering and sales performance

7. Strategic recommendations for expansion

8. Final customer assignments and geographical distribution

In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
from sklearn.metrics import silhouette_score


### Loading the Data 

In [2]:
# Load data
current_customers = pd.read_csv('customers_df.csv')
potential_customers = pd.read_csv('potential_customers_df.csv')
store_df = pd.read_csv('store_count.csv')

In [3]:
# Constants
MAX_CUSTOMERS_PER_REP = 300
MAX_NEW_HIRES = 50

### Data Preparation for K-means 

In [4]:
# Data preparation
def prepare_data(current_customers, potential_customers):
    all_customers = pd.concat([current_customers, potential_customers], ignore_index=True)
    
    numeric_columns = ['latitude', 'longitude', 'sales_amount', 'sales_per_employee']
    
    imputer = SimpleImputer(strategy='mean')
    all_customers[numeric_columns] = imputer.fit_transform(all_customers[numeric_columns])
    
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(all_customers[numeric_columns])
    
    return scaled_features, all_customers

# Prepare data
scaled_features, all_customers = prepare_data(current_customers, potential_customers)

### Find Optimal K-means Clusters 

In [5]:
def kmeans_clusters(scaled_features, max_clusters=20):
    inertias = []
    silhouette_scores = []

    for k in range(2, max_clusters + 1):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=100, max_iter=300, tol=1e-4)
        cluster_labels = kmeans.fit_predict(scaled_features)
        inertias.append(kmeans.inertia_)
        silhouette_scores.append(silhouette_score(scaled_features, cluster_labels))

    return inertias, silhouette_scores

kmeans_inertias, kmeans_silhouette_scores = kmeans_clusters(scaled_features)

# Plot KMeans inertia and silhouette scores
fig = go.Figure()

fig.add_trace(go.Scatter(x=list(range(2, len(kmeans_inertias) + 2)), y=kmeans_inertias, mode='lines+markers', name='KMeans Inertia'))
fig.add_trace(go.Scatter(x=list(range(2, len(kmeans_silhouette_scores) + 2)), y=kmeans_silhouette_scores, mode='lines+markers', name='KMeans Silhouette Score', yaxis='y2'))

fig.update_layout(
    title='KMeans Inertia and Silhouette Scores',
    xaxis=dict(title='Number of Clusters'),
    yaxis=dict(title='Inertia'),
    yaxis2=dict(title='Silhouette Score', overlaying='y', side='right'),
    legend=dict(x=1.1, y=1)
)

fig.show()

### Find Optimal Structure Using Gaussian Mixture Model

In [6]:
from sklearn.mixture import GaussianMixture

def find_optimal_gmm_clusters(scaled_features, max_clusters=20):
    aic_scores = []
    bic_scores = []
    silhouette_scores = []

    for k in range(2, max_clusters + 1):
        gmm = GaussianMixture(n_components=k, random_state=42)
        gmm.fit(scaled_features)
        cluster_labels = gmm.predict(scaled_features)
        aic_scores.append(gmm.aic(scaled_features))
        bic_scores.append(gmm.bic(scaled_features))
        silhouette_scores.append(silhouette_score(scaled_features, cluster_labels))

    return aic_scores, bic_scores, silhouette_scores

gmm_aic_scores, gmm_bic_scores, gmm_silhouette_scores = find_optimal_gmm_clusters(scaled_features)

# Plot GMM AIC/BIC and silhouette scores
fig = go.Figure()

fig.add_trace(go.Scatter(x=list(range(2, len(gmm_aic_scores) + 2)), y=gmm_aic_scores, mode='lines+markers', name='GMM AIC'))
fig.add_trace(go.Scatter(x=list(range(2, len(gmm_bic_scores) + 2)), y=gmm_bic_scores, mode='lines+markers', name='GMM BIC'))
fig.add_trace(go.Scatter(x=list(range(2, len(gmm_silhouette_scores) + 2)), y=gmm_silhouette_scores, mode='lines+markers', name='GMM Silhouette Score', yaxis='y2'))

fig.update_layout(
    title='GMM AIC/BIC and Silhouette Scores',
    xaxis=dict(title='Number of Clusters'),
    yaxis=dict(title='AIC/BIC'),
    yaxis2=dict(title='Silhouette Score', overlaying='y', side='right'),
    legend=dict(x=1.1, y=1)
)

fig.show()


### Find DBSCAN clustering method 

In [7]:
from sklearn.cluster import DBSCAN

def perform_dbscan_clustering(scaled_features, eps_values, min_samples_values):
    best_silhouette = -1
    best_eps = None
    best_min_samples = None
    best_labels = None

    for eps in eps_values:
        for min_samples in min_samples_values:
            dbscan = DBSCAN(eps=eps, min_samples=min_samples)
            cluster_labels = dbscan.fit_predict(scaled_features)
            
            if len(set(cluster_labels)) > 1:
                silhouette_avg = silhouette_score(scaled_features, cluster_labels)
                if silhouette_avg > best_silhouette:
                    best_silhouette = silhouette_avg
                    best_eps = eps
                    best_min_samples = min_samples
                    best_labels = cluster_labels

    return best_labels, best_silhouette, best_eps, best_min_samples

dbscan_labels, dbscan_silhouette_score, best_eps, best_min_samples = perform_dbscan_clustering(
    scaled_features, eps_values=np.arange(0.1, 1.0, 0.1), min_samples_values=np.arange(3, 10, 1)
)
print(f"Best Silhouette Score for DBSCAN: {dbscan_silhouette_score} with eps={best_eps} and min_samples={best_min_samples}")

# Visualize the best silhouette score for DBSCAN
fig = go.Figure()

fig.add_trace(go.Scatter(x=[best_eps], y=[dbscan_silhouette_score], mode='markers', marker=dict(size=10), name='DBSCAN Silhouette Score'))

fig.update_layout(
    title='Best Silhouette Score for DBSCAN',
    xaxis=dict(title='eps'),
    yaxis=dict(title='Silhouette Score'),
    legend=dict(x=1.1, y=1)
)

fig.show()


Best Silhouette Score for DBSCAN: 0.28776371580528265 with eps=0.6 and min_samples=6


### Find The Optimal Clusters 

In [8]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import plotly.graph_objects as go

def find_optimal_kmeans_clusters(scaled_features, max_clusters=20):
    inertias = []
    silhouette_scores = []

    for k in range(2, max_clusters + 1):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=100, max_iter=300, tol=1e-4)
        cluster_labels = kmeans.fit_predict(scaled_features)
        inertias.append(kmeans.inertia_)
        silhouette_scores.append(silhouette_score(scaled_features, cluster_labels))      

  # Plot the elbow curve and silhouette scores
    fig = make_subplots(rows=2, cols=1, shared_xaxes=True)
    
    fig.add_trace(
        go.Scatter(x=list(range(2, max_clusters + 1)), y=inertias, mode='lines+markers', name='Inertia'),
        row=1, col=1
    )
    fig.add_trace(
        go.Scatter(x=list(range(2, max_clusters + 1)), y=silhouette_scores, mode='lines+markers', name='Silhouette Score'),
        row=2, col=1
    )
    
    fig.update_layout(height=800, title_text="Elbow Method and Silhouette Scores")
    fig.update_xaxes(title_text="Number of Clusters (k)")
    fig.update_yaxes(title_text="Inertia", row=1, col=1)
    fig.update_yaxes(title_text="Silhouette Score", row=2, col=1)
    fig.show()
    
    # Find the optimal number of clusters
    inertia_diffs = np.diff(inertias)
    elbow_point = np.argmin(inertia_diffs) + 2  # Add 2 because we started from k=2
    
    max_silhouette = np.argmax(silhouette_scores) + 2  # Add 2 because we started from k=2
    
    # Choose the smaller of the two as it's more conservative
    optimal_k = max_silhouette
    
    print(f"Elbow point suggests {elbow_point} clusters")
    print(f"Max silhouette score suggests {max_silhouette} clusters")
    print(f"Chosen optimal number of clusters: {optimal_k}")
    
    return optimal_k

### Perform Clustering 

In [9]:
def perform_clustering(scaled_features, all_customers, n_clusters):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    all_customers['cluster'] = kmeans.fit_predict(scaled_features)
    
    # Rank clusters by total sales
    cluster_sales = all_customers.groupby('cluster')['sales_amount'].sum().sort_values(ascending=False)
    all_customers['cluster_rank'] = all_customers['cluster'].map(cluster_sales.rank(method='dense', ascending=False).to_dict())
    
    # Split back into current and potential customers
    current_customers_clustered = all_customers[all_customers.index < len(current_customers)]
    potential_customers_clustered = all_customers[all_customers.index >= len(current_customers)]
    
    return current_customers_clustered, potential_customers_clustered, kmeans.cluster_centers_, cluster_sales

#### Function to visualize clusters

In [10]:
def visualize_clusters(current_customers_clustered, potential_customers_clustered, cluster_centers):
    all_customers = pd.concat([current_customers_clustered, potential_customers_clustered], ignore_index=True)
    all_customers['customer_type'] = ['Current'] * len(current_customers_clustered) + ['Potential'] * len(potential_customers_clustered)
    
    n_clusters = len(cluster_centers)
    colors = px.colors.qualitative.Plotly[:n_clusters]
    
    fig = px.scatter_mapbox(all_customers, 
                            lat="latitude", 
                            lon="longitude", 
                            color="cluster",
                            color_discrete_sequence=colors,
                            size="sales_amount",
                            size_max=15,
                            zoom=3,
                            hover_data=["customer_type", "sales_amount", "cluster_rank"],
                            title="Customer Clusters")

    fig.add_scattermapbox(
        lat=cluster_centers[:, 0],
        lon=cluster_centers[:, 1],
        mode='markers',
        marker=dict(size=15, color='black', symbol='star'),
        text=[f"Cluster {i} Center" for i in range(n_clusters)],
        hoverinfo='text',
        showlegend=False
    )

    fig.update_layout(mapbox_style="open-street-map")
    fig.show()


#### Assign current sales rep up to 300 stores

Assigns potential customers to existing sales representatives based on clustering results. 

It aims to optimize the workload of current reps by assigning them new customers from clusters they're already familiar with, up to a maximum number of customers per rep.

In [11]:
def assign_to_current_reps_by_city_and_cluster(current_customers_clustered, potential_customers_clustered, store_df):
    new_assignments = []
    
    for _, rep in store_df.iterrows():
        rep_id = rep['sales_representative_id']
        rep_city = rep['city']
        current_count = rep['og_store_count']
        
        if current_count < MAX_CUSTOMERS_PER_REP:
            # Get clusters where the rep is located within the same city
            rep_clusters = current_customers_clustered[
                (current_customers_clustered['sales_representative_id'] == rep_id) &
                (current_customers_clustered['city'] == rep_city)
            ]['cluster'].unique()
            
            for cluster in rep_clusters:
                # Get potential customers in the same cluster and city
                cluster_potential = potential_customers_clustered[
                    (potential_customers_clustered['cluster'] == cluster) &
                    (potential_customers_clustered['city'] == rep_city)
                ]
                to_assign = min(len(cluster_potential), MAX_CUSTOMERS_PER_REP - current_count)
                assigned = cluster_potential.nlargest(to_assign, 'sales_amount')
                
                new_assignments.append(pd.DataFrame({
                    'id': assigned['id'],
                    'sales_representative_id': rep_id,
                    'city': assigned['city'],
                    'longitude': assigned['longitude'],
                    'latitude': assigned['latitude'],
                    'sales_amount': assigned['sales_amount'],
                    'customer_value': assigned['customer_value'],
                    'assignment_type': 'existing rep'
                }))
                
                potential_customers_clustered = potential_customers_clustered[~potential_customers_clustered['id'].isin(assigned['id'])]
                current_count += len(assigned)
                if current_count >= MAX_CUSTOMERS_PER_REP:
                    break
    
    return pd.concat(new_assignments), potential_customers_clustered

#### Calculate new sales rep to hire

It prioritizes clusters with higher sales potential and calculates the number of new hires needed to cover these clusters, up to a maximum number of new hires.

This helps in making data-driven decisions about expanding the sales team, focusing on areas with the highest potential return on investment.

In [12]:
def calculate_new_hires_by_clusters(cluster_sales, remaining_potential):
    new_hires = 0
    assigned_stores = 0
    total_sales = 0
    
    cluster_sales_sorted = cluster_sales.sort_values(ascending=False)
    
    for cluster, sales in cluster_sales_sorted.items():
        cluster_stores = len(remaining_potential[remaining_potential['cluster'] == cluster])
        hires_needed = -(-cluster_stores // MAX_CUSTOMERS_PER_REP)
        
        if new_hires < MAX_NEW_HIRES:
            hires_to_add = min(hires_needed, MAX_NEW_HIRES - new_hires)
            new_hires += hires_to_add
            stores_to_add = min(cluster_stores, hires_to_add * MAX_CUSTOMERS_PER_REP)
            assigned_stores += stores_to_add
            total_sales += (stores_to_add / cluster_stores) * sales
        
        if new_hires >= MAX_NEW_HIRES:
            break
    
    return new_hires, assigned_stores, total_sales


#### New hires are assigned to stores

It prioritizes clusters with the most remaining potential customers and assigns them to new hires up to the maximum customers per rep.

This ensures that new hires are placed in areas with the highest concentration of potential customers, maximizing their potential impact on sales.

In [13]:
def assign_new_hires_by_clusters(optimal_hires, remaining_potential, cluster_centers):
    new_hire_assignments = []
    
    for i in range(optimal_hires):
        hire_id = f'new_hire_{i+1}'
        # Get the cluster center with the highest remaining potential
        top_cluster = remaining_potential['cluster'].value_counts().idxmax()
        cluster_customers = remaining_potential[remaining_potential['cluster'] == top_cluster]
        assigned = cluster_customers.nlargest(MAX_CUSTOMERS_PER_REP, 'sales_amount')
        
        new_hire_assignments.append(pd.DataFrame({
            'id': assigned['id'],
            'sales_representative_id': hire_id,
            'city': assigned['city'],
            'longitude': assigned['longitude'],
            'latitude': assigned['latitude'],
            'sales_amount': assigned['sales_amount'],
            'customer_value': assigned['customer_value'],
            'assignment_type': 'new hire'
        }))
        
        remaining_potential = remaining_potential[~remaining_potential['id'].isin(assigned['id'])]
    
    return pd.concat(new_hire_assignments, ignore_index=True)


### Main execution

In [14]:
# Main execution
scaled_features, all_customers = prepare_data(current_customers, potential_customers)
optimal_clusters = find_optimal_kmeans_clusters(scaled_features)
current_customers_clustered, potential_customers_clustered, cluster_centers, cluster_sales = perform_clustering(scaled_features, all_customers, optimal_clusters)
visualize_clusters(current_customers_clustered, potential_customers_clustered, cluster_centers)

# Assigning potential customers to current sales reps based on city and clusters
current_assignments, remaining_potential = assign_to_current_reps_by_city_and_cluster(current_customers_clustered, potential_customers_clustered, store_df)

# Calculating new hires needed based on cluster sales
new_hires, new_stores, new_sales = calculate_new_hires_by_clusters(cluster_sales, remaining_potential)

# Assigning new hires based on clusters
new_hire_assignments = assign_new_hires_by_clusters(new_hires, remaining_potential, cluster_centers)


Elbow point suggests 2 clusters
Max silhouette score suggests 20 clusters
Chosen optimal number of clusters: 20


In [15]:
for i, center in enumerate(cluster_centers):
    print(f"Cluster {i} Center: {center}")

Cluster 0 Center: [ 0.49614579  1.4653317  -0.7624392  -0.13300682]
Cluster 1 Center: [ 0.3756843  -0.26510364  1.87712464  0.04706664]
Cluster 2 Center: [-0.03739592 -0.49849257 -0.78032123 -0.03949186]
Cluster 3 Center: [-5.47334022e-02 -3.87811323e-01 -2.21277891e-01  1.83785228e+02]
Cluster 4 Center: [ 1.59052151 -1.89652355 -0.47457824 -0.11239921]
Cluster 5 Center: [-1.72919653 -1.11751072  0.18620228  0.04155616]
Cluster 6 Center: [ 1.40545565 -0.15203038  0.4338929   0.05890705]
Cluster 7 Center: [0.50535592 1.40444827 0.36323438 0.04301492]
Cluster 8 Center: [-0.88006803  0.64542714 -0.73415174 -0.07965247]
Cluster 9 Center: [-1.72951598 -1.1180691  -0.82727702 -0.19802538]
Cluster 10 Center: [0.23791021 0.520297   1.9386658  1.96908645]
Cluster 11 Center: [-0.0418697  -0.49234224  0.25261406  0.0335429 ]
Cluster 12 Center: [-0.88237474  0.64571622  0.35524573  0.04215237]
Cluster 13 Center: [-0.43797431 -0.60418082  2.36680786  3.05361528]
Cluster 14 Center: [ 1.39727865 -0.1

In [16]:
# Assuming cluster_centers is your array of cluster centers
for i, center in enumerate(cluster_centers):
    print(f"\nCluster {i} Center:")
    for j, value in enumerate(center):
        feature_name = all_customers.columns[j]  # Get the feature name
        print(f"  {feature_name}: {value:.4f}")
    
    # Get all customers in this cluster
    cluster_customers = all_customers[all_customers['cluster'] == i]
    
    print(f"\n  Cluster Size: {len(cluster_customers)}")
    print(f"  Average Sales Amount: ${cluster_customers['sales_amount'].mean():.2f}")
    
    # Top 3 cities in this cluster
    top_cities = cluster_customers['city'].value_counts().head(3)
    print("\n  Top 3 cities:")
    for city, count in top_cities.items():
        print(f"    {city}: {count}")
    
    print("\n" + "="*50)


Cluster 0 Center:
  id: 0.4961
  store_category: 1.4653
  store_sub_category: -0.7624
  description: -0.1330

  Cluster Size: 4456
  Average Sales Amount: $28389.66

  Top 3 cities:
    barcelona: 4456


Cluster 1 Center:
  id: 0.3757
  store_category: -0.2651
  store_sub_category: 1.8771
  description: 0.0471

  Cluster Size: 2165
  Average Sales Amount: $40554.18

  Top 3 cities:
    madrid: 1263
    bilbao: 276
    zaragoza: 254


Cluster 2 Center:
  id: -0.0374
  store_category: -0.4985
  store_sub_category: -0.7803
  description: -0.0395

  Cluster Size: 5552
  Average Sales Amount: $28307.40

  Top 3 cities:
    madrid: 4382
    caceres: 563
    valladolid: 516


Cluster 3 Center:
  id: -0.0547
  store_category: -0.3878
  store_sub_category: -0.2213
  description: 183.7852

  Cluster Size: 1
  Average Sales Amount: $30884.23

  Top 3 cities:
    madrid: 1


Cluster 4 Center:
  id: 1.5905
  store_category: -1.8965
  store_sub_category: -0.4746
  description: -0.1124

  Cluster Si

#### Customer Assignment - Calculate sales per customer and threshold

Assigns potential customers to current sales representatives based on the clustering results

we set a threshold at 70% of the first new hire's sales per customer and determines the optimal number of new hires based on this threshold.
This ensures that new hires are only added if they meet a certain performance standard, balancing growth with efficiency

Explain: This threshold (0.7) helps us determine how many new sales reps to hire. We keep "hiring" (in our projection) as long as the sales per customer stays above this threshold.

What the 70% threshold means:
- By setting the threshold at 70% of the first new hire's sales per customer, we're saying that we're willing to hire new sales representatives as long as their expected performance is at least 70% as good as the best available opportunity (represented by the first potential new hire).

Implications of increasing the threshold (e.g., to 90%):

1. More conservative approach
2. Fewer new hires recommended
3. Higher average performance of new hires
4. Slower expansion but potentially more profitable
5. Lower risk of hiring underperforming sales reps
6. Might miss out on some growth opportunities


In [17]:
current_sales = current_customers['sales_amount'].sum()
current_stores = store_df['og_store_count'].sum()
current_spc = current_sales / current_stores

incremental_sales = []
cumulative_sales = []
cumulative_stores = []
sales_per_customer = []

# Calculate stores covered by existing reps
existing_stores = store_df['og_store_count'].sum()
newly_assigned_to_existing = len(current_assignments)
total_existing_rep_stores = existing_stores + newly_assigned_to_existing

unique_cities = remaining_potential['city'].unique()
for i in range(min(new_hires, len(unique_cities))):
    city = unique_cities[i]
    city_customers = remaining_potential[remaining_potential['city'] == city]
    new_sales = city_customers['sales_amount'].nlargest(MAX_CUSTOMERS_PER_REP).sum()
    
    incremental_sales.append(new_sales)
    cumulative_sales.append(sum(incremental_sales))
    cumulative_stores.append((i + 1) * MAX_CUSTOMERS_PER_REP)
    
    sales_per_customer.append(new_sales / MAX_CUSTOMERS_PER_REP)

new_hires = len(incremental_sales)

# Thresholds to analyze
thresholds = [0.7, 0.8, 0.9]

print("Threshold Analysis:")
print("==================")
for threshold_percent in thresholds:
    threshold_spc = sales_per_customer[0] * threshold_percent
    optimal_hires = sum(spc >= threshold_spc for spc in sales_per_customer)
    
    if optimal_hires > 0:
        total_potential_sales = cumulative_sales[optimal_hires-1]
        total_new_stores = cumulative_stores[optimal_hires-1]
    else:
        total_potential_sales = 0
        total_new_stores = 0

    print(f"With {threshold_percent*100}% threshold: Recommend hiring {optimal_hires} new reps")
    print(f"Total potential new sales: ${total_potential_sales:,.2f} and total new stores covered: {total_new_stores}\n")

Threshold Analysis:
With 70.0% threshold: Recommend hiring 12 new reps
Total potential new sales: $144,183,572.67 and total new stores covered: 3600

With 80.0% threshold: Recommend hiring 11 new reps
Total potential new sales: $132,810,701.97 and total new stores covered: 3300

With 90.0% threshold: Recommend hiring 9 new reps
Total potential new sales: $109,801,896.44 and total new stores covered: 2700



In [18]:
print("Sales per customer list:", sales_per_customer)

Sales per customer list: [41601.961760702136, 41504.56749482683, 37383.79591307471, 43851.0986054186, 43306.93874589396, 40998.762902825, 41556.560027491054, 33004.55811454327, 42798.07790792944, 40135.1076865666, 36560.91075606712, 37909.568984640646, 20129.742779618577, 19475.61033777848, 4206.821709170283]


## Strategic Planning and Resource Allocation

Different thresholds help in strategic planning by providing scenarios for resource allocation.

### 1.Conservative Approach: The 70% Threshold 
It is a part of a risk-averse strategy, ensuring minimal acceptable performance before expanding the sales team.
### 2. Balanced Approach: The 80% Threshold 
It can be used to balance between risk and reward, targeting steady growth.
### 3. Aggressive Growth: The 90% Threshold 
It can be part of an aggressive growth strategy, pushing the sales team to maximize performance and capitalize on market opportunities.

In [19]:
# Calculate sales per customer and threshold
current_sales = current_customers['sales_amount'].sum()
current_stores = store_df['og_store_count'].sum()
current_spc = current_sales / current_stores

incremental_sales = []
cumulative_sales = []
cumulative_stores = []
sales_per_customer = []

# Calculate stores covered by existing reps
existing_stores = store_df['og_store_count'].sum()
newly_assigned_to_existing = len(current_assignments)
total_existing_rep_stores = existing_stores + newly_assigned_to_existing

unique_cities = remaining_potential['city'].unique()
for i in range(min(new_hires, len(unique_cities))):
    city = unique_cities[i]
    city_customers = remaining_potential[remaining_potential['city'] == city]
    new_sales = city_customers['sales_amount'].nlargest(MAX_CUSTOMERS_PER_REP).sum()
    
    incremental_sales.append(new_sales)
    cumulative_sales.append(sum(incremental_sales))
    cumulative_stores.append((i + 1) * MAX_CUSTOMERS_PER_REP)
    
    sales_per_customer.append(new_sales / MAX_CUSTOMERS_PER_REP)

new_hires = len(incremental_sales)

# Thresholds to analyze
thresholds = [0.7, 0.8, 0.9]

print("Threshold Analysis:")
print("==================")
for threshold_percent in thresholds:
    threshold_spc = sales_per_customer[0] * threshold_percent
    optimal_hires = sum(spc >= threshold_spc for spc in sales_per_customer)
    
    if optimal_hires > 0:
        total_potential_sales = cumulative_sales[optimal_hires-1]
        total_new_stores = cumulative_stores[optimal_hires-1]
    else:
        total_potential_sales = 0
        total_new_stores = 0

    print(f"With {threshold_percent*100}% threshold: Recommend hiring {optimal_hires} new reps")
    print(f"Total potential new sales: ${total_potential_sales:,.2f} and total new stores covered: {total_new_stores}\n")


Threshold Analysis:
With 70.0% threshold: Recommend hiring 12 new reps
Total potential new sales: $144,183,572.67 and total new stores covered: 3600

With 80.0% threshold: Recommend hiring 11 new reps
Total potential new sales: $132,810,701.97 and total new stores covered: 3300

With 90.0% threshold: Recommend hiring 9 new reps
Total potential new sales: $109,801,896.44 and total new stores covered: 2700



In [20]:
# Calculate sales per customer and threshold
current_sales = current_customers['sales_amount'].sum()
current_stores = store_df['og_store_count'].sum()
current_spc = current_sales / current_stores

incremental_sales = []
cumulative_sales = []
cumulative_stores = []
sales_per_customer = []

# Calculate stores covered by existing reps
existing_stores = store_df['og_store_count'].sum()
newly_assigned_to_existing = len(current_assignments)
total_existing_rep_stores = existing_stores + newly_assigned_to_existing

unique_cities = remaining_potential['city'].unique()
for i in range(min(new_hires, len(unique_cities))):
    city = unique_cities[i]
    city_customers = remaining_potential[remaining_potential['city'] == city]
    new_sales = city_customers['sales_amount'].nlargest(MAX_CUSTOMERS_PER_REP).sum()
    
    incremental_sales.append(new_sales)
    cumulative_sales.append(sum(incremental_sales))
    cumulative_stores.append((i + 1) * MAX_CUSTOMERS_PER_REP)
    
    sales_per_customer.append(new_sales / MAX_CUSTOMERS_PER_REP)

new_hires = len(incremental_sales)

if sales_per_customer:
    threshold_spc = sales_per_customer[0] * 0.80
    optimal_hires = sum(spc >= threshold_spc for spc in sales_per_customer)
else:
    print("Warning: No potential new hires found.")
    threshold_spc = 0
    optimal_hires = 0

total_new_sales = sum(incremental_sales[:optimal_hires])
total_new_stores = (optimal_hires * MAX_CUSTOMERS_PER_REP) + newly_assigned_to_existing
grand_total_stores = total_existing_rep_stores + (optimal_hires * MAX_CUSTOMERS_PER_REP)

print(f"Original stores covered by existing reps: {existing_stores}")
print(f"Newly assigned stores to existing reps: {newly_assigned_to_existing}")
print(f"Total stores covered by existing reps: {total_existing_rep_stores}")
print(f"Stores covered by new hires: {optimal_hires * MAX_CUSTOMERS_PER_REP}")
print(f"Total new stores covered (newly assigned + new hires): {total_new_stores}")
print(f"Grand total of all stores covered: {grand_total_stores}")
print(f"Optimal hires to reach threshold: {optimal_hires}")

Original stores covered by existing reps: 9654
Newly assigned stores to existing reps: 518
Total stores covered by existing reps: 10172
Stores covered by new hires: 3300
Total new stores covered (newly assigned + new hires): 3818
Grand total of all stores covered: 13472
Optimal hires to reach threshold: 11


### Analysis and Visualization

In [21]:
def create_visualizations():
    fig1 = go.Figure()
    fig1.add_trace(go.Scatter(x=list(range(1, len(sales_per_customer) + 1)), y=sales_per_customer, mode='lines+markers', name='Sales per Customer'))
    fig1.add_trace(go.Scatter(x=[1, len(sales_per_customer)], y=[threshold_spc, threshold_spc], mode='lines', name='Threshold', line=dict(color='red', dash='dash')))
    fig1.add_vline(x=optimal_hires, line_dash="dash", line_color="green", annotation_text="Optimal Hires")
    fig1.update_layout(title='Sales per Customer for Each New Hire', xaxis_title='Number of New Hires', yaxis_title='Sales per Customer ($)')
    fig1.show()

    growth_rates = [(current_sales + sales) / current_sales - 1 for sales in cumulative_sales]
    fig2 = go.Figure()
    fig2.add_trace(go.Scatter(x=list(range(1, len(growth_rates) + 1)), y=growth_rates, mode='lines+markers', name='Growth Rate'))
    fig2.add_vline(x=optimal_hires, line_dash="dash", line_color="green", annotation_text="Optimal Hires")
    fig2.update_layout(title='Revenue Growth Rate vs Number of New Hires', xaxis_title='Number of New Hires', yaxis_title='Revenue Growth Rate (%)')
    fig2.show()

    new_hire_sales = [current_sales + cum_sales for cum_sales in cumulative_sales]
    marginal_benefit = [new_hire_sales[0] - current_sales] + [new_hire_sales[i] - new_hire_sales[i-1] for i in range(1, len(new_hire_sales))]
    
    fig3 = make_subplots(specs=[[{"secondary_y": True}]])
    fig3.add_trace(go.Scatter(x=list(range(1, len(new_hire_sales) + 1)), y=new_hire_sales, mode='lines+markers', name='Cumulative Sales'), secondary_y=False)
    fig3.add_trace(go.Scatter(x=list(range(1, len(marginal_benefit) + 1)), y=marginal_benefit, mode='lines+markers', name='Marginal Benefit'), secondary_y=True)
    fig3.add_vline(x=optimal_hires, line_dash="dash", line_color="green", annotation_text=f"Optimal Hires ({optimal_hires})")
    fig3.update_layout(title='Sales Growth and Marginal Benefit per New Hire', xaxis_title='Number of New Hires')
    fig3.update_yaxes(title_text="Cumulative Sales ($)", secondary_y=False)
    fig3.update_yaxes(title_text="Marginal Benefit ($)", secondary_y=True)
    fig3.show()

    current_reps = len(current_customers['sales_representative_id'].unique())
    new_hire_sales = [current_sales + cum_sales for cum_sales in cumulative_sales]

    x_current = list(range(1, current_reps + 1))
    x_new = list(range(current_reps + 1, current_reps + len(new_hire_sales) + 1))

    def log_curve(x, a, b):
        return a * np.log(x) + b

    a = current_sales / np.log(current_reps)
    b = 0
    y_current = [log_curve(x, a, b) for x in x_current]

    fig4 = go.Figure()

    fig4.add_trace(go.Scatter(
        x=x_current,
        y=y_current,
        mode='lines',
        name='Current Sales Reps',
        line=dict(color='blue')
    ))

    fig4.add_trace(go.Scatter(
        x=x_new,
        y=new_hire_sales,
        mode='lines',
        name='With New Hires',
        line=dict(color='red')
    ))

    fig4.update_layout(
        title='Sales Amount vs Number of Sales Representatives',
        xaxis_title='Number of Sales Representatives',
        yaxis_title='Sales Amount (€)',
        showlegend=True,
        xaxis=dict(tickmode='linear', dtick=5),
        yaxis=dict(tickformat=',.0f'),
    )

    fig4.show()

create_visualizations()


#### Assigns the optimal number of new hires to cities with the most potential customers

In [22]:
new_hire_assignments = []
for i in range(optimal_hires):
    hire_id = f'new_hire_{i+1}'
    city = remaining_potential['city'].value_counts().index[i]
    city_customers = remaining_potential[remaining_potential['city'] == city]
    assigned = city_customers.nlargest(MAX_CUSTOMERS_PER_REP, 'sales_amount')
    
    new_hire_assignments.append(pd.DataFrame({
        'id': assigned['id'],
        'sales_representative_id': hire_id,
        'city': assigned['city'],
        'longitude': assigned['longitude'],
        'latitude': assigned['latitude'],
        'sales_amount': assigned['sales_amount'],
        'customer_value': assigned['customer_value'],
        'assignment_type': 'new hire'
    }))
    
    remaining_potential = remaining_potential[~remaining_potential['id'].isin(assigned['id'])]

new_hire_assignments = pd.concat(new_hire_assignments, ignore_index=True)


In [23]:
# Calculate growth rates
growth_rates = [((current_sales + sales) / current_sales - 1)*100 for sales in cumulative_sales]

In [24]:
# Business Explanation
print("\nBusiness Explanation:")
print(f"""
Based on our analysis of sales per customer and revenue growth rate, we recommend hiring {optimal_hires} new sales representatives. This expansion strategy would allow us to cover {total_new_stores} new stores and potentially increase our sales by ${total_new_sales:,.2f}.

Key Points:
1. Expansion Potential: Each new hire is expected to bring in sales above our threshold of ${threshold_spc:.2f} per customer, ensuring that our expansion remains profitable while significantly increasing our market presence.
2. Aggressive Growth: This strategy aligns with our goal to expand rapidly, as we're recommending a substantial increase in our sales force from the current {current_customers['sales_representative_id'].nunique()} to {current_customers['sales_representative_id'].nunique() + optimal_hires} representatives.
3. Revenue Growth: With {optimal_hires} new hires, we project a revenue growth rate of {growth_rates[optimal_hires-1]:.2f}%. This significant increase in revenue supports our expansion goals.
4. Diminishing Returns: While we observe some diminishing returns in both sales per customer and revenue growth rate as we hire more representatives, the recommended hires all contribute to substantial growth.
5. Market Coverage: This expansion would allow us to cover {total_new_stores} additional stores, greatly increasing our market penetration and potential for future growth.""")


Business Explanation:

Based on our analysis of sales per customer and revenue growth rate, we recommend hiring 11 new sales representatives. This expansion strategy would allow us to cover 3818 new stores and potentially increase our sales by $132,810,701.97.

Key Points:
1. Expansion Potential: Each new hire is expected to bring in sales above our threshold of $33281.57 per customer, ensuring that our expansion remains profitable while significantly increasing our market presence.
2. Aggressive Growth: This strategy aligns with our goal to expand rapidly, as we're recommending a substantial increase in our sales force from the current 33 to 44 representatives.
3. Revenue Growth: With 11 new hires, we project a revenue growth rate of 43.22%. This significant increase in revenue supports our expansion goals.
4. Diminishing Returns: While we observe some diminishing returns in both sales per customer and revenue growth rate as we hire more representatives, the recommended hires all con

In [25]:
print("Considerations and Next Steps:")
print(f"""

1. Phased Implementation: We recommend implementing this expansion in phases, closely monitoring the performance of each new cohort of hires to ensure we're meeting our sales targets and growth projections.

2. Cost Analysis: While we don't have specific costs for hiring and training, management should conduct a detailed cost-benefit analysis to ensure the expansion remains profitable, considering the projected revenue growth.

3. Infrastructure and Support: Assess and plan for the additional infrastructure and support needed to manage this significant expansion effectively.

4. Market Saturation: Monitor for signs of market saturation as we expand, and be prepared to adjust our strategy if the actual revenue growth rate falls short of projections.

5. Training and Quality: Ensure we maintain the quality of our sales force by implementing robust training programs for the new hires to achieve the projected sales figures.

6. Regional Focus: Analyze the performance data to identify high-potential regions where we might want to concentrate our expansion efforts for maximum revenue growth.

7. Long-term Strategy: Consider how this rapid expansion aligns with our long-term business strategy and be prepared to adjust our growth targets as we capture more market share.

This expansion strategy presents a significant opportunity for growth, with a projected {growth_rates[optimal_hires-1]:.2f}% increase in revenue. However, it's crucial to implement it thoughtfully and monitor its progress closely. Regular review and adjustment of the plan will be key to its success, ensuring that we capitalize on the potential growth while managing the risks associated with rapid expansion.
""")

Considerations and Next Steps:


1. Phased Implementation: We recommend implementing this expansion in phases, closely monitoring the performance of each new cohort of hires to ensure we're meeting our sales targets and growth projections.

2. Cost Analysis: While we don't have specific costs for hiring and training, management should conduct a detailed cost-benefit analysis to ensure the expansion remains profitable, considering the projected revenue growth.

3. Infrastructure and Support: Assess and plan for the additional infrastructure and support needed to manage this significant expansion effectively.

4. Market Saturation: Monitor for signs of market saturation as we expand, and be prepared to adjust our strategy if the actual revenue growth rate falls short of projections.

5. Training and Quality: Ensure we maintain the quality of our sales force by implementing robust training programs for the new hires to achieve the projected sales figures.

6. Regional Focus: Analyze the p

#### Combine and save to CSV

In [26]:
all_assignments_1 = pd.concat([current_assignments, new_hire_assignments], ignore_index=True)

In [27]:
heatmap_fig = px.scatter_mapbox(
    all_assignments_1, 
    lat="latitude", 
    lon="longitude", 
    color="sales_representative_id",   
    height=500,
    title="Heatmap of all store categories"
)

heatmap_fig.update_layout(
    mapbox_style="open-street-map"
)

heatmap_fig.show()
fig = px.histogram(all_assignments_1, x='sales_amount', color='customer_value', barmode='overlay')
fig.update_layout(title='Histogram of Sales Amount')
fig.show()









In [28]:
all_assignments_1['customer_value'].value_counts()

customer_value
Very High    2908
Medium        491
High          310
Low           109
Name: count, dtype: int64

In [29]:
all_assignments_1['sales_representative_id'].value_counts()

sales_representative_id
new_hire_11    300
new_hire_5     300
new_hire_10    300
new_hire_9     300
new_hire_8     300
new_hire_7     300
new_hire_6     300
new_hire_1     300
new_hire_2     300
new_hire_3     300
new_hire_4     300
27              71
9               70
17              70
24              70
8               47
1               43
28              37
11              36
4               27
3               17
13              16
18              14
Name: count, dtype: int64