In [None]:
import matplotlib.pyplot as plt
import pandas as pd
# seaborn must be imported here: sns.set_palette() is called in this cell,
# and a fresh kernel would otherwise raise NameError before reaching the
# later cell that imports it.
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Load the updated dataset
df = pd.read_csv('../data/ifood_df_updated.csv')

# Define the color palette
custom_palette = ["#067A46", "#242424", "#FFFFFF", "#90B33A"]
# Set this as the default color palette for all seaborn plots
sns.set_palette(custom_palette)

# Select the features for clustering
features = [
    'Income', 'MntTotal', 'TotalPurchases', 'Age', 'Recency', 'Kids'
]
X = df[features]

# Handle outliers (a common preprocessing step)
# Keep only rows whose Income is strictly below 100,000; a very high income
# value can skew the clustering process.
X = X[X['Income'] < 100000]

# Standardize the data
# K-Means is sensitive to the scale of the data, so we scale it to have
# a mean of 0 and a standard deviation of 1.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Determine the optimal number of clusters using the Elbow Method.
# The candidate K values are defined once so that the fitting loop, the
# plotted x-values and the tick positions always stay in sync.
k_values = range(1, 11)
inertia = []
for n_clusters in k_values:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    # Record the fitted model's inertia_ for this K
    inertia.append(kmeans.inertia_)

# Plot the Elbow Method: inertia as a function of K
plt.figure(figsize=(10, 6))
plt.plot(k_values, inertia, marker='o')
plt.title('Elbow Method for Optimal K')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Inertia')
plt.xticks(k_values)

# Save the plot
plt.savefig('../reports/elbow_method.png')
print("Elbow Method plot has been generated and saved to 'reports/elbow_method.png'.")

In [None]:
import seaborn as sns

# Same feature set that was used for the Elbow Method
features = ['Income', 'MntTotal', 'TotalPurchases', 'Age', 'Recency', 'Kids']

# One boolean mask drives both the feature matrix and the labelled frame,
# so the rows fed to K-Means are exactly the rows that receive a label.
below_income_cap = df['Income'] < 100000
X = df.loc[below_income_cap, features]
df_clustered = df.loc[below_income_cap].copy()

# Standardize the features before clustering
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit K-Means with 3 clusters and attach each row's label
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
kmeans.fit(X_scaled)
df_clustered['cluster'] = kmeans.labels_

# --- Scatter plot of the segments: Income against MntTotal ---
fig_scatter, ax_scatter = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    data=df_clustered,
    x='Income',
    y='MntTotal',
    hue='cluster',
    style='cluster',
    palette='viridis',
    s=100,
    ax=ax_scatter,
)
ax_scatter.set_title('Customer Segments (K-Means)')
ax_scatter.set_xlabel('Income ($)')
ax_scatter.set_ylabel('Total Amount Spent ($)')
ax_scatter.legend(title='Cluster')
ax_scatter.grid(True)
fig_scatter.savefig('../reports/customer_segments_scatter.png')

# --- Cluster profiling: per-cluster mean of every clustering feature ---
cluster_profile = df_clustered.groupby('cluster')[features].mean().reset_index()
print("Cluster Profiles:")
print(cluster_profile)

# Long format (one row per cluster/metric pair) for the grouped bar chart
cluster_profile_melted = cluster_profile.melt(
    id_vars='cluster',
    var_name='Metric',
    value_name='Value',
)

fig_profile, ax_profile = plt.subplots(figsize=(15, 6))
sns.barplot(
    data=cluster_profile_melted,
    x='Metric',
    y='Value',
    hue='cluster',
    palette='viridis',
    ax=ax_profile,
)
ax_profile.set_title('Comparison of Customer Segments')
ax_profile.set_xlabel('Metric')
ax_profile.set_ylabel('Average Value')
# ax_profile is the current axes here, so pyplot's xticks acts on it
plt.xticks(rotation=45)
ax_profile.legend(title='Cluster')
fig_profile.tight_layout()
fig_profile.savefig('../reports/cluster_profiles_bar.png')

print("Cluster profiles have been generated and saved.")

In [None]:
# Remove the income outlier for consistency with the previous steps.
# .copy() gives an independent frame, so adding the 'cluster' column below
# does not trigger pandas' SettingWithCopyWarning.
df = df[df['Income'] < 100000].copy()

# Select the features used for clustering
features = [
    'Income', 'MntTotal', 'TotalPurchases', 'Age', 'Recency', 'Kids'
]
X = df[features]

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Re-run K-Means with the optimal number of clusters (3)
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(X_scaled)

# Set the visualization style
sns.set_style("whitegrid")

# Create a figure with three box plots side-by-side
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# One (column, title, y-label) entry per panel; the panels differ only in
# these three values, so they are drawn by a single loop.
boxplot_specs = [
    ('Income', 'Income Distribution by Cluster', 'Income ($)'),
    ('MntTotal', 'Total Spending by Cluster', 'Total Amount Spent ($)'),
    ('TotalPurchases', 'Total Purchases by Cluster', 'Total Purchases'),
]
for ax, (column, title, ylabel) in zip(axes, boxplot_specs):
    sns.boxplot(x='cluster', y=column, hue='cluster', data=df, ax=ax, palette='viridis')
    ax.set_title(title)
    ax.set_xlabel('Cluster')
    ax.set_ylabel(ylabel)

plt.tight_layout()
plt.savefig('../reports/cluster_boxplots.png')


# Now, create a grouped bar plot to compare product spending
# First, calculate the average spending on key products for each cluster
product_spending = df.groupby('cluster')[['MntWines', 'MntMeatProducts', 'MntFruits']].mean().reset_index()

# Reshape the data to long format (one row per cluster/product pair) for plotting
product_spending_melted = product_spending.melt('cluster', var_name='Product Type', value_name='Average Spend')

plt.figure(figsize=(10, 6))
sns.barplot(x='Product Type', y='Average Spend', hue='cluster', data=product_spending_melted, palette='viridis')
plt.title('Average Spending on Key Products by Cluster')
plt.xlabel('Product Type')
plt.ylabel('Average Spend ($)')
plt.legend(title='Cluster')
plt.savefig('../reports/cluster_product_spending.png')

print("Additional visualizations have been generated and saved to 'reports/cluster_boxplots.png' and 'reports/cluster_product_spending.png'.")