<a href="https://colab.research.google.com/github/linhb03/Ai118Project/blob/dev/Customer_Segmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

For this project, I created a fictional dataset to simulate customer information. The dataset includes features such as age, annual income (USD), and spending score (1–100). These variables were designed to reflect realistic customer behaviors for segmentation purposes.

In [8]:
import io

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from google.colab import files
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# --- Step 1: load the customer CSV supplied through the Colab upload widget ---
print("Please upload your customer data CSV file.")
uploaded = files.upload()
if not uploaded:
    # The mapping may be empty (e.g. the upload dialog was dismissed). Fail
    # with an actionable message instead of a bare StopIteration below.
    raise ValueError(
        "No file was uploaded; re-run this cell and select a CSV file."
    )

# Only the first uploaded file is used; any additional files are ignored.
file_name = next(iter(uploaded))

# Parse the bytes returned by the widget rather than re-opening a path on
# disk: Colab may save the upload under a de-duplicated name such as
# "name (5).csv", so reading by the dictionary key could pick up a stale copy
# from an earlier upload.
df = pd.read_csv(io.BytesIO(uploaded[file_name]))

# Normalise headers so later code can rely on lower-case, underscore-separated
# names (e.g. "Annual Income (USD)" -> "annual_income_(usd)").
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

print("\nData uploaded and cleaned successfully:")
print("New column names:", df.columns.tolist())
print(df.head())
print("\n---")

# --- Step 2: scale the clustering features and draw the elbow curve ---
features_to_cluster = ['age', 'annual_income_(usd)', 'spending_score_(1-100)']
X = df[features_to_cluster]

# Standardise each feature to zero mean / unit variance so that large-valued
# columns (income) do not dominate the Euclidean distances used by K-means.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Try K = 1..7, but never more clusters than there are rows: KMeans raises
# ValueError when n_clusters exceeds the number of samples.
max_k = min(7, len(X_scaled))
inertia = []
K = range(1, max_k + 1)
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
    kmeans.fit(X_scaled)
    # inertia_ is the within-cluster sum of squared distances for this K.
    inertia.append(kmeans.inertia_)


# Save the elbow curve to disk; the figure is closed afterwards so it is not
# left open for later plotting calls.
plt.figure(figsize=(8, 5))
plt.plot(K, inertia, 'bo-')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal K')
plt.savefig('elbow_plot.png')
plt.close()

# --- Step 3: fit the final model and label every customer ---
# NOTE(review): 3 is hard-coded; presumably read off the elbow plot -- confirm.
optimal_k = 3
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init='auto')
kmeans.fit(X_scaled)
# labels_ holds the cluster index assigned to each row of X_scaled.
df['cluster'] = kmeans.labels_

print("\nClustering complete:")
print(df.head())
print("\n---")

# Per-cluster mean of each (unscaled) feature, to help interpret the segments.
cluster_means = df.groupby('cluster')[features_to_cluster].mean()
print("Cluster Analysis:")
print(cluster_means)
print("\n---")

# --- Step 4: scatter plot of the clusters on two of the features ---
# A 2-D scatter needs at least two features to plot against each other.
if len(features_to_cluster) >= 2:
    # Prefer income on the x-axis; otherwise fall back to the first feature.
    if 'annual_income_(usd)' in features_to_cluster:
        feature1_index = features_to_cluster.index('annual_income_(usd)')
    else:
        feature1_index = 0

    # Prefer spending score on the y-axis; otherwise fall back to a feature
    # that differs from the x-axis one, so a column is never plotted against
    # itself.
    if ('spending_score_(1-100)' in features_to_cluster
            and features_to_cluster.index('spending_score_(1-100)') != feature1_index):
        feature2_index = features_to_cluster.index('spending_score_(1-100)')
    else:
        feature2_index = 1 if feature1_index == 0 else 0

    x_feature = features_to_cluster[feature1_index]
    y_feature = features_to_cluster[feature2_index]

    plt.figure(figsize=(10, 6))
    # Colour each point by its assigned cluster label.
    plt.scatter(df[x_feature], df[y_feature], c=df['cluster'], cmap='viridis')
    plt.xlabel(x_feature)
    plt.ylabel(y_feature)
    plt.title('Customer Clusters')
    plt.savefig('customer_clusters.png')
    plt.close()
    # Report the axes actually used (the earlier wording claimed "first two
    # features", which was wrong whenever the preferred columns were present).
    print(f"Cluster visualization 'customer_clusters.png' created (x: {x_feature}, y: {y_feature}).")
    print("---")

Please upload your customer data CSV file.


Saving Customer Segmentation3.csv to Customer Segmentation3 (5).csv

Data uploaded and cleaned successfully:
New column names: ['customerid', 'age', 'region', 'annual_income_(usd)', 'spending_score_(1-100)']
   customerid  age region  annual_income_(usd)  spending_score_(1-100)
0           1   35  North               120000                      15
1           2   50   East               135000                      20
2           3   45   West               110000                      18
3           4   36  South                95000                      85
4           5   38  South               115000                      92

---

Clustering complete:
   customerid  age region  annual_income_(usd)  spending_score_(1-100)  \
0           1   35  North               120000                      15   
1           2   50   East               135000                      20   
2           3   45   West               110000                      18   
3           4   36  South                95