In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the mall-customer dataset (path is relative to the notebook's working directory)
data = pd.read_csv('Mall_Customers.csv')
# Preview only the first rows rather than rendering the whole frame as cell output
data.head()

In [None]:
# Report how many missing entries each column contains
print("Null Values:", data.isna().sum(), sep="\n")

In [None]:
# Summary statistics (count, mean, spread, quartiles) for the numeric columns
summary_stats = data.describe()
print("Descriptive Statistics:", summary_stats, sep="\n")

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple
data.shape

In [None]:
# Column labels of the loaded frame
data.columns

In [None]:
# Explore the distinct values present in 'Spending Score (1-100)'
print("Unique Values in Spending Score:")
# Series.unique() lists the distinct scores themselves; chaining it after
# value_counts() would instead list the distinct *frequencies* of those scores.
print(data['Spending Score (1-100)'].unique())

In [None]:
# How often each distinct annual-income value occurs
income_column = data['Annual Income (k$)']
print("Value Counts for Annual Income:")
print(income_column.value_counts())

In [None]:
# Columns fed to the clustering model
features = [
    'Age',
    'Annual Income (k$)',
    'Spending Score (1-100)',
]
X = data.loc[:, features]

In [None]:
# Learn each feature's mean and standard deviation, then rescale the features with them
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Initialize K-means clustering algorithm
num_clusters = 5
# n_init is pinned because its default differs between scikit-learn releases
# (10 restarts historically, 'auto' in newer versions). Together with the fixed
# random_state this keeps the segmentation reproducible across environments.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)

In [None]:
# Fit on the standardized features and store each row's cluster id
# as a new 'Cluster' column on the original frame
data['Cluster'] = kmeans.fit_predict(X_scaled)


In [None]:
# Income vs. spending score, one point per row, colored by assigned cluster
fig, ax = plt.subplots()
ax.scatter(
    data['Annual Income (k$)'],
    data['Spending Score (1-100)'],
    c=data['Cluster'],
    cmap='rainbow',
)
ax.set_xlabel('Annual Income')
ax.set_ylabel('Spending Score (1-100)')
ax.set_title('K-means Clustering of Customers')
plt.show()

In [None]:
# Centroids in standardized space, then mapped back to the original feature units
cluster_centers_scaled = kmeans.cluster_centers_
cluster_centers_original = scaler.inverse_transform(cluster_centers_scaled)
for heading, centers in (
    ("Cluster Centers (Scaled):", cluster_centers_scaled),
    ("Cluster Centers (Original):", cluster_centers_original),
):
    print(heading)
    print(centers)

In [None]:
# Draw one scatter series per cluster so each gets its own legend entry
fig, ax = plt.subplots()
for cluster_id in range(num_clusters):
    in_cluster = data['Cluster'].eq(cluster_id)
    ax.scatter(
        data.loc[in_cluster, 'Annual Income (k$)'],
        data.loc[in_cluster, 'Spending Score (1-100)'],
        label=f'Cluster {cluster_id}',
    )
ax.set_xlabel('Annual Income')
ax.set_ylabel('Spending Score (1-100)')
ax.set_title('Cluster Profiles')
ax.legend()
plt.show()

In [None]:
# One figure per clustering feature, showing its spread within each cluster
for feature in features:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.boxplot(data=data, x='Cluster', y=feature, ax=ax)
    ax.set_title(f'Box Plot of {feature} by Cluster')
    plt.show()

In [None]:
# Pair plot of the clustering features, colored by cluster.
# vars=features limits the grid to the columns that were actually clustered on,
# consistent with the per-feature box and distribution plots in this notebook;
# without it every numeric column (e.g. an ID column, if present) is plotted too.
sns.pairplot(data=data, vars=features, hue='Cluster', diag_kind='kde')
# pairplot builds a grid of axes, so plt.title() would only label the last
# subplot; suptitle() titles the whole figure. y > 1 lifts it above the top row.
plt.suptitle('Pair Plot Colored by Cluster', y=1.02)
plt.show()

In [None]:
# Overlaid per-cluster histograms (with KDE curves) for each clustering feature
for feature in features:
    fig, ax = plt.subplots(figsize=(8, 6))
    for cluster_id in range(num_clusters):
        members = data.loc[data['Cluster'] == cluster_id, feature]
        sns.histplot(members, label=f'Cluster {cluster_id}', kde=True, ax=ax)
    ax.set_title(f'Distribution of {feature} by Cluster')
    ax.legend()
    plt.show()

In [None]:
# Function to perform K-means clustering
# NOTE: a function with this same name is defined again in a later cell; once that
# cell runs, the later definition replaces this one.
def perform_clustering():
    """Cluster customers on income and spending score and plot the result.

    Reads the requested number of clusters from the module-level
    ``cluster_entry`` widget, loads the customer CSV, fits K-means on the
    unscaled 'Annual Income (k$)' and 'Spending Score (1-100)' columns, and
    shows a scatter plot colored by cluster with the centroids overlaid in red.

    Takes no arguments and returns None. A ``ValueError`` raised anywhere in
    the body (for example a non-integer entry) is reported through a
    ``messagebox`` error dialog instead of propagating. ``cluster_entry`` and
    ``messagebox`` are created in later cells and must exist before this
    function is called.
    """
    try:
        num_clusters = int(cluster_entry.get())
        # Relative path, matching how the dataset is loaded elsewhere in this
        # notebook; the previous absolute Windows path only resolved on one machine.
        data = pd.read_csv('Mall_Customers.csv')
        X = data[['Annual Income (k$)', 'Spending Score (1-100)']]

        kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(X)
        labels = kmeans.labels_

        plt.figure(figsize=(8, 6))
        plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=labels, cmap='viridis')
        # Larger red markers show the fitted centroids on top of the customers
        plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='red')
        plt.xlabel('Annual Income (k$)')
        plt.ylabel('Spending Score (1-100)')
        plt.title('K-means Clustering')
        plt.show()

    except ValueError:
        messagebox.showerror("Error", "Please enter a valid number of clusters.")

In [None]:
#GUI setup
import tkinter as tk
from tkinter import ttk
from tkinter import messagebox

In [34]:
# Button callback that runs K-means for the cluster count typed into the GUI
def perform_clustering():
    """Fit K-means on income vs. spending score and display the clusters.

    The cluster count comes from the module-level ``cluster_entry`` widget.
    The customer CSV is re-read on every call, K-means is fitted on the two
    unscaled columns, and a scatter plot colored by cluster is shown with the
    centroids drawn as large red markers.

    Returns None. Any ``ValueError`` raised in the body (such as non-integer
    input) results in an error dialog rather than an exception.
    """
    income_col = 'Annual Income (k$)'
    score_col = 'Spending Score (1-100)'
    try:
        requested_k = int(cluster_entry.get())
        customers = pd.read_csv('Mall_Customers.csv')
        points = customers[[income_col, score_col]]

        model = KMeans(n_clusters=requested_k, random_state=0)
        model.fit(points)
        # Two fitted columns -> the transposed centroid array unpacks to x and y
        centroid_x, centroid_y = model.cluster_centers_.T

        plt.figure(figsize=(8, 6))
        plt.scatter(points[income_col], points[score_col], c=model.labels_, cmap='viridis')
        plt.scatter(centroid_x, centroid_y, s=300, c='red')
        plt.xlabel(income_col)
        plt.ylabel(score_col)
        plt.title('K-means Clustering')
        plt.show()
    except ValueError:
        messagebox.showerror("Error", "Please enter a valid number of clusters.")

# Build the window: a labelled entry for the cluster count and a button
# that triggers perform_clustering
root = tk.Tk()
root.title("Customer Segmentation")

frame = ttk.Frame(root, padding="20")
frame.grid(row=0, column=0, sticky=tk.NSEW)

cluster_label = ttk.Label(frame, text="Number of Clusters:")
cluster_label.grid(row=0, column=0, sticky=tk.W)

cluster_entry = ttk.Entry(frame)
cluster_entry.grid(row=0, column=1)

cluster_button = ttk.Button(frame, text="Perform Clustering", command=perform_clustering)
cluster_button.grid(row=1, column=0, columnspan=2)

# Hand control to Tk's event loop
root.mainloop()