In [44]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.decomposition import PCA
import seaborn as sns

from ipywidgets import interact, FloatSlider

import plotly.express as px

In [45]:
# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.options.display.max_columns = None

# HW:
The data set describes the customers of a telecommunications company, including whether each customer has churned.
The task is to segment the customers based on their characteristics using the KMeans algorithm.

Do not use the following variables for grouping:
- churn?: has the customer dropped out?
- Contract_date: contract conclusion time
- Cust_ID: customer ID

In [46]:
# Load the semicolon-delimited customer sample into the working DataFrame.
file_path = "./telco_sampled.csv"
df = pd.read_csv(filepath_or_buffer=file_path, delimiter=';')

In [None]:
# Preview the first five rows to check that the file parsed as expected.
df.head(n=5)

In [None]:
# Class balance of the churn flag. This has to run before the data-preparation
# cell, which removes the 'churn?' column from df.
df.loc[:, 'churn?'].value_counts()

# 1. Subtask: (data preparation)
Use all variables except for the three variables above when creating the clusters. Perform data preparation so that the variables are input to the model in the appropriate form.

(hint: categorical variables, missing values, scaling, etc.)

In [49]:
# Exclude the columns that must not take part in the grouping
df = df.drop(columns=['churn?', 'Contract_date', 'Cust_ID'])

# Split the remaining columns by type so each kind gets its own imputation
numeric_cols = df.select_dtypes(include='number').columns
categorical_cols = [col for col in df.columns if col not in numeric_cols]

# Impute BEFORE one-hot encoding. get_dummies encodes a missing category as an
# all-zero row, which is indistinguishable from the dropped baseline category,
# so imputing afterwards can no longer see (or fix) those missing values.
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())  # Numerical: column mean
for col in categorical_cols:
    modes = df[col].mode()
    if not modes.empty:  # an all-missing column has no mode; leave it untouched
        df[col] = df[col].fillna(modes.iloc[0])  # Categorical: most frequent value

# Handle categorical variables (drop_first avoids a redundant dummy per variable)
df = pd.get_dummies(df, drop_first=True)

# Scaling: KMeans is distance based, so put every feature on a comparable scale
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)


# 2. Subtask: (clustering)
Find the optimal k value for the KMeans algorithm using the variables prepared in the previous task. Then group the customers.

In [None]:
# Elbow method: fit KMeans once per candidate k on the scaled features and
# collect the resulting inertia values.
k_values = range(1, 15)
inertias = [
    KMeans(n_clusters=k, random_state=42).fit(scaled_data).inertia_
    for k in k_values
]

plt.plot(k_values, inertias, marker='o')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.show()



# From the plot, k = 3 or k = 4 looks optimal; k = 3 is used for the final model
optimal_k = 3
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
# Fit the final model and store each customer's cluster label next to the features
df['Cluster'] = kmeans.fit_predict(scaled_data)


# 3. Subtask: (explanation of clusters / conclusions)
Try to find an explanation of what characterizes each group and what characteristics caused each customer to be in the given cluster.

In [None]:
# Analyze cluster characteristics: per-cluster mean of every column in df
cluster_profiles = df.groupby('Cluster').mean()

# Decision tree to explain clusters: a surrogate classifier trained to
# reproduce the KMeans labels from the same scaled features.
X = scaled_data
y = df['Cluster']

# Names for the columns of X only. The label column is skipped, and the list is
# cut to X's width so helper columns appended to df by other cells are ignored.
# A plain list is passed because some scikit-learn releases reject a pandas Index.
feature_names = [col for col in df.columns if col != 'Cluster'][:X.shape[1]]

# random_state makes the tree reproducible; max_depth keeps the plot readable
# (set max_depth=None to grow the full tree). Split thresholds are expressed in
# standardized units because X is the scaled data.
clf = DecisionTreeClassifier(max_depth=4, random_state=42)
clf.fit(X, y)

plt.figure(figsize=(20, 10))
plot_tree(
    clf,
    feature_names=feature_names,
    class_names=[f'Cluster {label}' for label in clf.classes_],
    filled=True,
)
plt.show()



In [None]:
# Project the scaled features onto their first two principal components
pca = PCA(n_components=2)
pca_components = pca.fit_transform(scaled_data)

# Pair every projected point with its cluster label for plotting
df_pca = pd.DataFrame(pca_components, columns=['PC1', 'PC2']).assign(Cluster=df['Cluster'])

# 2D scatter of the customers, coloured by cluster
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(data=df_pca, x='PC1', y='PC2', hue='Cluster', palette='Set2', s=60, ax=ax)
ax.set_title('Customer Clusters Visualized using PCA')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.legend(title='Cluster')
ax.grid(True)
plt.show()

In [None]:
# Step 1: Rebuild preprocessed_data from the model's feature columns only.
# Helper columns that cells in this notebook add to df (the cluster label and
# the PC1/PC2 coordinates) are excluded, so the result does not depend on
# which other cells have already been run.
non_feature_columns = ['Cust_ID', 'churn?', 'Contract_date', 'Cluster', 'PC1', 'PC2']
columns_to_use = [col for col in df.columns if col not in non_feature_columns]
preprocessed_data = df[columns_to_use]

# Verify that the feature count matches what KMeans was fitted on, instead of
# relying on a hard-coded expected shape.
assert preprocessed_data.shape[1] == kmeans.cluster_centers_.shape[1], (
    f"{preprocessed_data.shape[1]} feature columns selected, but KMeans was "
    f"fitted on {kmeans.cluster_centers_.shape[1]}"
)
print("Shape of preprocessed data:", preprocessed_data.shape)

# Step 2: Label the centroid coordinates with the feature names.
# The values are in the scaled space KMeans was fitted on, not in original units.
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=preprocessed_data.columns)

# Step 3: Re-run cluster profile examination (per-cluster feature means)
df['Cluster'] = kmeans.labels_
cluster_profiles = df.groupby('Cluster')[columns_to_use].mean()

# Display the profiles (rich notebook rendering instead of a plain-text print)
cluster_profiles


In [None]:
columns_to_use = [
    'Age', 'Peak_minute_09', 'Weekend_minute_09', 'Offpeak_minute_09',
    'Offpeak_nr_09', 'Peak_nr_09', 'Weekend_nr_09', 'Selfnet_minute_09',
    # Add other relevant columns...
]

# Assuming clusters are already added in 'Cluster' column.
# .copy() gives an independent frame, so adding the PCA columns below neither
# triggers SettingWithCopyWarning nor risks writing into a view of df.
df_clusters = df[columns_to_use + ['Cluster']].copy()

# Step 2: Apply PCA on standardized columns. PCA only centres its input, so
# without scaling the columns with the largest numeric range would dominate.
usage_scaled = StandardScaler().fit_transform(df_clusters[columns_to_use])
pca = PCA(n_components=2)
pca_result = pca.fit_transform(usage_scaled)
df_clusters['PCA1'] = pca_result[:, 0]
df_clusters['PCA2'] = pca_result[:, 1]

# Step 3: Plot the projection, coloured by cluster
plt.figure(figsize=(10, 7))
sns.scatterplot(data=df_clusters, x='PCA1', y='PCA2', hue='Cluster', palette='Set1', alpha=0.7)
plt.title('PCA of Clustered Data')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title='Cluster')
plt.grid(True)
plt.show()

In [None]:
# Assume df is your DataFrame and has the cluster labels added.
# Feature columns only: skip the cluster label and the PC1/PC2 columns that
# this cell itself writes, so re-running the cell does not feed its own
# output back into the PCA.
feature_columns = [col for col in df.columns if col not in ('Cluster', 'PC1', 'PC2')]

# Run PCA on standardized features; PCA only centres its input, so unscaled
# columns with a large numeric range would otherwise dominate the components.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(StandardScaler().fit_transform(df[feature_columns]))

# Add PCA results to DataFrame
df['PC1'] = pca_result[:, 0]
df['PC2'] = pca_result[:, 1]


# Function to plot the scatter plot
def plot_pca(variable):
    """Scatter the customers in PCA space, coloured by one selected feature.

    Parameters
    ----------
    variable : str or number
        Name of a column in ``feature_columns``. A numeric value is still
        accepted and is treated as a positional index into ``feature_columns``.

    The colour encodes the selected feature's value and the marker style
    encodes the cluster, so switching the variable changes the plot.
    """
    # Resolve a numeric selection (e.g. 3.0 from a slider) to a column name
    if not isinstance(variable, str):
        variable = feature_columns[int(variable)]

    plt.figure(figsize=(10, 6))
    sns.scatterplot(
        data=df, x='PC1', y='PC2',
        hue=variable, style='Cluster',
        palette='viridis', alpha=0.6,
    )
    plt.title(f'PCA Scatter Plot - {variable}')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')

    # Set x and y limits to ensure consistency across different variable plots
    plt.xlim(df['PC1'].min() - 1, df['PC1'].max() + 1)
    plt.ylim(df['PC2'].min() - 1, df['PC2'].max() + 1)

    # Dashed reference lines through the origin of the PCA plane
    plt.axhline(y=0, color='k', linestyle='--', lw=0.8)
    plt.axvline(x=0, color='k', linestyle='--', lw=0.8)

    # Display the plot
    plt.show()


# Interact function to update the plot based on the selected variable.
# Passing the list of names makes interact build a dropdown of feature columns,
# replacing the float slider whose values were only positional indices.
interact(plot_pca, variable=feature_columns);


In [None]:
# # Conclusion

# Cluster 0: Moderate Users
#   Age: Average age of around 31.5 years.
#   Usage Pattern: Moderate across all usage categories (peak, weekend, off-peak). They use around 67 minutes during peak times and 61 minutes during off-peak, with a balanced distribution across different times.
#   Voice Services: Moderate engagement with voicemail, around 169 minutes per month. Other mobile minutes are relatively average (85 minutes).
#   SMS: Average SMS usage (45-51 messages across different months).
#   Packages: Most common package is "PACK_B" (39%), with significant usage of "PACK_X" (24%).
#   Gender & Demographics: Predominantly male (64%) and married (59%). A majority own their living condition (78%).
#   Job & Income: High representation among public employees (38%) and laborers (38%). Around 38% have an income in the 30-60k range.
#       Conclusion: Cluster 0 represents moderately active users who have balanced usage across different services. They are typically working-class individuals, predominantly male, and inclined towards packages offering flexibility like "PACK_B."
# 
# Cluster 1: Low Users
#   Age: Slightly older, average age of around 34.7 years.
#   Usage Pattern: Significantly lower usage across all metrics. For instance, peak minutes are just around 12, and off-peak minutes are also low (~9). Voicemail usage is similarly minimal (29 minutes).
#   Voice Services: Very low mobile and fixed-line minutes.
#   SMS: Low SMS usage (~10 messages).
#   Packages: High proportion using "PACK_B" (43%) and "PACK_X" (24%).
#   Gender & Demographics: A bit more evenly distributed by gender (58% male). Higher tendency to be married (66%) and own their residence (85%).
#   Job & Income: Broad range of jobs, with notable representation among public employees (36%) and a smaller proportion of retired individuals (5%). Income tends to be in the 30-60k range (38%) but with a higher-than-average percentage below 15k (14%).
#       Conclusion: Cluster 1 appears to capture low-usage customers. They tend to be slightly older, stable (married, homeowners), and economically varied, with many favoring basic and budget-friendly packages.
# 
# Cluster 2: Heavy Users
#   Age: Similar to Cluster 0, average age is around 31.6 years.
#   Usage Pattern: Significantly higher usage across all categories. For example, 234 peak minutes, 172 off-peak minutes, and voicemail minutes reaching over 500 in a month. This group makes extensive use of their services.
#   Voice Services: Much higher mobile minutes (~327) and frequent use of voicemail (~377).
#   SMS: Heavier SMS users, averaging over 100 messages per month.
#   Packages: Preference for "PACK_X" (37%) and "PACK_B" (24%), with higher diversity across package usage than the other clusters.
#   Gender & Demographics: Predominantly male (73%) and, while married (56%), have a higher proportion of single individuals (32%).
#   Job & Income: Higher presence among public employees (48%) and a lower number of laborers. More diverse income range, with a notable proportion earning over 60k (13%).
#       Conclusion: Cluster 2 represents high-usage customers who frequently use voice and SMS services. These customers are predominantly male, more likely to be single, and opt for packages with broader services like "PACK_X." They show a more diverse economic profile, including higher earners.
# 
# General Observations
#   Age and Usage: The usage levels do not seem to vary dramatically with age, suggesting that service engagement is more lifestyle-driven.
#   Gender Differences: Across clusters, males are predominant, but the level of male dominance is highest among heavy users (Cluster 2).
#   Income: Higher earners are more present in Cluster 2, while Cluster 1 has a mix that includes lower-income groups.
#   Service & Package Preferences: Heavy users lean toward packages that provide extensive coverage or perks, whereas lighter users prefer basic or more economical options.