# Clustering Analysis

This notebook implements K-Means clustering with PCA to segment financial data into meaningful groups. We'll identify natural patterns in the data and characterize different performance profiles.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
import pickle

# Machine learning libraries
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Import custom modules
import sys
import os
sys.path.append(os.path.abspath('../src'))
from ml_utils import plot_elbow_method, plot_pca_components, plot_clusters

# Plot styling: seaborn's whitegrid theme, then the matching matplotlib style sheet
sns.set(style='whitegrid')
plt.style.use('seaborn-v0_8-whitegrid')

# Silence all warnings for the rest of the session
warnings.filterwarnings('ignore')

# Make sure the output folders for models and ML plots are present
for results_dir in ('../results/models', '../results/plots/ml'):
    Path(results_dir).mkdir(parents=True, exist_ok=True)

## 1. Load and Prepare Data

In [None]:
# Read the cleaned dataset from the processed-data folder
cleaned_data_path = '../data/processed/cleaned_data.csv'
df = pd.read_csv(cleaned_data_path)

# Normalise headers: first trim surrounding whitespace ...
df.columns = df.columns.str.strip()

# ... then swap any remaining inner spaces for underscores, reporting each rename
column_mapping = {}
for col in df.columns:
    if ' ' in col:
        column_mapping[col] = col.replace(' ', '_')

if column_mapping:
    df = df.rename(columns=column_mapping)
    print("Columns renamed to replace spaces with underscores:")
    for old, new in column_mapping.items():
        print(f"  '{old}' → '{new}'")

# Financial columns that must be numeric before any ratio can be derived
financial_cols = ['Sales', 'COGS', 'Profit']

# Fail early if the input file does not carry all of them
missing_financial_cols = [col for col in financial_cols if col not in df.columns]
if missing_financial_cols:
    raise ValueError(f"Missing required financial columns: {missing_financial_cols}")

# Convert each financial column to numbers
for col in financial_cols:
    # Columns pandas already parsed as numbers need no text cleaning. Sending them
    # through str() would corrupt values whose repr uses scientific notation
    # (e.g. 1e-05 -> '1-05' once the 'e' is stripped -> NaN -> row silently dropped).
    if pd.api.types.is_numeric_dtype(df[col]):
        continue

    raw_text = df[col].astype(str).str.strip()

    # Accounting notation writes negatives in parentheses, e.g. "$(4,533.75)".
    # Stripping the parentheses alone would flip the sign, so remember which
    # entries are fully parenthesised (an optional non-digit prefix such as a
    # currency symbol is allowed before the opening parenthesis).
    is_parenthesised = raw_text.str.match(r'[^\d(]*\(.*\)\s*$')

    # Remove $, commas, and every other character that cannot be part of a number
    digits = raw_text.str.replace(r'[^0-9.-]', '', regex=True)

    # Prefix a minus sign for parenthesised entries that do not already have one
    keep_as_is = ~is_parenthesised | digits.str.startswith('-')
    digits = digits.where(keep_as_is, '-' + digits)

    # Empty strings become NaN; anything still unparseable is coerced to NaN as well
    df[col] = pd.to_numeric(digits.replace('', np.nan), errors='coerce')

# Drop rows with NaN in financial columns (or fill as needed)
df = df.dropna(subset=financial_cols)

# Sanity checks on the cleaned financial columns: dtypes first, then a small sample
print("\nData Types After Cleaning:", df[financial_cols].dtypes, sep="\n")
print("\nCleaned Financial Columns (First 5 Rows):", df[financial_cols].head(), sep="\n")

# Overview of the whole frame after cleaning: shape, column names, and every dtype
print(f"\nDataset Shape: {df.shape}", f"\nColumns: {df.columns.tolist()}", sep="\n")
print("\nFull Data Types:", df.dtypes, sep="\n")

# Derive the two ratio metrics from the cleaned base figures.
# A zero denominator produces +/-inf (or NaN for 0/0); both are mapped to 0.
df['ROA'] = (df['Profit'] / df['COGS']).replace([np.inf, -np.inf], np.nan).fillna(0)
df['Profit_Margin'] = (df['Profit'] / df['Sales']).replace([np.inf, -np.inf], np.nan).fillna(0)

# Descriptive statistics for the base figures and the derived ratios
print(
    "\nSummary Statistics for Key Metrics:",
    df[['Sales', 'COGS', 'Profit', 'ROA', 'Profit_Margin']].describe(),
    sep="\n",
)

# Persist the processed frame so later steps can start from it
processed_data_path = '../data/processed/ready_for_analysis.csv'
df.to_csv(processed_data_path, index=False)
print(f"\nProcessed data saved to: {processed_data_path}")

In [None]:
# Recompute both ratio columns (Profit over Sales, Profit over COGS).
# Infinite results from a zero denominator, and NaN results, are replaced by 0.
for ratio_name, denominator in (('Profit_Margin', 'Sales'), ('ROA', 'COGS')):
    ratio = df['Profit'] / df[denominator]
    df[ratio_name] = ratio.replace([np.inf, -np.inf], np.nan).fillna(0)

# Show the column index so the presence of both ratio columns can be checked
print("Columns after adding ROA and Profit_Margin:", df.columns, sep="\n")

## 2. Feature Selection for Clustering

In [None]:
# Features used for clustering
# Numeric features
numeric_features = ['Sales', 'COGS', 'Profit', 'ROA', 'Profit_Margin']

# Categorical feature
categorical_features = ['Segment']

# Verify that every selected feature exists before slicing the dataframe.
# Each missing column is reported, then the cell stops with an explicit KeyError
# (the same exception type the column selection below would raise) instead of
# warning and then failing anyway with a less informative message.
missing_features = [
    feature for feature in numeric_features + categorical_features
    if feature not in df.columns
]
for feature in missing_features:
    print(f"Warning: {feature} not found in the dataframe.")
if missing_features:
    raise KeyError(f"Cannot build clustering data; missing columns: {missing_features}")

# Independent copy of the selected columns so later steps never touch df by accident
clustering_data = df[numeric_features + categorical_features].copy()

# Display the first few rows
print("Clustering Data:")
print(clustering_data.head())

## 3. Preprocessing for Clustering

In [None]:
# Preprocessing: standardise the numeric features and one-hot encode the categorical ones.
# The column lists are the ones defined in the feature-selection cell, so the two
# cells cannot drift apart when features are added or removed.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    # OneHotEncoder emits a sparse matrix; ColumnTransformer would return a sparse
    # stack whenever the overall density drops below its threshold (default 0.3).
    # A threshold of 0 always yields a dense array, which the PCA step expects.
    sparse_threshold=0,
)

# Fit the transformers and transform the clustering data in one pass
X_preprocessed = preprocessor.fit_transform(clustering_data)

# Recover column names for the transformed matrix: numeric names are unchanged,
# categorical names come from the fitted encoder
ohe = preprocessor.named_transformers_['cat']
cat_feature_names = ohe.get_feature_names_out(categorical_features)
feature_names = numeric_features + list(cat_feature_names)

# Display preprocessing results
print(f"Shape after preprocessing: {X_preprocessed.shape}")
print(f"Number of features: {len(feature_names)}")

## 4. Apply PCA for Dimensionality Reduction

In [None]:
# Cell 4: project the preprocessed features onto principal components.
# A fractional n_components asks PCA to keep just enough components to reach
# that share of explained variance (here 95%).
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_preprocessed)

# Report the reduced shape and how much variance each kept component explains
print(
    f"Shape after PCA: {X_pca.shape}",
    f"Number of components: {pca.n_components_}",
    f"Explained variance ratio: {pca.explained_variance_ratio_}",
    f"Total explained variance: {sum(pca.explained_variance_ratio_):.2f}",
    sep="\n",
)

# Explained-variance chart, drawn by the project helper and written to the plots folder
plot_pca_components(pca, '../results/plots/ml/pca_explained_variance.png')

## 5. Determine Optimal Number of Clusters

In [None]:
# Never ask for more clusters than there are samples; 15 is the upper limit otherwise
max_clusters = min(X_pca.shape[0], 15)
output_path = '../results/plots/ml/kmeans_elbow_method.png'

# Elbow chart from the project helper; its result is not used to pick k below
plot_elbow_method(
    X_pca,
    output_path=output_path,
    random_state=42,
    max_clusters=max_clusters,
)

# k is fixed at 2 (based on prior analysis) rather than derived from the chart
optimal_k = 2

print(f"Optimal number of clusters (default): {optimal_k}")

## 6. Apply K-Means Clustering

In [None]:
# Fit K-Means on the PCA scores; the fitted model exposes one cluster id per row
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
cluster_labels = kmeans.fit(X_pca).labels_

# Attach the cluster ids to the original dataframe (assigned positionally, row by row)
df['Cluster'] = cluster_labels

# How many rows landed in each cluster
print("Cluster distribution:", df['Cluster'].value_counts(), sep="\n")

In [None]:
# Visualize clusters in 2D space (using first two PCA components).
# plot_clusters is a project helper imported from ml_utils; the third argument is
# presumably the path the figure is written to — confirm against ml_utils.
plot_clusters(X_pca, cluster_labels, '../results/plots/ml/kmeans_clusters.png')

## 7. Analyze Cluster Characteristics

In [None]:
# Per-cluster profile: the mean of every financial metric plus the number of rows,
# ordered from the most to the least profitable cluster
cluster_analysis = (
    df.groupby('Cluster')
    .agg({
        **{metric: 'mean' for metric in ('Sales', 'COGS', 'Profit', 'ROA', 'Profit_Margin')},
        'Cluster': 'count',
    })
    .rename(columns={'Cluster': 'Count'})
    .sort_values('Profit', ascending=False)
)

print("Cluster characteristics:")
cluster_analysis

In [None]:
# One bar chart per metric, laid out on a 2x3 grid (the sixth slot stays empty)
fig = plt.figure(figsize=(14, 10))

metrics = ['Sales', 'COGS', 'Profit', 'ROA', 'Profit_Margin']
for position, metric in enumerate(metrics, start=1):
    ax = fig.add_subplot(2, 3, position)
    sns.barplot(x='Cluster', y=metric, data=df, ax=ax)
    ax.set_title(f'Average {metric} by Cluster')
    ax.grid(axis='y')

# Tidy the spacing, write the figure to disk, then render it in the notebook
fig.tight_layout()
fig.savefig('../results/plots/ml/cluster_characteristics.png')
plt.show()

In [None]:
# Share of each segment's rows that falls into each cluster, as percentages
# (normalize='index' makes every segment row sum to 100)
segment_cluster = pd.crosstab(df['Segment'], df['Cluster'], normalize='index') * 100

# Create the figure and axes explicitly and hand the axes to pandas. Without ax=,
# DataFrame.plot opens its own default-sized figure, leaving a blank 14x8 figure
# behind and saving the chart at the wrong size.
fig, ax = plt.subplots(figsize=(14, 8))
segment_cluster.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Segment Distribution Across Clusters')
ax.set_xlabel('Segment')
ax.set_ylabel('Percentage (%)')
ax.legend(title='Cluster')
ax.grid(axis='y')
fig.tight_layout()
fig.savefig('../results/plots/ml/segment_cluster_distribution.png')
plt.show()

## 8. Assign Meaningful Cluster Labels

In [None]:
# Assign meaningful labels to clusters based on their characteristics
def assign_cluster_labels(cluster_analysis):
    """Build a descriptive "<level> Profit/<level> ROA" label for every cluster.

    Clusters are ranked separately by their ``Profit`` and ``ROA`` values
    (highest first). A cluster's level for a metric is "High" when its rank is
    in the first third, "Low" when it is in the last third, and "Medium"
    otherwise. When there are at least two clusters the bottom-ranked one is
    always "Low"; this only matters for exactly two clusters, where the plain
    thirds rule would call the worse cluster "Medium" and never produce "Low".

    Args:
        cluster_analysis: DataFrame indexed by cluster id that contains at
            least the columns ``'Profit'`` and ``'ROA'``.

    Returns:
        dict mapping each cluster id (in index order) to its label string.

    Raises:
        KeyError: if ``'Profit'`` or ``'ROA'`` is not a column.

    Note:
        Labels are not guaranteed to be unique: with four or more clusters two
        clusters can receive the same label.
    """
    n_clusters = len(cluster_analysis.index)

    def _ranks(metric):
        # Map each cluster id to its 0-based rank for `metric` (0 = highest value).
        ordered = cluster_analysis.sort_values(metric, ascending=False).index
        return {cluster: rank for rank, cluster in enumerate(ordered)}

    def _level(rank):
        # Translate a 0-based rank into "High" / "Medium" / "Low".
        if rank < n_clusters / 3:
            return "High"
        is_bottom = n_clusters > 1 and rank == n_clusters - 1
        if is_bottom or rank >= 2 * n_clusters / 3:
            return "Low"
        return "Medium"

    profit_rank = _ranks('Profit')
    roa_rank = _ranks('ROA')

    return {
        cluster: f"{_level(profit_rank[cluster])} Profit/{_level(roa_rank[cluster])} ROA"
        for cluster in cluster_analysis.index
    }

# Derive one descriptive label per cluster id from the cluster profile table
cluster_labels_dict = assign_cluster_labels(cluster_analysis)

print("Cluster labels:")
for cluster_id, descriptive_label in cluster_labels_dict.items():
    print(f"Cluster {cluster_id}: {descriptive_label}")

# Translate every row's numeric cluster id into its descriptive label
df['Cluster_Label'] = df['Cluster'].map(cluster_labels_dict)

# Row counts per descriptive label
print("\nLabeled cluster distribution:", df['Cluster_Label'].value_counts(), sep="\n")

In [None]:
# Bundle every fitted stage plus the label mapping so the whole pipeline
# can be restored from a single file
clustering_model = dict(
    preprocessor=preprocessor,
    pca=pca,
    kmeans=kmeans,
    cluster_labels=cluster_labels_dict,
)

# Serialise the bundle into the models folder
with open('../results/models/clustering_model.pkl', 'wb') as model_file:
    pickle.dump(clustering_model, model_file)

print("Clustering model saved.")

## 9. Cluster Insights and Recommendations

In [None]:
# Same profile as before, grouped by descriptive label instead of numeric id:
# metric means plus row count, most profitable label first
labeled_cluster_analysis = (
    df.groupby('Cluster_Label')
    .agg({
        **{metric: 'mean' for metric in ('Sales', 'COGS', 'Profit', 'ROA', 'Profit_Margin')},
        'Cluster_Label': 'count',
    })
    .rename(columns={'Cluster_Label': 'Count'})
    .sort_values('Profit', ascending=False)
)

print("Labeled cluster characteristics:")
labeled_cluster_analysis

### Clustering Insights:

Based on the clustering analysis, we identified distinct financial performance clusters, each with unique characteristics. Key insights include:

- **Cluster Characteristics**:
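  - **High Profit / High ROA Cluster**: Represents top-performing segments with exceptional profitability and a high ROA (computed in this notebook as Profit / COGS, i.e. a return on cost rather than a balance-sheet return on assets).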
  - **Low Profit / Low ROA Cluster**: Indicates areas that require immediate attention and strategic intervention.

- **Segment-Specific Performance**:
  - Certain segments show a strong presence in high-performing clusters, suggesting segment-specific advantages.

#### Recommendations Based on Clustering:
1. **Strategic Focus on High-Potential Segments**:
   - Allocate more resources to segments that appear frequently in the "High Profit / High ROA" cluster.

2. **Targeted Interventions for Underperforming Units**:
   - Develop specific strategies for entities in the "Low Profit / Low ROA" cluster, focusing on improving key metrics.

3. **Performance Benchmarking**:
   - Establish cluster-based benchmarks for different business units, recognizing that different segments may have different performance profiles.

4. **Segment-Specific Strategies**:
   - Develop tailored approaches for each segment based on their cluster distribution patterns.

5. **Cross-Segment Learning**:
   - Identify best practices from segments that consistently appear in high-performing clusters and apply them to underperforming segments.


## 10. Save Results for Further Analysis

In [None]:
# The reports folder is not created anywhere earlier in the notebook (only the
# models and plots folders are), so create it here; otherwise the summary export
# below fails on a fresh checkout.
reports_dir = Path('../results/reports')
reports_dir.mkdir(parents=True, exist_ok=True)

# Save the dataframe with cluster labels for use in other notebooks
df.to_csv('../data/processed/clustered_data.csv', index=False)

# Save cluster analysis summary (index kept: it holds the cluster label)
labeled_cluster_analysis.to_csv(reports_dir / 'cluster_analysis_summary.csv')

print("Results saved for further analysis.")