# Product Performance Analysis
## Machine Learning Assignment

This notebook presents a comprehensive analysis of supermarket product sales data using:
- **K-means Clustering** (implemented from scratch)
- **Regression Models** (Linear and Polynomial)

---


## 1. Data Overview and Loading


In [None]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

# Make the project's helper modules importable: the parent directory of the
# current working directory is appended to the module search path.
# NOTE(review): assumes preprocessing/kmeans/regression live in that parent
# directory — confirm against the repository layout.
working_dir = os.path.abspath('')
sys.path.append(os.path.dirname(working_dir))

from preprocessing import *
from kmeans import *
from regression import *

# Use the first seaborn style sheet this matplotlib version knows about;
# if neither name is available, fall back to matplotlib's default style.
for style_name in ('seaborn-v0_8', 'seaborn'):
    try:
        plt.style.use(style_name)
    except OSError:
        continue
    break
else:
    plt.style.use('default')
sns.set_palette("husl")

# Read the raw product sales data and show a quick preview.
df = load_data('../data/product_sales.csv')
print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:")
df.head()


In [None]:
# Quick structural overview of the raw data: row count, column names and
# dtypes, followed by summary statistics as the cell's displayed output.
n_products = len(df)
column_names = df.columns.tolist()

print("Dataset Information:")
print(f"Total products: {n_products}")
print("\nFeatures:")
print(column_names)
print("\nData types:")
print(df.dtypes)
print("\nBasic Statistics:")
df.describe()


## 2. Data Preprocessing

### 2.1 Missing Value Analysis


In [None]:
# Summarise missing data: overall count first, then the per-column count and
# per-column percentage reported by analyze_missing_values.
missing_info = analyze_missing_values(df)

print("Missing Value Analysis:")
print(f"Total missing values: {missing_info['total_missing']}")

per_column_reports = (
    ("Missing values by column:", 'missing_by_column'),
    ("Missing percentage by column:", 'missing_percentage'),
)
for heading, key in per_column_reports:
    print(f"\n{heading}")
    print(missing_info[key])


### 2.2 Handle Missing Values

**Strategy:**
- For missing product names: Fill with category-based placeholder
- For missing numerical values: Use mean imputation


In [None]:
# Fill in missing values (the 'mean' strategy is passed to the helper) and
# confirm how many gaps are left in the resulting frame.
df_cleaned = handle_missing_values(df, strategy='mean')

print("After handling missing values:")
remaining_missing = df_cleaned.isnull().sum().sum()
print(f"Remaining missing values: {remaining_missing}")


### 2.3 Outlier Detection and Normalization

**Outlier Strategy:** Cap outliers at IQR bounds (preserves data while handling extremes)

**Normalization:** Standardization (Z-score) — needed because K-means is distance-based, so features measured on larger scales would otherwise dominate the clustering


In [None]:
# Features used for clustering; outlier handling covers these plus the
# 'profit' column.
clustering_features = ['price', 'cost', 'units_sold', 'promotion_frequency', 'shelf_level']
numerical_cols = clustering_features + ['profit']

# Flag outliers with the IQR helper, then cap them (method='cap').
outliers_iqr = detect_outliers_iqr(df_cleaned, numerical_cols)
df_processed = handle_outliers(df_cleaned, outliers_iqr, method='cap')

# Standardise the clustering features; the returned scaler is kept so results
# can be mapped back to the original units later.
df_normalized, scaler = normalize_features(
    df_processed,
    clustering_features,
    method='standardize',
)

print("Preprocessing complete!")
print(f"Processed shape: {df_processed.shape}")


## 3. K-means Clustering Analysis

### 3.1 Elbow Method for Optimal K


In [None]:
# Prepare data for clustering
X_cluster = df_normalized[clustering_features].values

# Run elbow method over the candidate cluster counts
k_range = range(2, 9)
elbow_results = elbow_method(X_cluster, k_range, max_iters=100, random_state=42)
# Copy into plain lists so the positional indexing below does not depend on
# the sequence type elbow_method returns.
k_values = list(elbow_results['k_values'])
wcss_values = list(elbow_results['wcss_values'])

# Determine optimal k at the "elbow": the k where the WCSS improvement slows
# down the most, i.e. the largest discrete second difference
# WCSS[i-1] - 2*WCSS[i] + WCSS[i+1]. Simply taking the largest single decrease
# tends to select the second candidate k every time, because the first step
# of the curve is usually the steepest.
if len(wcss_values) >= 3:
    curvature = [
        wcss_values[i - 1] - 2 * wcss_values[i] + wcss_values[i + 1]
        for i in range(1, len(wcss_values) - 1)
    ]
    # curvature[j] belongs to position j + 1 in k_values / wcss_values.
    elbow_idx = curvature.index(max(curvature)) + 1
else:
    # Too few candidates to measure curvature: fall back to the last k tried,
    # which is always a member of k_values (a hard-coded default might not be).
    elbow_idx = len(k_values) - 1
optimal_k = k_values[elbow_idx]

# Plot elbow curve and highlight the selected k
plt.figure(figsize=(10, 6))
plt.plot(k_values, wcss_values, marker='o', linewidth=2, markersize=8)
plt.plot(optimal_k, wcss_values[elbow_idx], 'ro', markersize=12, label=f'Optimal k={optimal_k}')
plt.xlabel('Number of Clusters (k)', fontsize=12, fontweight='bold')
plt.ylabel('Within-Cluster Sum of Squares (WCSS)', fontsize=12, fontweight='bold')
plt.title('Elbow Method for Optimal K', fontsize=14, fontweight='bold')
plt.grid(True, alpha=0.3)
plt.xticks(k_values)
plt.legend()
plt.tight_layout()
plt.show()

print(f"Optimal k: {optimal_k}")


### 3.2 K-means Clustering and Analysis


In [None]:
# Fit K-means with the k chosen by the elbow analysis and attach each
# product's cluster label to the processed (un-normalised) frame.
labels, centroids, final_wcss, iterations = kmeans(
    X_cluster,
    optimal_k,
    max_iters=100,
    init_method='kmeans++',
    random_state=42,
)
df_processed['cluster'] = labels

# Per-cluster summary: size plus the mean of a few key columns.
# Maps the report column name to the source column it averages.
averaged_columns = {
    'Avg Price': 'price',
    'Avg Units Sold': 'units_sold',
    'Avg Profit': 'profit',
    'Avg Promotion Frequency': 'promotion_frequency',
}

cluster_stats = []
for cluster_id in range(optimal_k):
    members = df_processed[df_processed['cluster'] == cluster_id]
    summary_row = {'Cluster': cluster_id, 'Count': len(members)}
    for report_name, source_col in averaged_columns.items():
        summary_row[report_name] = members[source_col].mean()
    cluster_stats.append(summary_row)

cluster_df = pd.DataFrame(cluster_stats)
print("Cluster Statistics:")
print(cluster_df.round(2))


### 3.3 Cluster Visualization


In [None]:
# Scatter plot of the clusters in the price / units-sold plane, with the
# centroids overlaid in original (un-normalised) units.
fig, ax = plt.subplots(figsize=(12, 8))
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#98D8C8', '#F7DC6F', '#BB8FCE']

for cluster_id in range(optimal_k):
    cluster_data = df_processed[df_processed['cluster'] == cluster_id]
    # Cycle through the palette: the elbow search can select more clusters
    # than there are colours, which would otherwise raise IndexError.
    ax.scatter(cluster_data['price'], cluster_data['units_sold'],
              c=colors[cluster_id % len(colors)], label=f"Cluster {cluster_id}",
              s=100, alpha=0.6, edgecolors='black', linewidth=1)

# Mark centroids: map them back through the scaler so they share the axes'
# units, then pick out the two plotted features by their column position.
centroids_original = scaler.inverse_transform(centroids)
centroid_prices = centroids_original[:, clustering_features.index('price')]
centroid_units = centroids_original[:, clustering_features.index('units_sold')]
ax.scatter(centroid_prices, centroid_units, c='red', marker='X', s=300,
          label='Centroids', edgecolors='black', linewidth=2, zorder=10)

ax.set_xlabel('Price ($)', fontsize=12, fontweight='bold')
ax.set_ylabel('Units Sold', fontsize=12, fontweight='bold')
ax.set_title('K-means Clustering Results: Price vs Units Sold', fontsize=14, fontweight='bold')
ax.legend(loc='best', fontsize=10)
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()


## 4. Regression Analysis

### 4.1 Model Training and Evaluation


In [None]:
# Train and compare a linear and a degree-2 polynomial regression that
# predict profit. train_test_split comes from the notebook's import cell.

# Prepare data: build the feature matrix / target and hold out 30% for testing
feature_cols = ['price', 'cost', 'units_sold', 'promotion_frequency', 'shelf_level']
target_col = 'profit'
X_reg, y_reg = prepare_regression_data(df_processed, feature_cols, target_col)
X_train, X_test, y_train, y_test = train_test_split(X_reg, y_reg, test_size=0.3, random_state=42)

# Train models
linear_model = train_linear_regression(X_train, y_train)
linear_pred_test = linear_model.predict(X_test)

# The polynomial model needs its fitted transformer applied to the test
# features before predicting.
poly_model, poly_transformer = train_polynomial_regression(X_train, y_train, degree=2)
poly_pred_test = poly_model.predict(poly_transformer.transform(X_test))

# Evaluate both models on the held-out test set
linear_metrics = evaluate_model(y_test, linear_pred_test)
poly_metrics = evaluate_model(y_test, poly_pred_test)

print("Model Performance:")
print(f"Linear Regression - R²: {linear_metrics['R2']:.3f}, RMSE: {linear_metrics['RMSE']:.2f}")
print(f"Polynomial Regression - R²: {poly_metrics['R2']:.3f}, RMSE: {poly_metrics['RMSE']:.2f}")


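### 4.2 Regression Visualizations

The actual-vs-predicted plots for both models are produced by the "Actual vs Predicted plots" code cell that follows the summary section.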


## 5. Summary and Key Findings


In [None]:
# Text report that recaps preprocessing, clustering and regression results.
banner = "=" * 80

print(banner)
print("KEY FINDINGS SUMMARY")
print(banner)

print("\n1. DATA PREPROCESSING:")
preprocessing_steps = (
    "Handled missing values using appropriate strategies",
    "Detected and capped outliers using IQR method",
    "Standardized features for K-means clustering",
)
for step in preprocessing_steps:
    print(f"   - {step}")

print("\n2. CLUSTERING ANALYSIS:")
print(f"   - Optimal number of clusters: {optimal_k}")
print(f"   - Identified {optimal_k} distinct product groups")
for cluster_label, cluster_size in zip(cluster_df['Cluster'], cluster_df['Count']):
    print(f"     • Cluster {int(cluster_label)}: {int(cluster_size)} products")

print("\n3. REGRESSION ANALYSIS:")
# The model with the strictly lower test RMSE wins; on a tie the polynomial
# model is reported.
if linear_metrics['RMSE'] < poly_metrics['RMSE']:
    best_model, best_metrics = "Linear Regression", linear_metrics
else:
    best_model, best_metrics = "Polynomial Regression", poly_metrics
print(f"   - Best model: {best_model}")
print(f"   - Test R²: {best_metrics['R2']:.3f}")
print(f"   - Test RMSE: ${best_metrics['RMSE']:.2f}")

print(f"\n{banner}")
print("Analysis complete!")
print(banner)


In [None]:
# Actual vs Predicted plots: one panel per model, each with the y = x
# "perfect prediction" reference line.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# (panel title, test predictions, test metrics, extra scatter styling)
panels = [
    ('Linear Regression', linear_pred_test, linear_metrics, {}),
    ('Polynomial Regression', poly_pred_test, poly_metrics, {'color': 'green'}),
]

for ax, (model_name, predictions, metrics, scatter_style) in zip(axes, panels):
    ax.scatter(y_test, predictions, alpha=0.6, s=80, edgecolors='black',
               linewidth=0.5, **scatter_style)
    # Diagonal spanning the observed target range.
    ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()],
            'r--', lw=2, label='Perfect Prediction')
    ax.set_xlabel('Actual Profit ($)', fontsize=12, fontweight='bold')
    ax.set_ylabel('Predicted Profit ($)', fontsize=12, fontweight='bold')
    ax.set_title(f'{model_name}\nR² = {metrics["R2"]:.3f}, RMSE = {metrics["RMSE"]:.2f}',
                 fontsize=12, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.suptitle('Actual vs Predicted Profit Comparison', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()
