In [None]:
from pandas import read_csv
import pandas as pd

# Both CSV exports are decoded as plain ASCII (the encoding detected for
# these files).
csv_encoding = 'ascii'
train_csv_path = '/content/srsstat_train_data.csv'
test_csv_path = '/content/srsstat_test_data.csv'

train_data = pd.read_csv(train_csv_path, encoding=csv_encoding)
test_data = pd.read_csv(test_csv_path, encoding=csv_encoding)

In [None]:
# Number of training rows carrying each 'cluster' label (the target used below).
train_data.value_counts('cluster')

Unnamed: 0_level_0,count
cluster,Unnamed: 1_level_1
1,7202
0,6913
2,6885


In [None]:
# Print column names, non-null counts and dtypes of the training frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21000 entries, 0 to 20999
Data columns (total 87 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   customer_id                21000 non-null  int64  
 1   age                        21000 non-null  int64  
 2   gender                     21000 non-null  object 
 3   income_bracket             21000 non-null  object 
 4   loyalty_program            21000 non-null  object 
 5   membership_years           21000 non-null  int64  
 6   churned                    21000 non-null  object 
 7   marital_status             21000 non-null  object 
 8   number_of_children         21000 non-null  int64  
 9   education_level            21000 non-null  object 
 10  occupation                 21000 non-null  object 
 11  transaction_id             21000 non-null  int64  
 12  product_id                 21000 non-null  int64  
 13  product_category           21000 non-null  obj

# Decision Tree - Without Balancing - All Features

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score  # Added cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, confusion_matrix
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# This cell relies on the frames loaded by the data-loading cell; stop early
# with a clear message if either one is missing from the kernel namespace.
if not ('train_data' in globals() and 'test_data' in globals()):
    raise ValueError("train_data or test_data not found! Ensure previous steps are executed.")

# Attributes handled as categorical (one-hot encoded by the preprocessor).
categorical_candidates = [
    'last_purchase_month', 'promotion_end_month', 'product_manufacture_month',
    'month_of_year', 'product_expiry_date_year', 'product_manufacture_year',
    'transaction_year', 'product_expiry_date_month', 'transaction_month',
    'high_value_purchase', 'week_of_year', 'promotion_start_month', 'day_of_week',
    'purchase_frequency', 'customer_city', 'gender', 'weekend', 'store_state',
    'email_subscriptions', 'store_location', 'high_value_quantity',
]

# Attributes handled as numeric (imputed and scaled by the preprocessor).
numeric_candidates = [
    'customer_support_calls', 'product_review_count', 'days_since_last_purchase',
    'online_purchases', 'distance_to_store', 'product_rating', 'total_transactions',
    'product_weight', 'total_items_purchased', 'unit_price', 'total_returned_value',
    'membership_years', 'discount_applied', 'avg_discount_used', 'product_shelf_life',
    'total_returned_items', 'transaction_hour', 'min_single_purchase_value',
    'number_of_children', 'product_stock', 'avg_purchase_value',
    'avg_items_per_transaction', 'website_visits', 'age', 'max_single_purchase_value',
    'avg_spent_per_category', 'total_discounts_received', 'product_return_rate',
    'avg_transaction_value', 'in_store_purchases',
]

# Keep only the candidates present in train_data (categorical first, then
# numeric); anything missing from the frame is dropped silently.
selected_features = [
    col for col in categorical_candidates + numeric_candidates
    if col in train_data.columns
]
print("Selected features for Decision Tree:", selected_features)

# Features (X) and target (y) for the train and test sets
X_train = train_data[selected_features]
y_train = train_data['cluster']
X_test = test_data[selected_features]
y_test = test_data['cluster']

# Partition the surviving features into the two groups the preprocessor expects
categorical_cols = [col for col in selected_features if col in categorical_candidates]
numeric_cols = [col for col in selected_features if col not in categorical_cols]

# Numeric branch: fill gaps with the column mean, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode into a dense array.
# NOTE(review): with drop='first' and handle_unknown='ignore' together, an
# unseen category is encoded as all zeros, which is the same encoding as the
# dropped first category - confirm this is acceptable.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)),
])

# Route each column group through its own branch
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Full model: preprocessing followed by a seeded Decision Tree
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42)),
])

# Hyperparameter tuning using GridSearchCV: exhaustive search over tree depth
# and the split/leaf size limits, scored by 5-fold cross-validated accuracy.
param_grid = {
    'classifier__max_depth': [None, 10, 15, 20, 30],
    'classifier__min_samples_split': [2, 5, 10, 15],
    'classifier__min_samples_leaf': [1, 2, 4, 8],
    'classifier__criterion': ['gini']
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best hyperparameters
best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")

# GridSearchCV uses refit=True by default, so best_estimator_ is already
# fitted on the complete training set with the winning parameters. Calling
# fit() on it again would only repeat that work.
best_model = grid_search.best_estimator_

# Predictions on test set
y_pred = best_model.predict(X_test)

# Cluster labels seen in training, in sorted order. This list fixes the
# row/column order of every per-class metric below.
unique_clusters = sorted(y_train.unique())
target_names = [f'Cluster {i}' for i in unique_clusters]

# Classification Report and Metrics
# labels= pins the report to the training label set, so target_names always
# has the matching length even if a cluster is absent from y_test / y_pred.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=unique_clusters, target_names=target_names))

# Confusion Matrix
# The same labels= keeps row/column k of the matrix aligned with
# unique_clusters[k], which the specificity calculation indexes by position.
cm = confusion_matrix(y_test, y_pred, labels=unique_clusters)
print("\nConfusion Matrix:")
print(cm)

# Accuracy, Recall, Precision (recall/precision are support-weighted averages)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')

# Calculate Specificity for each cluster
def calculate_specificity(class_idx, conf_matrix=None):
    """Return the one-vs-rest specificity, TN / (TN + FP), for one class.

    Parameters
    ----------
    class_idx : int
        Row/column position of the class in the confusion matrix.
    conf_matrix : 2-D array, optional
        Square confusion matrix (rows = true class, columns = predicted
        class). When omitted, the notebook-level ``cm`` is used, which keeps
        existing ``calculate_specificity(idx)`` calls working.

    Returns
    -------
    float
        The specificity, or 0.0 when the class has no negatives at all
        (TN + FP == 0).
    """
    matrix = cm if conf_matrix is None else conf_matrix
    true_positives = matrix[class_idx, class_idx]
    predicted_as_class = matrix[:, class_idx].sum()
    actually_class = matrix[class_idx, :].sum()
    # Everything that is neither in the class's row nor in its column; the
    # diagonal cell is subtracted twice above, so it is added back once.
    tn = matrix.sum() - predicted_as_class - actually_class + true_positives
    # Predicted as this class but truly belonging to another one
    fp = predicted_as_class - true_positives
    negatives = tn + fp
    return tn / negatives if negatives > 0 else 0.0

# Specificity per cluster; a cluster's position in unique_clusters is its
# index into the confusion matrix.
specificity_scores = dict(
    (f'Cluster {label}', calculate_specificity(position))
    for position, label in enumerate(unique_clusters)
)

# Headline test-set metrics
print(f"\nAccuracy: {accuracy:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")
for cluster, spec in specificity_scores.items():
    print(f"Specificity for {cluster}: {spec:.4f}")

print("-----------------------------------------------")

# --- Cross-Validation on Full Dataset ---
# Stack train on top of test (row-wise) so the tuned pipeline can be
# re-evaluated with 5-fold cross-validation over every available row.
X_full = pd.concat([X_train, X_test])
y_full = pd.concat([y_train, y_test])

cv_scores = cross_val_score(best_model, X_full, y_full, cv=5, scoring='accuracy', n_jobs=-1)
cv_mean = cv_scores.mean()
cv_std = cv_scores.std()
print("\nCross-Validation Results (5-fold):")
print(f"Mean Accuracy: {cv_mean:.4f}")
print(f"Standard Deviation: {cv_std:.4f}")
print(f"Individual Fold Scores: {cv_scores}")


Selected features for Decision Tree: ['last_purchase_month', 'promotion_end_month', 'product_manufacture_month', 'month_of_year', 'product_expiry_date_year', 'product_manufacture_year', 'transaction_year', 'product_expiry_date_month', 'transaction_month', 'high_value_purchase', 'week_of_year', 'promotion_start_month', 'day_of_week', 'purchase_frequency', 'customer_city', 'gender', 'weekend', 'store_state', 'email_subscriptions', 'store_location', 'high_value_quantity', 'customer_support_calls', 'product_review_count', 'days_since_last_purchase', 'online_purchases', 'distance_to_store', 'product_rating', 'total_transactions', 'product_weight', 'total_items_purchased', 'unit_price', 'total_returned_value', 'membership_years', 'discount_applied', 'avg_discount_used', 'product_shelf_life', 'total_returned_items', 'transaction_hour', 'min_single_purchase_value', 'number_of_children', 'product_stock', 'avg_purchase_value', 'avg_items_per_transaction', 'website_visits', 'age', 'max_single_pur

# Decision Tree - Without Balancing - Selected Features

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFECV
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np

# This cell relies on the frames loaded by the data-loading cell; stop early
# with a clear message if either one is missing from the kernel namespace.
if not ('train_data' in globals() and 'test_data' in globals()):
    raise ValueError("train_data or test_data not found! Ensure previous steps are executed.")

# Attributes handled as categorical (one-hot encoded by the preprocessor).
categorical_candidates = [
    'last_purchase_month', 'promotion_end_month', 'product_manufacture_month',
    'month_of_year', 'product_expiry_date_year', 'product_manufacture_year',
    'transaction_year', 'product_expiry_date_month', 'transaction_month',
    'high_value_purchase', 'week_of_year', 'promotion_start_month', 'day_of_week',
    'purchase_frequency', 'customer_city', 'gender', 'weekend', 'store_state',
    'email_subscriptions', 'store_location', 'high_value_quantity',
]

# Attributes handled as numeric (imputed and scaled by the preprocessor).
numeric_candidates = [
    'customer_support_calls', 'product_review_count', 'days_since_last_purchase',
    'online_purchases', 'distance_to_store', 'product_rating', 'total_transactions',
    'product_weight', 'total_items_purchased', 'unit_price', 'total_returned_value',
    'membership_years', 'discount_applied', 'avg_discount_used', 'product_shelf_life',
    'total_returned_items', 'transaction_hour', 'min_single_purchase_value',
    'number_of_children', 'product_stock', 'avg_purchase_value',
    'avg_items_per_transaction', 'website_visits', 'age', 'max_single_purchase_value',
    'avg_spent_per_category', 'total_discounts_received', 'product_return_rate',
    'avg_transaction_value', 'in_store_purchases',
]

# Keep only the candidates present in train_data (categorical first, then
# numeric); anything missing from the frame is dropped silently.
selected_features = [
    col for col in categorical_candidates + numeric_candidates
    if col in train_data.columns
]
print("Initial selected features:", selected_features)

# Features (X) and target (y) for the train and test sets
X_train = train_data[selected_features]
y_train = train_data['cluster']
X_test = test_data[selected_features]
y_test = test_data['cluster']

# Partition the surviving features into the two groups the preprocessor expects
categorical_cols = [col for col in selected_features if col in categorical_candidates]
numeric_cols = [col for col in selected_features if col not in categorical_cols]


# Numeric branch: mean imputation followed by standardisation.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: mode imputation followed by dense one-hot encoding.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# RFECV needs an all-numeric matrix, so encode the data up front: the
# preprocessor is fitted on the training rows and then applied to the test rows.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Names of the columns produced by the preprocessor
feature_names = preprocessor.get_feature_names_out()

# Recursive feature elimination with 3-fold CV, removing one transformed
# column per step, driven by a seeded Decision Tree.
elimination_tree = DecisionTreeClassifier(random_state=42)
rfecv = RFECV(estimator=elimination_tree, step=1, cv=3, scoring='accuracy', n_jobs=-1)
rfecv.fit(X_train_transformed, y_train)

# Boolean mask over the transformed columns that RFECV kept
selected_mask = rfecv.support_
selected_transformed_features = feature_names[selected_mask].tolist()
n_selected = len(selected_transformed_features)
print(f"Number of Features Selected by RFECV: {n_selected}")
print(f"Selected Transformed Features: {selected_transformed_features}")

# Map transformed features back to original features.
# One-hot encoding creates several columns per categorical feature, so the
# selected transformed columns are collapsed to unique original column names.
def _original_feature_name(transformed_feature, categorical_features):
    """Return the source column behind one preprocessor output column.

    Numeric outputs are named ``num__<column>``; one-hot outputs are named
    ``cat__<column>_<category>``. Both the column name and the category may
    contain underscores, so the categorical case is resolved by prefix-matching
    against the known categorical column names instead of splitting on '_'
    (which would turn 'last_purchase_month_3' into 'last').

    Parameters
    ----------
    transformed_feature : str
        A name taken from ``preprocessor.get_feature_names_out()``.
    categorical_features : list of str
        The categorical columns that were fed to the one-hot encoder.

    Returns
    -------
    str
        The original column name; if nothing matches, the name with only its
        transformer prefix removed.
    """
    if transformed_feature.startswith('num__'):
        return transformed_feature[len('num__'):]
    if transformed_feature.startswith('cat__'):
        encoded_name = transformed_feature[len('cat__'):]
        candidates = [
            col for col in categorical_features
            if encoded_name == col or encoded_name.startswith(col + '_')
        ]
        # Longest match wins, so a column whose name is a prefix of another
        # column's name cannot be picked by mistake.
        return max(candidates, key=len) if candidates else encoded_name
    return transformed_feature


original_selected_features = []
for transformed_feature in selected_transformed_features:
    original_feature = _original_feature_name(transformed_feature, categorical_cols)
    # Keep first-seen order and skip duplicates / names that were never selected
    if original_feature in selected_features and original_feature not in original_selected_features:
        original_selected_features.append(original_feature)

print(f"Selected Original Features: {original_selected_features}")

# Restrict both splits to the original columns that survived RFECV
X_train_rfe = X_train[original_selected_features]
X_test_rfe = X_test[original_selected_features]

# Re-partition the surviving columns into categorical and numeric groups
categorical_cols_rfe, numeric_cols_rfe = [], []
for col in original_selected_features:
    if col in categorical_cols:
        categorical_cols_rfe.append(col)
    else:
        numeric_cols_rfe.append(col)

# Same two preprocessing branches as before, wired to the reduced column sets
numeric_transformer_rfe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])
categorical_transformer_rfe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)),
])
preprocessor_rfe = ColumnTransformer(transformers=[
    ('num', numeric_transformer_rfe, numeric_cols_rfe),
    ('cat', categorical_transformer_rfe, categorical_cols_rfe),
])

# Full model: reduced preprocessing followed by a seeded Decision Tree
pipeline_rfe = Pipeline(steps=[
    ('preprocessor', preprocessor_rfe),
    ('classifier', DecisionTreeClassifier(random_state=42)),
])

# Hyperparameter tuning using GridSearchCV: exhaustive search over tree depth
# and the split/leaf size limits, scored by 5-fold cross-validated accuracy.
param_grid = {
    'classifier__max_depth': [None, 10, 15, 20, 30],
    'classifier__min_samples_split': [2, 5, 10, 15],
    'classifier__min_samples_leaf': [1, 2, 4, 8],
    'classifier__criterion': ['gini']
}

grid_search = GridSearchCV(pipeline_rfe, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_rfe, y_train)

# Best hyperparameters
best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")

# GridSearchCV uses refit=True by default, so best_estimator_ is already
# fitted on the complete (reduced-feature) training set with the winning
# parameters. Calling fit() on it again would only repeat that work.
best_model = grid_search.best_estimator_

# Predictions on test set
y_pred = best_model.predict(X_test_rfe)

# Cluster labels seen in training, in sorted order. This list fixes the
# row/column order of every per-class metric below.
unique_clusters = sorted(y_train.unique())
target_names = [f'Cluster {i}' for i in unique_clusters]

# Classification Report and Metrics
# labels= pins the report to the training label set, so target_names always
# has the matching length even if a cluster is absent from y_test / y_pred.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=unique_clusters, target_names=target_names))

# Confusion Matrix
# The same labels= keeps row/column k of the matrix aligned with
# unique_clusters[k], which the specificity calculation indexes by position.
cm = confusion_matrix(y_test, y_pred, labels=unique_clusters)
print("\nConfusion Matrix:")
print(cm)

# Accuracy, Recall, Precision (recall/precision are support-weighted averages)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')

# Calculate Specificity for each cluster
def calculate_specificity(class_idx, conf_matrix=None):
    """Return the one-vs-rest specificity, TN / (TN + FP), for one class.

    Parameters
    ----------
    class_idx : int
        Row/column position of the class in the confusion matrix.
    conf_matrix : 2-D array, optional
        Square confusion matrix (rows = true class, columns = predicted
        class). When omitted, the notebook-level ``cm`` is used, which keeps
        existing ``calculate_specificity(idx)`` calls working.

    Returns
    -------
    float
        The specificity, or 0.0 when the class has no negatives at all
        (TN + FP == 0).
    """
    matrix = cm if conf_matrix is None else conf_matrix
    true_positives = matrix[class_idx, class_idx]
    predicted_as_class = matrix[:, class_idx].sum()
    actually_class = matrix[class_idx, :].sum()
    # Everything that is neither in the class's row nor in its column; the
    # diagonal cell is subtracted twice above, so it is added back once.
    tn = matrix.sum() - predicted_as_class - actually_class + true_positives
    # Predicted as this class but truly belonging to another one
    fp = predicted_as_class - true_positives
    negatives = tn + fp
    return tn / negatives if negatives > 0 else 0.0

# Specificity per cluster; a cluster's position in unique_clusters is its
# index into the confusion matrix.
specificity_scores = dict(
    (f'Cluster {label}', calculate_specificity(position))
    for position, label in enumerate(unique_clusters)
)

# Headline test-set metrics
print(f"\nAccuracy: {accuracy:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")
for cluster, spec in specificity_scores.items():
    print(f"Specificity for {cluster}: {spec:.4f}")

print("-----------------------------------------------")

# --- Cross-Validation on Full Dataset ---
# Stack the reduced-feature train rows on top of the test rows so the tuned
# pipeline can be re-evaluated with 5-fold cross-validation over every row.
X_full = pd.concat([X_train_rfe, X_test_rfe])
y_full = pd.concat([y_train, y_test])

cv_scores = cross_val_score(best_model, X_full, y_full, cv=5, scoring='accuracy', n_jobs=-1)
cv_mean = cv_scores.mean()
cv_std = cv_scores.std()
print("\nCross-Validation Results (5-fold):")
print(f"Mean Accuracy: {cv_mean:.4f}")
print(f"Standard Deviation: {cv_std:.4f}")
print(f"Individual Fold Scores: {cv_scores}")

Initial selected features: ['last_purchase_month', 'promotion_end_month', 'product_manufacture_month', 'month_of_year', 'product_expiry_date_year', 'product_manufacture_year', 'transaction_year', 'product_expiry_date_month', 'transaction_month', 'high_value_purchase', 'week_of_year', 'promotion_start_month', 'day_of_week', 'purchase_frequency', 'customer_city', 'gender', 'weekend', 'store_state', 'email_subscriptions', 'store_location', 'high_value_quantity', 'customer_support_calls', 'product_review_count', 'days_since_last_purchase', 'online_purchases', 'distance_to_store', 'product_rating', 'total_transactions', 'product_weight', 'total_items_purchased', 'unit_price', 'total_returned_value', 'membership_years', 'discount_applied', 'avg_discount_used', 'product_shelf_life', 'total_returned_items', 'transaction_hour', 'min_single_purchase_value', 'number_of_children', 'product_stock', 'avg_purchase_value', 'avg_items_per_transaction', 'website_visits', 'age', 'max_single_purchase_valu

# Decision Tree - With Balancing - All Features

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline as SKPipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from collections import Counter
import pandas as pd
import numpy as np
import optuna

# This cell relies on the frames loaded by the data-loading cell; stop early
# with a clear message if either one is missing from the kernel namespace.
if not ('train_data' in globals() and 'test_data' in globals()):
    raise ValueError("train_data or test_data not found! Ensure they are loaded.")

# Attributes handled as categorical (one-hot encoded by the preprocessor).
categorical_candidates = [
    'last_purchase_month', 'promotion_end_month', 'product_manufacture_month',
    'month_of_year', 'product_expiry_date_year', 'product_manufacture_year',
    'transaction_year', 'product_expiry_date_month', 'transaction_month',
    'high_value_purchase', 'week_of_year', 'promotion_start_month', 'day_of_week',
    'purchase_frequency', 'customer_city', 'gender', 'weekend', 'store_state',
    'email_subscriptions', 'store_location', 'high_value_quantity',
]

# Attributes handled as numeric (imputed and scaled by the preprocessor).
numeric_candidates = [
    'customer_support_calls', 'product_review_count', 'days_since_last_purchase',
    'online_purchases', 'distance_to_store', 'product_rating', 'total_transactions',
    'product_weight', 'total_items_purchased', 'unit_price', 'total_returned_value',
    'membership_years', 'discount_applied', 'avg_discount_used', 'product_shelf_life',
    'total_returned_items', 'transaction_hour', 'min_single_purchase_value',
    'number_of_children', 'product_stock', 'avg_purchase_value',
    'avg_items_per_transaction', 'website_visits', 'age', 'max_single_purchase_value',
    'avg_spent_per_category', 'total_discounts_received', 'product_return_rate',
    'avg_transaction_value', 'in_store_purchases',
]

# Keep only the candidates present in train_data (categorical first, then
# numeric); anything missing from the frame is dropped silently.
selected_features = [
    col for col in categorical_candidates + numeric_candidates
    if col in train_data.columns
]
print("Initial selected features:", selected_features)

# Features (X) and target (y) taken from the pre-existing train/test frames
X_train = train_data[selected_features]
y_train = train_data['cluster']
X_test = test_data[selected_features]
y_test = test_data['cluster']

# Class counts before any resampling is applied
print("Class Distribution Before Balancing (train_data):", Counter(y_train))
print("Class Distribution (test_data):", Counter(y_test))

# Partition the surviving features into the two groups the preprocessor expects
categorical_cols = [col for col in selected_features if col in categorical_candidates]
numeric_cols = [col for col in selected_features if col not in categorical_cols]

# Numeric branch: mean imputation followed by standardisation.
numeric_transformer = SKPipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: mode imputation followed by dense one-hot encoding.
categorical_transformer = SKPipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Balancing target: every class ends up with the size of the largest class,
# capped at 10,000 rows.
class_counts = Counter(y_train)
largest_class_size = max(class_counts.values())
target_samples = min(largest_class_size, 10000)

# Only classes above the target are listed for undersampling ...
sampling_strategy_under = {
    label: target_samples
    for label, count in class_counts.items()
    if count > target_samples
}
# ... while every class is listed with the target size for oversampling.
sampling_strategy_over = dict.fromkeys(class_counts, target_samples)

# Show the resulting per-class targets
print("Undersampling Strategy:", sampling_strategy_under)
print("Oversampling Strategy:", sampling_strategy_over)

# imblearn pipeline: preprocess -> undersample -> SMOTE oversample -> tree.
# NOTE(review): SMOTE runs on the one-hot encoded matrix, so synthetic rows
# may carry fractional dummy values - confirm this is intended.
pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('undersample', RandomUnderSampler(sampling_strategy=sampling_strategy_under, random_state=42)),
    ('oversample', SMOTE(sampling_strategy=sampling_strategy_over, random_state=42)),
    ('classifier', DecisionTreeClassifier(random_state=42)),
])

# Objective function for Optuna
def objective(trial):
    """Score one Optuna trial.

    Draws a Decision Tree configuration from the search space, applies it to
    the notebook-level ``pipeline`` in place, and returns the mean 5-fold
    cross-validated accuracy on ``X_train`` / ``y_train`` (to be maximised).
    """
    # Draw one value per hyperparameter (names match the pipeline's step__param keys)
    max_depth = trial.suggest_categorical('classifier__max_depth', [None, 10, 15, 20, 30])
    min_samples_split = trial.suggest_int('classifier__min_samples_split', 2, 15)
    min_samples_leaf = trial.suggest_int('classifier__min_samples_leaf', 1, 8)
    criterion = trial.suggest_categorical('classifier__criterion', ['gini'])

    # Apply the drawn configuration to the shared pipeline
    pipeline.set_params(
        classifier__max_depth=max_depth,
        classifier__min_samples_split=min_samples_split,
        classifier__min_samples_leaf=min_samples_leaf,
        classifier__criterion=criterion,
    )

    # Mean accuracy across the five folds is the value Optuna maximises
    fold_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
    return fold_scores.mean()

# Create Optuna study and optimize.
# The sampler is seeded so that repeated runs propose the same sequence of
# hyperparameter trials; an unseeded study picks different trials every run.
# NOTE: the 600 s timeout can still stop the search after a different number
# of trials on a slower machine.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
try:
    study.optimize(objective, n_trials=50, timeout=600)  # Run 50 trials or 10 minutes
except Exception as e:
    # Add a hint for the most likely cause, then let the error propagate
    print(f"Optuna optimization failed: {e}")
    print("Please check the class distribution and adjust sampling_strategy if necessary.")
    raise

# Best hyperparameters
best_params = study.best_params
print(f"Best Hyperparameters: {best_params}")

# Set best parameters in the pipeline (keys are already in step__param form)
pipeline.set_params(**best_params)

# Train the model with the best parameters on the full training set
try:
    pipeline.fit(X_train, y_train)
except ValueError as e:
    # Add a hint for the most likely cause, then let the error propagate
    print(f"Pipeline fitting failed: {e}")
    print("Please check the class distribution and adjust sampling_strategy if necessary.")
    raise

# Replay the preprocessing and both resampling steps on the training data,
# purely to report the class counts the classifier was trained on.
pipeline_steps = pipeline.named_steps
X_train_encoded = pipeline_steps['preprocessor'].fit_transform(X_train)
X_train_resampled, y_train_resampled = pipeline_steps['undersample'].fit_resample(
    X_train_encoded, y_train
)
X_train_resampled, y_train_resampled = pipeline_steps['oversample'].fit_resample(
    X_train_resampled, y_train_resampled
)
print("After Balancing (Training Data):", Counter(y_train_resampled))

# Get the trained model
best_model = pipeline

# Predictions on test set
y_pred = best_model.predict(X_test)

# Cluster labels seen in training, in sorted order. This list fixes the
# row/column order of every per-class metric below.
unique_clusters = sorted(np.unique(y_train))
target_names = [f'Cluster {i}' for i in unique_clusters]

# Classification Report and Metrics
# labels= pins the report to the training label set, so target_names always
# has the matching length even if a cluster is absent from y_test / y_pred.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=unique_clusters, target_names=target_names))

# Confusion Matrix
# The same labels= keeps row/column k of the matrix aligned with
# unique_clusters[k], which the specificity calculation indexes by position.
cm = confusion_matrix(y_test, y_pred, labels=unique_clusters)
print("\nConfusion Matrix:")
print(cm)

# Accuracy, Recall, Precision (recall/precision are support-weighted averages)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')

# Calculate Specificity for each cluster
def calculate_specificity(class_idx, conf_matrix=None):
    """Return the one-vs-rest specificity, TN / (TN + FP), for one class.

    Parameters
    ----------
    class_idx : int
        Row/column position of the class in the confusion matrix.
    conf_matrix : 2-D array, optional
        Square confusion matrix (rows = true class, columns = predicted
        class). When omitted, the notebook-level ``cm`` is used, which keeps
        existing ``calculate_specificity(idx)`` calls working.

    Returns
    -------
    float
        The specificity, or 0.0 when the class has no negatives at all
        (TN + FP == 0).
    """
    matrix = cm if conf_matrix is None else conf_matrix
    true_positives = matrix[class_idx, class_idx]
    predicted_as_class = matrix[:, class_idx].sum()
    actually_class = matrix[class_idx, :].sum()
    # Everything that is neither in the class's row nor in its column; the
    # diagonal cell is subtracted twice above, so it is added back once.
    tn = matrix.sum() - predicted_as_class - actually_class + true_positives
    # Predicted as this class but truly belonging to another one
    fp = predicted_as_class - true_positives
    negatives = tn + fp
    return tn / negatives if negatives > 0 else 0.0

# Specificity per cluster; a cluster's position in unique_clusters is its
# index into the confusion matrix.
specificity_scores = dict(
    (f'Cluster {label}', calculate_specificity(position))
    for position, label in enumerate(unique_clusters)
)

# Headline test-set metrics
print(f"\nAccuracy: {accuracy:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")
for cluster, spec in specificity_scores.items():
    print(f"Specificity for {cluster}: {spec:.4f}")

print("-----------------------------------------------")

# --- Cross-Validation on Training Data ---
# 5-fold accuracy of the tuned balancing pipeline, using the training rows only
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
cv_mean = cv_scores.mean()
cv_std = cv_scores.std()
print("\nCross-Validation Results (5-fold) on Training Data:")
print(f"Mean Accuracy: {cv_mean:.4f}")
print(f"Standard Deviation: {cv_std:.4f}")
print(f"Individual Fold Scores: {cv_scores}")

[I 2025-07-22 18:29:50,169] A new study created in memory with name: no-name-8efab890-8cc3-4d43-bb9e-8ce41499d1a5


Initial selected features: ['last_purchase_month', 'promotion_end_month', 'product_manufacture_month', 'month_of_year', 'product_expiry_date_year', 'product_manufacture_year', 'transaction_year', 'product_expiry_date_month', 'transaction_month', 'high_value_purchase', 'week_of_year', 'promotion_start_month', 'day_of_week', 'purchase_frequency', 'customer_city', 'gender', 'weekend', 'store_state', 'email_subscriptions', 'store_location', 'high_value_quantity', 'customer_support_calls', 'product_review_count', 'days_since_last_purchase', 'online_purchases', 'distance_to_store', 'product_rating', 'total_transactions', 'product_weight', 'total_items_purchased', 'unit_price', 'total_returned_value', 'membership_years', 'discount_applied', 'avg_discount_used', 'product_shelf_life', 'total_returned_items', 'transaction_hour', 'min_single_purchase_value', 'number_of_children', 'product_stock', 'avg_purchase_value', 'avg_items_per_transaction', 'website_visits', 'age', 'max_single_purchase_valu

[I 2025-07-22 18:30:18,197] Trial 0 finished with value: 0.49485714285714283 and parameters: {'classifier__max_depth': 20, 'classifier__min_samples_split': 9, 'classifier__min_samples_leaf': 1, 'classifier__criterion': 'gini'}. Best is trial 0 with value: 0.49485714285714283.
[I 2025-07-22 18:30:32,030] Trial 1 finished with value: 0.5327619047619048 and parameters: {'classifier__max_depth': 10, 'classifier__min_samples_split': 12, 'classifier__min_samples_leaf': 1, 'classifier__criterion': 'gini'}. Best is trial 1 with value: 0.5327619047619048.
[I 2025-07-22 18:30:47,588] Trial 2 finished with value: 0.5086666666666668 and parameters: {'classifier__max_depth': None, 'classifier__min_samples_split': 8, 'classifier__min_samples_leaf': 5, 'classifier__criterion': 'gini'}. Best is trial 1 with value: 0.5327619047619048.
[I 2025-07-22 18:31:01,334] Trial 3 finished with value: 0.5327142857142857 and parameters: {'classifier__max_depth': 10, 'classifier__min_samples_split': 3, 'classifier_

Best Hyperparameters: {'classifier__max_depth': 10, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 8, 'classifier__criterion': 'gini'}
After Balancing (Training Data): Counter({0: 7202, 1: 7202, 2: 7202})

Classification Report:
              precision    recall  f1-score   support

   Cluster 0       0.54      0.55      0.54      3019
   Cluster 1       0.55      0.54      0.54      3055
   Cluster 2       0.54      0.54      0.54      2926

    accuracy                           0.54      9000
   macro avg       0.54      0.54      0.54      9000
weighted avg       0.54      0.54      0.54      9000


Confusion Matrix:
[[1661  693  665]
 [ 752 1640  663]
 [ 690  649 1587]]

Accuracy: 0.5431
Recall: 0.5431
Precision: 0.5432
Specificity for Cluster 0: 0.7589
Specificity for Cluster 1: 0.7743
Specificity for Cluster 2: 0.7814
-----------------------------------------------

Cross-Validation Results (5-fold) on Training Data:
Mean Accuracy: 0.5343
Standard Deviation:

# Decision Tree - With Balancing - Selected Features

In [None]:
!pip install optuna
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline as SKPipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from collections import Counter
import pandas as pd
import numpy as np
import optuna

# Both frames must already exist in the notebook namespace; stop early if either is missing
for required_frame in ('train_data', 'test_data'):
    if required_frame not in globals():
        raise ValueError("train_data or test_data not found! Ensure they are loaded.")

# Candidate attributes that are handled as categorical further down
categorical_attributes = [
    'last_purchase_month', 'promotion_end_month', 'product_manufacture_month',
    'month_of_year', 'product_expiry_date_year', 'product_manufacture_year',
    'transaction_year', 'product_expiry_date_month', 'transaction_month',
    'high_value_purchase', 'week_of_year', 'promotion_start_month', 'day_of_week',
    'purchase_frequency', 'customer_city', 'gender', 'weekend', 'store_state',
    'email_subscriptions', 'store_location', 'high_value_quantity',
]

# Candidate attributes that are handled as numeric further down
numeric_attributes = [
    'customer_support_calls', 'product_review_count', 'days_since_last_purchase',
    'online_purchases', 'distance_to_store', 'product_rating', 'total_transactions',
    'product_weight', 'total_items_purchased', 'unit_price', 'total_returned_value',
    'membership_years', 'discount_applied', 'avg_discount_used', 'product_shelf_life',
    'total_returned_items', 'transaction_hour', 'min_single_purchase_value',
    'number_of_children', 'product_stock', 'avg_purchase_value',
    'avg_items_per_transaction', 'website_visits', 'age', 'max_single_purchase_value',
    'avg_spent_per_category', 'total_discounts_received', 'product_return_rate',
    'avg_transaction_value', 'in_store_purchases',
]

# Keep only the candidates that are actual columns of train_data (listing order is preserved)
available_columns = train_data.columns
selected_features = [
    col for col in categorical_attributes + numeric_attributes
    if col in available_columns
]
print("Initial selected features:", selected_features)

# Features and target are read from the frames loaded earlier; no new split is made here
X_train, y_train = train_data[selected_features], train_data['cluster']
X_test, y_test = test_data[selected_features], test_data['cluster']

# Report how many rows each cluster has in both targets, before any resampling
print("Class Distribution Before Balancing (train_data):", Counter(y_train))
print("Class Distribution (test_data):", Counter(y_test))

# Names that are one-hot encoded; every other selected feature is treated as numeric
categorical_candidates = {
    'last_purchase_month', 'promotion_end_month', 'product_manufacture_month', 'month_of_year',
    'product_expiry_date_year', 'product_manufacture_year', 'transaction_year',
    'product_expiry_date_month', 'transaction_month', 'high_value_purchase', 'week_of_year',
    'promotion_start_month', 'day_of_week', 'purchase_frequency', 'customer_city', 'gender',
    'weekend', 'store_state', 'email_subscriptions', 'store_location', 'high_value_quantity',
}

# Both lists follow the order of selected_features
categorical_cols = [col for col in selected_features if col in categorical_candidates]
numeric_cols = [col for col in selected_features if col not in categorical_candidates]

# Numeric branch: fill gaps with the column mean, then standardise
numeric_transformer = SKPipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot encode to a dense array.
# NOTE(review): drop='first' is combined with handle_unknown='ignore'; confirm how unseen
# categories in test_data should be represented.
categorical_transformer = SKPipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)),
])

# The branch names 'num' and 'cat' become the prefixes of the transformed feature names
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Every class is brought to the size of the largest class, capped at 10,000 rows
class_counts = Counter(y_train)
largest_class = max(class_counts.values())
target_samples = largest_class if largest_class < 10000 else 10000

# Undersampling only lists classes above the target; oversampling lists every class
sampling_strategy_under = {}
sampling_strategy_over = {}
for label, count in class_counts.items():
    if count > target_samples:
        sampling_strategy_under[label] = target_samples
    sampling_strategy_over[label] = target_samples

# Show the per-class targets handed to the two samplers
print("Undersampling Strategy:", sampling_strategy_under)
print("Oversampling Strategy:", sampling_strategy_over)

# Steps run in order: encode/scale, trim oversized classes, then synthesise rows with SMOTE.
# NOTE(review): SMOTE receives the one-hot encoded columns produced by the preprocessor;
# confirm that interpolating those columns is intended.
balancing_steps = [
    ('preprocessor', preprocessor),
    ('undersample', RandomUnderSampler(random_state=42, sampling_strategy=sampling_strategy_under)),
    ('oversample', SMOTE(random_state=42, sampling_strategy=sampling_strategy_over)),
]
balancing_pipeline = ImbPipeline(steps=balancing_steps)

# Fit the steps on the training data and keep the transformed, balanced result
resampled = balancing_pipeline.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled
print("After Balancing (Training Data):", Counter(y_train_resampled))

# Column names of the transformed matrix, taken from the fitted preprocessing step
fitted_preprocessing_step = balancing_pipeline.named_steps['preprocessor']
feature_names = fitted_preprocessing_step.get_feature_names_out()

# An unconstrained tree fitted on the balanced data supplies the importance ranking
# that the Optuna objective uses to pick its top-k columns
preliminary_model = DecisionTreeClassifier(random_state=42).fit(X_train_resampled, y_train_resampled)
feature_importances = preliminary_model.feature_importances_

# Define objective function for Optuna
def objective(trial):
    """Score one Optuna trial.

    Samples decision-tree hyperparameters plus a feature count, keeps that many
    columns of ``X_train_resampled`` ranked by ``feature_importances`` (highest
    first), and returns the mean 5-fold cross-validated accuracy. The chosen
    column indices are stored on the trial as the ``selected_indices`` user
    attribute.
    """
    # Sample the search space (parameter names become the keys of study.best_params)
    depth = trial.suggest_categorical('max_depth', [None, 10, 15, 20, 30])
    split_minimum = trial.suggest_int('min_samples_split', 2, 15)
    leaf_minimum = trial.suggest_int('min_samples_leaf', 1, 8)
    split_criterion = trial.suggest_categorical('criterion', ['gini'])
    top_k = trial.suggest_int('n_features', 1, len(feature_names))

    # Rank columns by descending importance and keep the first top_k
    ranked_columns = np.argsort(feature_importances)[::-1]
    kept_columns = ranked_columns[:top_k]

    tree = DecisionTreeClassifier(
        random_state=42,
        criterion=split_criterion,
        max_depth=depth,
        min_samples_split=split_minimum,
        min_samples_leaf=leaf_minimum,
    )

    # 5-fold accuracy on the balanced training matrix restricted to the kept columns
    fold_accuracies = cross_val_score(
        tree,
        X_train_resampled[:, kept_columns],
        y_train_resampled,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )

    # Remember which columns this trial used so the best trial can be rebuilt later
    trial.set_user_attr('selected_indices', kept_columns.tolist())

    return fold_accuracies.mean()

# Create Optuna study and optimize
# The sampler is seeded so the sequence of suggested hyperparameters is repeatable after a
# kernel restart; an unseeded study proposes different trials on every run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
try:
    # Stops after 50 trials or 10 minutes, whichever comes first. If the timeout is hit,
    # the number of completed trials can still differ between runs.
    study.optimize(objective, n_trials=50, timeout=600)
except Exception as e:
    # Print a hint for the reader, then re-raise so the failure is not hidden
    print(f"Optuna optimization failed: {e}")
    print("Please check the class distribution and adjust sampling_strategy if necessary.")
    raise

# Pull the winning trial, its parameters, and the column indices it stored
best_trial = study.best_trial
best_params = study.best_params
selected_indices = best_trial.user_attrs['selected_indices']

# Translate column positions into transformed feature names
selected_transformed_features = []
for column_position in selected_indices:
    selected_transformed_features.append(feature_names[column_position])

print(f"Best Hyperparameters: {best_params}")
print(f"Number of Features Selected: {len(selected_transformed_features)}")
print(f"Selected Transformed Features: {selected_transformed_features}")

# Map transformed features back to original features
def _original_feature_name(transformed_feature, categorical_columns):
    """Return the source column name behind one transformed feature name.

    Args:
        transformed_feature: Name produced by the preprocessor, e.g.
            ``'num__age'`` or ``'cat__store_state_TX'``.
        categorical_columns: Names of the columns that were one-hot encoded.

    Returns:
        The name with its ``num__``/``cat__`` prefix removed and, for one-hot
        columns, the ``_<category>`` suffix removed. Names that match no known
        categorical column are returned without the prefix only.
    """
    if transformed_feature.startswith('num__'):
        return transformed_feature[len('num__'):]
    if transformed_feature.startswith('cat__'):
        encoded_name = transformed_feature[len('cat__'):]
        # Column names contain underscores themselves, so splitting on '_' cannot recover
        # them; match against the known columns instead, longest first, so a longer name
        # wins over a shorter one that shares its prefix.
        for column in sorted(categorical_columns, key=len, reverse=True):
            if encoded_name == column or encoded_name.startswith(column + '_'):
                return column
        return encoded_name
    return transformed_feature


# Collect each source column once, in the order its first transformed feature appears
original_selected_features = []
for transformed_feature in selected_transformed_features:
    original_feature = _original_feature_name(transformed_feature, categorical_cols)
    if original_feature in selected_features and original_feature not in original_selected_features:
        original_selected_features.append(original_feature)
print(f"Selected Original Features: {original_selected_features}")

# Rebuild the tree from the tuned settings; 'n_features' is not a tree parameter and is
# applied through selected_indices instead
tree_settings = {
    name: best_params[name]
    for name in ('max_depth', 'min_samples_split', 'min_samples_leaf', 'criterion')
}
best_model = DecisionTreeClassifier(random_state=42, **tree_settings)

# Fit on the balanced training matrix restricted to the columns of the best trial
X_train_rfe = X_train_resampled[:, selected_indices]
best_model.fit(X_train_rfe, y_train_resampled)

# Transform test data and select features
# Use the fitted preprocessing step held by the pipeline rather than the module-level
# `preprocessor` object: the pipeline is what was fitted, and this is the same step the
# transformed feature names were read from. The samplers are not applied to test data.
fitted_preprocessor = balancing_pipeline.named_steps['preprocessor']
X_test_transformed = fitted_preprocessor.transform(X_test)
X_test_rfe = X_test_transformed[:, selected_indices]

# Predictions on test set
y_pred = best_model.predict(X_test_rfe)

# Get unique cluster labels for classification report
unique_clusters = sorted(np.unique(y_train))
target_names = [f'Cluster {i}' for i in unique_clusters]

# Classification Report and Metrics
# labels= pins the rows to the training clusters so target_names always lines up,
# even if a cluster is missing from y_test or y_pred
print("\nClassification Report:")
print(classification_report(
    y_test, y_pred, labels=unique_clusters, target_names=target_names, zero_division=0
))

# Confusion Matrix
# labels= keeps one row/column per training cluster, in the same order as unique_clusters,
# which the positional specificity calculation relies on
cm = confusion_matrix(y_test, y_pred, labels=unique_clusters)
print("\nConfusion Matrix:")
print(cm)

# Accuracy, Recall, Precision (recall and precision are support-weighted averages)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)

# Calculate Specificity for each cluster
def calculate_specificity(class_idx, matrix=None):
    """Return the one-vs-rest specificity, TN / (TN + FP), for one class.

    Args:
        class_idx: Row/column position of the class in the confusion matrix.
        matrix: Square confusion matrix with true classes on rows and predicted
            classes on columns. When omitted, the notebook-level ``cm`` is used,
            so existing ``calculate_specificity(idx)`` calls keep working.

    Returns:
        The specificity as a float, or 0.0 when the class has no negatives
        (TN + FP == 0).
    """
    matrix = np.asarray(cm if matrix is None else matrix)
    predicted_as_class = matrix[:, class_idx].sum()
    actually_class = matrix[class_idx, :].sum()
    true_positive = matrix[class_idx, class_idx]

    # Everything predicted as this class that is not a true positive is a false positive
    fp = predicted_as_class - true_positive
    # Remove the class's row and column from the total; the diagonal cell was removed twice
    tn = matrix.sum() - predicted_as_class - actually_class + true_positive

    negatives = tn + fp
    return tn / negatives if negatives > 0 else 0.0

# One specificity value per cluster; the position in unique_clusters is the matrix index
specificity_scores = {}
for idx, cluster_label in enumerate(unique_clusters):
    specificity_scores[f'Cluster {cluster_label}'] = calculate_specificity(idx)

# Headline metrics first, then the per-cluster specificity lines
print(f"\nAccuracy: {accuracy:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")
for cluster in specificity_scores:
    spec = specificity_scores[cluster]
    print(f"Specificity for {cluster}: {spec:.4f}")

print("-----------------------------------------------")

# --- Cross-Validation on Training Data ---
# Re-score the tuned tree with 5-fold CV on the balanced training matrix, restricted to
# the selected columns (the same matrix the hyperparameter search used)
cv_scores = cross_val_score(
    best_model,
    X_train_rfe,
    y_train_resampled,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
cv_mean, cv_std = cv_scores.mean(), cv_scores.std()
print(f"\nCross-Validation Results (5-fold) on Training Data:")
print(f"Mean Accuracy: {cv_mean:.4f}")
print(f"Standard Deviation: {cv_std:.4f}")
print(f"Individual Fold Scores: {cv_scores}")

Initial selected features: ['last_purchase_month', 'promotion_end_month', 'product_manufacture_month', 'month_of_year', 'product_expiry_date_year', 'product_manufacture_year', 'transaction_year', 'product_expiry_date_month', 'transaction_month', 'high_value_purchase', 'week_of_year', 'promotion_start_month', 'day_of_week', 'purchase_frequency', 'customer_city', 'gender', 'weekend', 'store_state', 'email_subscriptions', 'store_location', 'high_value_quantity', 'customer_support_calls', 'product_review_count', 'days_since_last_purchase', 'online_purchases', 'distance_to_store', 'product_rating', 'total_transactions', 'product_weight', 'total_items_purchased', 'unit_price', 'total_returned_value', 'membership_years', 'discount_applied', 'avg_discount_used', 'product_shelf_life', 'total_returned_items', 'transaction_hour', 'min_single_purchase_value', 'number_of_children', 'product_stock', 'avg_purchase_value', 'avg_items_per_transaction', 'website_visits', 'age', 'max_single_purchase_valu

[I 2025-07-22 19:57:35,707] A new study created in memory with name: no-name-d4258316-3ce6-4352-b200-4b0b17e7275b
[I 2025-07-22 19:57:42,397] Trial 0 finished with value: 0.5038415426699627 and parameters: {'max_depth': 20, 'min_samples_split': 14, 'min_samples_leaf': 1, 'criterion': 'gini', 'n_features': 127}. Best is trial 0 with value: 0.5038415426699627.
[I 2025-07-22 19:57:45,422] Trial 1 finished with value: 0.5121727975072183 and parameters: {'max_depth': 20, 'min_samples_split': 13, 'min_samples_leaf': 3, 'criterion': 'gini', 'n_features': 77}. Best is trial 1 with value: 0.5121727975072183.
[I 2025-07-22 19:57:47,880] Trial 2 finished with value: 0.537397047511047 and parameters: {'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 7, 'criterion': 'gini', 'n_features': 98}. Best is trial 2 with value: 0.537397047511047.
[I 2025-07-22 19:57:48,690] Trial 3 finished with value: 0.47574726530066724 and parameters: {'max_depth': None, 'min_samples_split': 4, 'min_samples_

Best Hyperparameters: {'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 7, 'criterion': 'gini', 'n_features': 21}
Number of Features Selected: 21
Selected Transformed Features: ['num__product_review_count', 'num__days_since_last_purchase', 'num__distance_to_store', 'num__online_purchases', 'num__customer_support_calls', 'num__unit_price', 'num__product_weight', 'num__total_items_purchased', 'num__product_rating', 'num__total_returned_value', 'num__total_transactions', 'num__product_shelf_life', 'num__discount_applied', 'num__avg_items_per_transaction', 'num__avg_discount_used', 'num__min_single_purchase_value', 'num__avg_spent_per_category', 'num__product_stock', 'num__transaction_hour', 'num__membership_years', 'num__avg_purchase_value']
Selected Original Features: ['product_review_count', 'days_since_last_purchase', 'distance_to_store', 'online_purchases', 'customer_support_calls', 'unit_price', 'product_weight', 'total_items_purchased', 'product_rating', 'total_returned_