In [None]:
from pandas import read_csv
import pandas as pd

# Both CSV files are decoded explicitly as ASCII (the encoding detected for them).
csv_options = {'encoding': 'ascii'}

train_data = pd.read_csv('/content/srsstat_train_data.csv', **csv_options)
test_data = pd.read_csv('/content/srsstat_test_data.csv', **csv_options)

In [None]:
# Count how many training rows carry each distinct value of the 'cluster' column.
train_data.value_counts('cluster')

Unnamed: 0_level_0,count
cluster,Unnamed: 1_level_1
1,7202
0,6913
2,6885


In [None]:
# Print the column list with non-null counts and dtypes for the training frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21000 entries, 0 to 20999
Data columns (total 87 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   customer_id                21000 non-null  int64  
 1   age                        21000 non-null  int64  
 2   gender                     21000 non-null  object 
 3   income_bracket             21000 non-null  object 
 4   loyalty_program            21000 non-null  object 
 5   membership_years           21000 non-null  int64  
 6   churned                    21000 non-null  object 
 7   marital_status             21000 non-null  object 
 8   number_of_children         21000 non-null  int64  
 9   education_level            21000 non-null  object 
 10  occupation                 21000 non-null  object 
 11  transaction_id             21000 non-null  int64  
 12  product_id                 21000 non-null  int64  
 13  product_category           21000 non-null  obj

# Decision Tree - Without Balancing - All Features

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score  # Added cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, confusion_matrix
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Fail fast when the data-loading cell has not been executed in this kernel.
if any(frame_name not in globals() for frame_name in ('train_data', 'test_data')):
    raise ValueError("train_data or test_data not found! Ensure previous steps are executed.")

# Candidate attributes, grouped by how the preprocessor will treat them.
categorical_candidates = [
    'last_purchase_month', 'promotion_end_month', 'product_manufacture_month',
    'month_of_year', 'product_expiry_date_year', 'product_manufacture_year',
    'transaction_year', 'product_expiry_date_month', 'transaction_month',
    'high_value_purchase', 'week_of_year', 'promotion_start_month', 'day_of_week',
    'purchase_frequency', 'customer_city', 'gender', 'weekend', 'store_state',
    'email_subscriptions', 'store_location', 'high_value_quantity',
]
numeric_candidates = [
    'customer_support_calls', 'product_review_count', 'days_since_last_purchase',
    'online_purchases', 'distance_to_store', 'product_rating', 'total_transactions',
    'product_weight', 'total_items_purchased', 'unit_price', 'total_returned_value',
    'membership_years', 'discount_applied', 'avg_discount_used', 'product_shelf_life',
    'total_returned_items', 'transaction_hour', 'min_single_purchase_value',
    'number_of_children', 'product_stock', 'avg_purchase_value',
    'avg_items_per_transaction', 'website_visits', 'age', 'max_single_purchase_value',
    'avg_spent_per_category', 'total_discounts_received', 'product_return_rate',
    'avg_transaction_value', 'in_store_purchases',
]

# Keep only the candidates that are actually columns of train_data (order preserved:
# categorical attributes first, then numeric ones).
selected_features = [
    col for col in categorical_candidates + numeric_candidates
    if col in train_data.columns
]
print("Selected features for Decision Tree:", selected_features)

# Feature matrices and the 'cluster' target for the train and test frames.
X_train, X_test = (frame[selected_features] for frame in (train_data, test_data))
y_train, y_test = train_data['cluster'], test_data['cluster']

# Split the surviving features into the two groups used by the ColumnTransformer.
categorical_cols = [col for col in selected_features if col in categorical_candidates]
numeric_cols = [col for col in selected_features if col not in categorical_cols]

# Numeric branch: fill missing values with the column mean, then standardise.
numeric_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent level, then one-hot encode.
categorical_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_steps, numeric_cols),
    ('cat', categorical_steps, categorical_cols),
])

# Full model: preprocessing followed by a Decision Tree with a fixed seed.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42)),
])

# Hyperparameter tuning using GridSearchCV: every combination below is scored with
# 5-fold cross-validated accuracy on the training data only.
param_grid = {
    'classifier__max_depth': [None, 10, 15, 20, 30],
    'classifier__min_samples_split': [2, 5, 10, 15],
    'classifier__min_samples_leaf': [1, 2, 4, 8],
    'classifier__criterion': ['gini']
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best hyperparameters
best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")

# GridSearchCV is used with its default refit=True, so best_estimator_ has already been
# re-trained on all of (X_train, y_train) with the winning parameters. Fitting it a second
# time would only repeat that work, so the estimator is used as returned.
best_model = grid_search.best_estimator_

# Predictions on test set
y_pred = best_model.predict(X_test)

# Cluster labels seen during training, in sorted order; this order defines the row/column
# order of every per-class report below.
unique_clusters = sorted(y_train.unique())
target_names = [f'Cluster {i}' for i in unique_clusters]

# Classification Report and Metrics.
# `labels` is passed explicitly so that target_names[k] always describes unique_clusters[k],
# even when a cluster is absent from both y_test and y_pred.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=unique_clusters, target_names=target_names))

# Confusion Matrix (rows = true cluster, columns = predicted cluster).
# Pinning `labels` guarantees that index k of the matrix is unique_clusters[k], which is
# the positional index calculate_specificity relies on.
cm = confusion_matrix(y_test, y_pred, labels=unique_clusters)
print("\nConfusion Matrix:")
print(cm)

# Accuracy, plus support-weighted Recall and Precision
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')

# Calculate Specificity for each cluster
def calculate_specificity(class_idx, confusion=None):
    """Return the one-vs-rest specificity TN / (TN + FP) of one class.

    Parameters
    ----------
    class_idx : int
        Positional index of the class in the confusion matrix (row and column).
    confusion : array-like of shape (n_classes, n_classes), optional
        Confusion matrix with true classes on rows and predicted classes on columns.
        When omitted, the module-level matrix ``cm`` is used, which keeps existing
        one-argument calls working.

    Returns
    -------
    float
        Specificity of the class, or 0.0 when there are no negative samples
        (TN + FP == 0).
    """
    matrix = np.asarray(cm if confusion is None else confusion)

    hits = matrix[class_idx, class_idx]
    predicted_as_class = matrix[:, class_idx].sum()
    actually_class = matrix[class_idx, :].sum()

    # True negatives: everything that is neither in the class row nor the class column
    # (the diagonal cell is subtracted twice above, so it is added back once).
    tn = matrix.sum() - predicted_as_class - actually_class + hits
    # False positives: predicted as this class but belonging to another one.
    fp = predicted_as_class - hits

    negatives = tn + fp
    return tn / negatives if negatives > 0 else 0.0

# Specificity per cluster, keyed by the same display names used in the report.
specificity_scores = dict(
    (f'Cluster {label}', calculate_specificity(position))
    for position, label in enumerate(unique_clusters)
)

# Headline metrics on the held-out test set.
print(f"\nAccuracy: {accuracy:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")
for cluster_label in specificity_scores:
    print(f"Specificity for {cluster_label}: {specificity_scores[cluster_label]:.4f}")

print("-----------------------------------------------")

# --- Cross-Validation on Full Dataset ---
# Stack the train and test rows (features and target) back into one dataset.
X_full = pd.concat([X_train, X_test], axis=0)
y_full = pd.concat([y_train, y_test], axis=0)

# 5-fold cross-validated accuracy of the tuned model over the stacked dataset.
cv_scores = cross_val_score(
    best_model,
    X_full,
    y_full,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
print(f"\nCross-Validation Results (5-fold):")
print(f"Mean Accuracy: {cv_scores.mean():.4f}")
print(f"Standard Deviation: {cv_scores.std():.4f}")
print(f"Individual Fold Scores: {cv_scores}")


Selected features for Decision Tree: ['last_purchase_month', 'promotion_end_month', 'product_manufacture_month', 'month_of_year', 'product_expiry_date_year', 'product_manufacture_year', 'transaction_year', 'product_expiry_date_month', 'transaction_month', 'high_value_purchase', 'week_of_year', 'promotion_start_month', 'day_of_week', 'purchase_frequency', 'customer_city', 'gender', 'weekend', 'store_state', 'email_subscriptions', 'store_location', 'high_value_quantity', 'customer_support_calls', 'product_review_count', 'days_since_last_purchase', 'online_purchases', 'distance_to_store', 'product_rating', 'total_transactions', 'product_weight', 'total_items_purchased', 'unit_price', 'total_returned_value', 'membership_years', 'discount_applied', 'avg_discount_used', 'product_shelf_life', 'total_returned_items', 'transaction_hour', 'min_single_purchase_value', 'number_of_children', 'product_stock', 'avg_purchase_value', 'avg_items_per_transaction', 'website_visits', 'age', 'max_single_pur

# Decision Tree - Without Balancing - Selected Features

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFECV
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np

# Fail fast when the data-loading cell has not been executed in this kernel.
if any(frame_name not in globals() for frame_name in ('train_data', 'test_data')):
    raise ValueError("train_data or test_data not found! Ensure previous steps are executed.")

# Candidate attributes, grouped by how the preprocessor will treat them.
categorical_candidates = [
    'last_purchase_month', 'promotion_end_month', 'product_manufacture_month',
    'month_of_year', 'product_expiry_date_year', 'product_manufacture_year',
    'transaction_year', 'product_expiry_date_month', 'transaction_month',
    'high_value_purchase', 'week_of_year', 'promotion_start_month', 'day_of_week',
    'purchase_frequency', 'customer_city', 'gender', 'weekend', 'store_state',
    'email_subscriptions', 'store_location', 'high_value_quantity',
]
numeric_candidates = [
    'customer_support_calls', 'product_review_count', 'days_since_last_purchase',
    'online_purchases', 'distance_to_store', 'product_rating', 'total_transactions',
    'product_weight', 'total_items_purchased', 'unit_price', 'total_returned_value',
    'membership_years', 'discount_applied', 'avg_discount_used', 'product_shelf_life',
    'total_returned_items', 'transaction_hour', 'min_single_purchase_value',
    'number_of_children', 'product_stock', 'avg_purchase_value',
    'avg_items_per_transaction', 'website_visits', 'age', 'max_single_purchase_value',
    'avg_spent_per_category', 'total_discounts_received', 'product_return_rate',
    'avg_transaction_value', 'in_store_purchases',
]

# Keep only the candidates that are actually columns of train_data (order preserved:
# categorical attributes first, then numeric ones).
selected_features = [
    col for col in categorical_candidates + numeric_candidates
    if col in train_data.columns
]
print("Initial selected features:", selected_features)

# Feature matrices and the 'cluster' target for the train and test frames.
X_train, X_test = (frame[selected_features] for frame in (train_data, test_data))
y_train, y_test = train_data['cluster'], test_data['cluster']

# Split the surviving features into the two groups used by the ColumnTransformer.
categorical_cols = [col for col in selected_features if col in categorical_candidates]
numeric_cols = [col for col in selected_features if col not in categorical_cols]


# Numeric branch: fill missing values with the column mean, then standardise.
numeric_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent level, then one-hot encode.
categorical_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_steps, numeric_cols),
    ('cat', categorical_steps, categorical_cols),
])

# Encode and scale up front: the preprocessor is fitted on the training rows and then
# applied to the test rows, so RFECV receives a purely numeric matrix.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Column names of the transformed matrix, in output order.
feature_names = preprocessor.get_feature_names_out()

# Recursive feature elimination with 3-fold CV, dropping one transformed column per step
# and ranking columns with a seeded Decision Tree.
rfecv = RFECV(
    estimator=DecisionTreeClassifier(random_state=42),
    step=1,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)
rfecv.fit(X_train_transformed, y_train)

# Get selected features from transformed space
selected_mask = rfecv.support_
selected_transformed_features = feature_names[selected_mask].tolist()
print(f"Number of Features Selected by RFECV: {len(selected_transformed_features)}")
print(f"Selected Transformed Features: {selected_transformed_features}")


def _source_column(transformed_feature, categorical_columns):
    """Map a transformed column name back to the original column it came from.

    Names are expected in the ColumnTransformer form ``num__<column>`` or
    ``cat__<column>_<category>``. Column names themselves contain underscores
    (e.g. ``last_purchase_month``), so the categorical case cannot be resolved by
    splitting on ``_``; instead the known categorical column that prefixes the encoded
    name is looked up, preferring the longest match.

    Parameters
    ----------
    transformed_feature : str
        A name taken from the preprocessor's output feature names.
    categorical_columns : list of str
        Original categorical column names fed to the one-hot encoder.

    Returns
    -------
    str
        The original column name, or the name with its prefix stripped when no known
        categorical column matches, or the input unchanged when it has no known prefix.
    """
    if transformed_feature.startswith('num__'):
        return transformed_feature[len('num__'):]
    if transformed_feature.startswith('cat__'):
        encoded_name = transformed_feature[len('cat__'):]
        matches = [col for col in categorical_columns if encoded_name.startswith(f'{col}_')]
        return max(matches, key=len) if matches else encoded_name
    return transformed_feature


# Map transformed features back to original features.
# One-hot encoding creates several columns per categorical feature, so an original
# feature is kept once as soon as any of its encoded columns was selected.
original_selected_features = []
for transformed_feature in selected_transformed_features:
    original_feature = _source_column(transformed_feature, categorical_cols)
    if original_feature in selected_features and original_feature not in original_selected_features:
        original_selected_features.append(original_feature)

print(f"Selected Original Features: {original_selected_features}")

# Restrict both feature frames to the original columns retained by feature selection.
X_train_rfe, X_test_rfe = (frame[original_selected_features] for frame in (X_train, X_test))

# Re-derive the categorical / numeric split for the reduced feature set.
categorical_cols_rfe = []
numeric_cols_rfe = []
for col in original_selected_features:
    if col in categorical_cols:
        categorical_cols_rfe.append(col)
    else:
        numeric_cols_rfe.append(col)

# Same preprocessing recipe as before, wired to the reduced column groups.
numeric_steps_rfe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])
categorical_steps_rfe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)),
])
preprocessor_rfe = ColumnTransformer(transformers=[
    ('num', numeric_steps_rfe, numeric_cols_rfe),
    ('cat', categorical_steps_rfe, categorical_cols_rfe),
])

# Full model: reduced preprocessing followed by a Decision Tree with a fixed seed.
pipeline_rfe = Pipeline([
    ('preprocessor', preprocessor_rfe),
    ('classifier', DecisionTreeClassifier(random_state=42)),
])

# Hyperparameter tuning using GridSearchCV: every combination below is scored with
# 5-fold cross-validated accuracy on the training data only.
param_grid = {
    'classifier__max_depth': [None, 10, 15, 20, 30],
    'classifier__min_samples_split': [2, 5, 10, 15],
    'classifier__min_samples_leaf': [1, 2, 4, 8],
    'classifier__criterion': ['gini']
}

grid_search = GridSearchCV(pipeline_rfe, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_rfe, y_train)

# Best hyperparameters
best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")

# GridSearchCV is used with its default refit=True, so best_estimator_ has already been
# re-trained on all of (X_train_rfe, y_train) with the winning parameters. Fitting it a
# second time would only repeat that work, so the estimator is used as returned.
best_model = grid_search.best_estimator_

# Predictions on test set
y_pred = best_model.predict(X_test_rfe)

# Cluster labels seen during training, in sorted order; this order defines the row/column
# order of every per-class report below.
unique_clusters = sorted(y_train.unique())
target_names = [f'Cluster {i}' for i in unique_clusters]

# Classification Report and Metrics.
# `labels` is passed explicitly so that target_names[k] always describes unique_clusters[k],
# even when a cluster is absent from both y_test and y_pred.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=unique_clusters, target_names=target_names))

# Confusion Matrix (rows = true cluster, columns = predicted cluster).
# Pinning `labels` guarantees that index k of the matrix is unique_clusters[k], which is
# the positional index calculate_specificity relies on.
cm = confusion_matrix(y_test, y_pred, labels=unique_clusters)
print("\nConfusion Matrix:")
print(cm)

# Accuracy, plus support-weighted Recall and Precision
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')

# Calculate Specificity for each cluster
def calculate_specificity(class_idx, confusion=None):
    """Return the one-vs-rest specificity TN / (TN + FP) of one class.

    Parameters
    ----------
    class_idx : int
        Positional index of the class in the confusion matrix (row and column).
    confusion : array-like of shape (n_classes, n_classes), optional
        Confusion matrix with true classes on rows and predicted classes on columns.
        When omitted, the module-level matrix ``cm`` is used, which keeps existing
        one-argument calls working.

    Returns
    -------
    float
        Specificity of the class, or 0.0 when there are no negative samples
        (TN + FP == 0).
    """
    matrix = np.asarray(cm if confusion is None else confusion)

    hits = matrix[class_idx, class_idx]
    predicted_as_class = matrix[:, class_idx].sum()
    actually_class = matrix[class_idx, :].sum()

    # True negatives: everything that is neither in the class row nor the class column
    # (the diagonal cell is subtracted twice above, so it is added back once).
    tn = matrix.sum() - predicted_as_class - actually_class + hits
    # False positives: predicted as this class but belonging to another one.
    fp = predicted_as_class - hits

    negatives = tn + fp
    return tn / negatives if negatives > 0 else 0.0

# Specificity per cluster, keyed by the same display names used in the report.
specificity_scores = dict(
    (f'Cluster {label}', calculate_specificity(position))
    for position, label in enumerate(unique_clusters)
)

# Headline metrics on the held-out test set.
print(f"\nAccuracy: {accuracy:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")
for cluster_label in specificity_scores:
    print(f"Specificity for {cluster_label}: {specificity_scores[cluster_label]:.4f}")

print("-----------------------------------------------")

# --- Cross-Validation on Full Dataset ---
# Stack the reduced train and test rows (features and target) back into one dataset.
X_full = pd.concat([X_train_rfe, X_test_rfe], axis=0)
y_full = pd.concat([y_train, y_test], axis=0)

# 5-fold cross-validated accuracy of the tuned model over the stacked dataset.
cv_scores = cross_val_score(
    best_model,
    X_full,
    y_full,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
print(f"\nCross-Validation Results (5-fold):")
print(f"Mean Accuracy: {cv_scores.mean():.4f}")
print(f"Standard Deviation: {cv_scores.std():.4f}")
print(f"Individual Fold Scores: {cv_scores}")

Initial selected features: ['last_purchase_month', 'promotion_end_month', 'product_manufacture_month', 'month_of_year', 'product_expiry_date_year', 'product_manufacture_year', 'transaction_year', 'product_expiry_date_month', 'transaction_month', 'high_value_purchase', 'week_of_year', 'promotion_start_month', 'day_of_week', 'purchase_frequency', 'customer_city', 'gender', 'weekend', 'store_state', 'email_subscriptions', 'store_location', 'high_value_quantity', 'customer_support_calls', 'product_review_count', 'days_since_last_purchase', 'online_purchases', 'distance_to_store', 'product_rating', 'total_transactions', 'product_weight', 'total_items_purchased', 'unit_price', 'total_returned_value', 'membership_years', 'discount_applied', 'avg_discount_used', 'product_shelf_life', 'total_returned_items', 'transaction_hour', 'min_single_purchase_value', 'number_of_children', 'product_stock', 'avg_purchase_value', 'avg_items_per_transaction', 'website_visits', 'age', 'max_single_purchase_valu

# Decision Tree - With Balancing - All Features

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline as SKPipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from collections import Counter
import pandas as pd
import numpy as np

# Candidate attributes, grouped by how the preprocessor will treat them.
categorical_candidates = [
    'last_purchase_month', 'promotion_end_month', 'product_manufacture_month',
    'month_of_year', 'product_expiry_date_year', 'product_manufacture_year',
    'transaction_year', 'product_expiry_date_month', 'transaction_month',
    'high_value_purchase', 'week_of_year', 'promotion_start_month', 'day_of_week',
    'purchase_frequency', 'customer_city', 'gender', 'weekend', 'store_state',
    'email_subscriptions', 'store_location', 'high_value_quantity',
]
numeric_candidates = [
    'customer_support_calls', 'product_review_count', 'days_since_last_purchase',
    'online_purchases', 'distance_to_store', 'product_rating', 'total_transactions',
    'product_weight', 'total_items_purchased', 'unit_price', 'total_returned_value',
    'membership_years', 'discount_applied', 'avg_discount_used', 'product_shelf_life',
    'total_returned_items', 'transaction_hour', 'min_single_purchase_value',
    'number_of_children', 'product_stock', 'avg_purchase_value',
    'avg_items_per_transaction', 'website_visits', 'age', 'max_single_purchase_value',
    'avg_spent_per_category', 'total_discounts_received', 'product_return_rate',
    'avg_transaction_value', 'in_store_purchases',
]

# Keep only the candidates that are actually columns of train_data (order preserved:
# categorical attributes first, then numeric ones).
selected_features = [
    col for col in categorical_candidates + numeric_candidates
    if col in train_data.columns
]
print("Initial selected features:", selected_features)

# Stack the train and test frames row-wise; features and target are taken from the result.
data = pd.concat([train_data, test_data], axis=0)
X, y = data[selected_features], data['cluster']

# Split the surviving features into the two groups used by the ColumnTransformer.
categorical_cols = [col for col in selected_features if col in categorical_candidates]
numeric_cols = [col for col in selected_features if col not in categorical_cols]


# Preprocessing pipeline for encoding and scaling.
# This cell imports sklearn's Pipeline under the alias SKPipeline; the bare name
# `Pipeline` is not imported here and would only resolve through state left behind by
# earlier cells, so the alias is used.
preprocessor = ColumnTransformer(transformers=[
    ('num', SKPipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ]), numeric_cols),
    ('cat', SKPipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False))
    ]), categorical_cols)
])

# Hold out the test rows BEFORE fitting anything. Fitting the preprocessor or generating
# SMOTE samples on the full dataset would let information from the test rows reach the
# training set and inflate the reported scores.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Fit encoding/scaling on the training rows only, then apply it to the held-out rows.
# Categories that appear only in the test rows are tolerated via handle_unknown='ignore'.
X_train_transformed = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)
feature_names = preprocessor.get_feature_names_out()

# Balancing target: an equal share of the training rows for every cluster. Clusters above
# the target are randomly under-sampled, clusters below it are over-sampled with SMOTE.
# Deriving the target from the data avoids hard-coded counts that fail whenever a class
# is smaller (under-sampling) or larger (over-sampling) than the requested size.
train_counts = Counter(y_train_raw)
target_per_class = len(y_train_raw) // len(train_counts)
sampling_strategy_under = {
    label: target_per_class for label, count in train_counts.items() if count > target_per_class
}
sampling_strategy_over = {
    label: target_per_class for label, count in train_counts.items() if count < target_per_class
}

# Assemble only the resampling steps that have work to do.
# NOTE(review): SMOTE interpolates the one-hot columns too, producing fractional dummy
# values; consider SMOTENC if that matters for the analysis.
balancing_steps = []
if sampling_strategy_under:
    balancing_steps.append(
        ('undersample', RandomUnderSampler(sampling_strategy=sampling_strategy_under, random_state=42))
    )
if sampling_strategy_over:
    balancing_steps.append(
        ('oversample', SMOTE(sampling_strategy=sampling_strategy_over, random_state=42))
    )

# Apply balancing to the training rows only; the test rows keep their original distribution.
balancing_pipeline = ImbPipeline(balancing_steps) if balancing_steps else None
if balancing_pipeline is not None:
    X_resampled, y_resampled = balancing_pipeline.fit_resample(X_train_transformed, y_train_raw)
else:
    X_resampled, y_resampled = X_train_transformed, y_train_raw

# Check training-set distribution after balancing
print("After Balancing:", Counter(y_resampled))

# The balanced training set is what the model is tuned and fitted on.
X_train, y_train = X_resampled, y_resampled

# Create pipeline with Decision Tree only (no preprocessing step: X_train is already a
# transformed numeric matrix at this point).
pipeline = SKPipeline([
    ('classifier', DecisionTreeClassifier(random_state=42))
])

# Hyperparameter tuning using GridSearchCV: every combination below is scored with
# 5-fold cross-validated accuracy on the training data only.
param_grid = {
    'classifier__max_depth': [None, 10, 15, 20, 30],
    'classifier__min_samples_split': [2, 5, 10, 15],
    'classifier__min_samples_leaf': [1, 2, 4, 8],
    'classifier__criterion': ['gini']
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best hyperparameters
best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")

# GridSearchCV is used with its default refit=True, so best_estimator_ has already been
# re-trained on all of (X_train, y_train) with the winning parameters. Fitting it a second
# time would only repeat that work, so the estimator is used as returned.
best_model = grid_search.best_estimator_

# Predictions on test set
y_pred = best_model.predict(X_test)

# Cluster labels seen during training, in sorted order; this order defines the row/column
# order of every per-class report below.
unique_clusters = sorted(np.unique(y_train))
target_names = [f'Cluster {i}' for i in unique_clusters]

# Classification Report and Metrics.
# `labels` is passed explicitly so that target_names[k] always describes unique_clusters[k],
# even when a cluster is absent from both y_test and y_pred.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=unique_clusters, target_names=target_names))

# Confusion Matrix (rows = true cluster, columns = predicted cluster).
# Pinning `labels` guarantees that index k of the matrix is unique_clusters[k], which is
# the positional index calculate_specificity relies on.
cm = confusion_matrix(y_test, y_pred, labels=unique_clusters)
print("\nConfusion Matrix:")
print(cm)

# Accuracy, plus support-weighted Recall and Precision
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')

# Calculate Specificity for each cluster
def calculate_specificity(class_idx, confusion=None):
    """Return the one-vs-rest specificity TN / (TN + FP) of one class.

    Parameters
    ----------
    class_idx : int
        Positional index of the class in the confusion matrix (row and column).
    confusion : array-like of shape (n_classes, n_classes), optional
        Confusion matrix with true classes on rows and predicted classes on columns.
        When omitted, the module-level matrix ``cm`` is used, which keeps existing
        one-argument calls working.

    Returns
    -------
    float
        Specificity of the class, or 0.0 when there are no negative samples
        (TN + FP == 0).
    """
    matrix = np.asarray(cm if confusion is None else confusion)

    hits = matrix[class_idx, class_idx]
    predicted_as_class = matrix[:, class_idx].sum()
    actually_class = matrix[class_idx, :].sum()

    # True negatives: everything that is neither in the class row nor the class column
    # (the diagonal cell is subtracted twice above, so it is added back once).
    tn = matrix.sum() - predicted_as_class - actually_class + hits
    # False positives: predicted as this class but belonging to another one.
    fp = predicted_as_class - hits

    negatives = tn + fp
    return tn / negatives if negatives > 0 else 0.0

# Specificity per cluster, keyed by the same display names used in the report.
specificity_scores = dict(
    (f'Cluster {label}', calculate_specificity(position))
    for position, label in enumerate(unique_clusters)
)

# Headline metrics on the held-out test set.
print(f"\nAccuracy: {accuracy:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")
for cluster_label in specificity_scores:
    print(f"Specificity for {cluster_label}: {specificity_scores[cluster_label]:.4f}")

print("-----------------------------------------------")

# --- Cross-Validation on Full Dataset ---
# Stack the train and test arrays (features and target) back into one dataset.
X_full = np.vstack([X_train, X_test])
y_full = np.concatenate([y_train, y_test])

# 5-fold cross-validated accuracy of the tuned model over the stacked dataset.
cv_scores = cross_val_score(
    best_model,
    X_full,
    y_full,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
print(f"\nCross-Validation Results (5-fold):")
print(f"Mean Accuracy: {cv_scores.mean():.4f}")
print(f"Standard Deviation: {cv_scores.std():.4f}")
print(f"Individual Fold Scores: {cv_scores}")

Initial selected features: ['last_purchase_month', 'promotion_end_month', 'product_manufacture_month', 'month_of_year', 'product_expiry_date_year', 'product_manufacture_year', 'transaction_year', 'product_expiry_date_month', 'transaction_month', 'high_value_purchase', 'week_of_year', 'promotion_start_month', 'day_of_week', 'purchase_frequency', 'customer_city', 'gender', 'weekend', 'store_state', 'email_subscriptions', 'store_location', 'high_value_quantity', 'customer_support_calls', 'product_review_count', 'days_since_last_purchase', 'online_purchases', 'distance_to_store', 'product_rating', 'total_transactions', 'product_weight', 'total_items_purchased', 'unit_price', 'total_returned_value', 'membership_years', 'discount_applied', 'avg_discount_used', 'product_shelf_life', 'total_returned_items', 'transaction_hour', 'min_single_purchase_value', 'number_of_children', 'product_stock', 'avg_purchase_value', 'avg_items_per_transaction', 'website_visits', 'age', 'max_single_purchase_valu

# Decision Tree - With Balancing - Selected Features

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFECV
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline as SKPipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from collections import Counter
import pandas as pd
import numpy as np

# Fail fast if the frames loaded by the earlier cells are missing from the kernel
required_frames = ('train_data', 'test_data')
if any(frame_name not in globals() for frame_name in required_frames):
    raise ValueError("train_data or test_data not found! Ensure previous steps are executed.")

# Attributes treated as categorical (routed to the 'cat' branch of the preprocessor)
categorical_candidates = [
    'last_purchase_month', 'promotion_end_month', 'product_manufacture_month',
    'month_of_year', 'product_expiry_date_year', 'product_manufacture_year',
    'transaction_year', 'product_expiry_date_month', 'transaction_month',
    'high_value_purchase', 'week_of_year', 'promotion_start_month', 'day_of_week',
    'purchase_frequency', 'customer_city', 'gender', 'weekend', 'store_state',
    'email_subscriptions', 'store_location', 'high_value_quantity',
]

# Attributes treated as numeric (routed to the 'num' branch of the preprocessor)
numeric_candidates = [
    'customer_support_calls', 'product_review_count', 'days_since_last_purchase',
    'online_purchases', 'distance_to_store', 'product_rating', 'total_transactions',
    'product_weight', 'total_items_purchased', 'unit_price', 'total_returned_value',
    'membership_years', 'discount_applied', 'avg_discount_used', 'product_shelf_life',
    'total_returned_items', 'transaction_hour', 'min_single_purchase_value',
    'number_of_children', 'product_stock', 'avg_purchase_value',
    'avg_items_per_transaction', 'website_visits', 'age', 'max_single_purchase_value',
    'avg_spent_per_category', 'total_discounts_received', 'product_return_rate',
    'avg_transaction_value', 'in_store_purchases',
]

# Keep only the candidates that are actual columns of train_data (categorical ones first)
selected_features = [
    name for name in categorical_candidates + numeric_candidates
    if name in train_data.columns
]
print("Initial selected features:", selected_features)

# Stack train_data on top of test_data; balancing is applied to this pooled frame
# NOTE(review): the pooled rows are resampled and re-split further down, so the original
# test rows do not stay an untouched hold-out set - confirm this is intended.
data = pd.concat([train_data, test_data], axis=0)
X = data[selected_features]
y = data['cluster']

# Partition the surviving features by type, keeping their order in selected_features
categorical_cols = [name for name in selected_features if name in categorical_candidates]
numeric_cols = [name for name in selected_features if name not in categorical_cols]


# Preprocessing pipeline for encoding and scaling:
#   numeric columns     -> mean imputation, then standardization
#   categorical columns -> most-frequent imputation, then one-hot encoding
# SKPipeline is the sklearn Pipeline alias imported at the top of this cell; using it
# (instead of a bare `Pipeline`) keeps the cell runnable on a fresh kernel.
preprocessor = ColumnTransformer(transformers=[
    ('num', SKPipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ]), numeric_cols),
    ('cat', SKPipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False))
    ]), categorical_cols)
])

# Preprocess the data before balancing to avoid string-to-float issues
# NOTE(review): the imputers/scaler/encoder are fitted on every row of X, i.e. before any
# train/test split is made - confirm that this is acceptable for the evaluation below.
X_transformed = preprocessor.fit_transform(X)
# Output column names of the fitted transformer; the code below relies on the
# 'num__' / 'cat__' prefixes to tell the two branches apart
feature_names = preprocessor.get_feature_names_out()

# Balancing targets: 10,000 samples for each of the three clusters
# (cluster 1 is handled by the undersampler, clusters 0 and 2 by SMOTE)
# NOTE(review): assumes cluster 1 has at least 10,000 rows and clusters 0 and 2 at most
# 10,000 rows in the pooled data - verify against the actual class counts.
sampling_strategy_under = {1: 10000}
sampling_strategy_over = {0: 10000, 2: 10000}

# Build the two samplers, then chain them: random undersampling first, SMOTE second
under_sampler = RandomUnderSampler(sampling_strategy=sampling_strategy_under, random_state=42)
smote_sampler = SMOTE(sampling_strategy=sampling_strategy_over, random_state=42)
balancing_pipeline = ImbPipeline(steps=[
    ('undersample', under_sampler),
    ('oversample', smote_sampler),
])

# Resample the preprocessed feature matrix together with its labels
X_resampled, y_resampled = balancing_pipeline.fit_resample(X_transformed, y)

# Report the per-cluster counts after balancing
print("After Balancing:", Counter(y_resampled))

# Hold out 30% of the balanced data for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.3,
    random_state=42,
)

# Recursive feature elimination with 3-fold CV, dropping one feature per step and
# ranking features with a plain decision tree (the data is already preprocessed)
rfe_base_tree = DecisionTreeClassifier(random_state=42)
rfecv = RFECV(
    estimator=rfe_base_tree,
    step=1,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
).fit(X_train, y_train)

# Boolean mask over the transformed columns, and the names of the columns that were kept
selected_mask = rfecv.support_
selected_transformed_features = feature_names[selected_mask].tolist()
print(f"Number of Features Selected by RFECV: {len(selected_transformed_features)}")
print(f"Selected Transformed Features: {selected_transformed_features}")

# Map transformed features back to original features.
# Transformed names carry a 'num__' or 'cat__' prefix; one-hot columns additionally end in
# '_<category>'. The original column names contain underscores themselves
# (e.g. 'last_purchase_month'), so the category suffix cannot be removed by splitting on
# '_' - that would yield 'last' and the column would never be recognised. Instead, match
# each one-hot column against the known categorical column names.
original_selected_features = []
for transformed_feature in selected_transformed_features:
    if transformed_feature.startswith('num__'):
        # Strip only the leading prefix, not every occurrence of the substring
        original_feature = transformed_feature[len('num__'):]
    elif transformed_feature.startswith('cat__'):
        encoded_name = transformed_feature[len('cat__'):]
        # Longest match wins, so a column whose name is a prefix of another column's
        # name cannot claim the other column's one-hot outputs
        matching_cols = [col for col in categorical_cols if encoded_name.startswith(f'{col}_')]
        original_feature = max(matching_cols, key=len) if matching_cols else encoded_name
    else:
        original_feature = transformed_feature
    # Record each original feature once, in first-seen order
    if original_feature in selected_features and original_feature not in original_selected_features:
        original_selected_features.append(original_feature)

print(f"Selected Original Features: {original_selected_features}")

# X_train and X_test are already in transformed space, so the RFECV-selected columns are
# picked by position: collect the index of every transformed column whose name was kept
selected_name_set = set(selected_transformed_features)
selected_indices = [
    position
    for position, feature_name in enumerate(feature_names)
    if feature_name in selected_name_set
]
X_train_rfe = X_train[:, selected_indices]
X_test_rfe = X_test[:, selected_indices]

# Create pipeline with Decision Tree (no preprocessing needed since data is pre-transformed).
# SKPipeline is the sklearn Pipeline alias imported at the top of this cell; using it
# (instead of a bare `Pipeline`) keeps the cell runnable on a fresh kernel.
pipeline_rfe = SKPipeline([
    ('classifier', DecisionTreeClassifier(random_state=42))
])

# Hyperparameter tuning using GridSearchCV (5-fold CV, accuracy as the selection metric)
param_grid = {
    'classifier__max_depth': [None, 10, 15, 20, 30],
    'classifier__min_samples_split': [2, 5, 10, 15],
    'classifier__min_samples_leaf': [1, 2, 4, 8],
    'classifier__criterion': ['gini']
}

grid_search = GridSearchCV(pipeline_rfe, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_rfe, y_train)

# Best hyperparameters
best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")

# Decision Tree with the best parameters. GridSearchCV refits the winning pipeline on the
# whole of X_train_rfe by default (refit=True), so best_estimator_ is already trained and
# no additional fit call is needed.
best_model = grid_search.best_estimator_

# Predictions on test set
y_pred = best_model.predict(X_test_rfe)

# Cluster labels seen during training, in ascending order, and their display names
unique_clusters = sorted(np.unique(y_train))
target_names = [f'Cluster {i}' for i in unique_clusters]

# Classification Report and Metrics.
# `labels` is passed explicitly so the report rows always line up one-to-one with
# target_names, even if some cluster is absent from y_test / y_pred.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=unique_clusters, target_names=target_names))

# Confusion Matrix (rows = true cluster, columns = predicted cluster).
# Fixing `labels` guarantees that row/column k corresponds to unique_clusters[k], which is
# the positional index the specificity calculation relies on.
cm = confusion_matrix(y_test, y_pred, labels=unique_clusters)
print("\nConfusion Matrix:")
print(cm)

# Accuracy, Recall, Precision (recall/precision are support-weighted averages over clusters)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')

# Calculate Specificity for each cluster
def calculate_specificity(class_idx, conf_matrix=None):
    """Return the one-vs-rest specificity TN / (TN + FP) for one class.

    Parameters
    ----------
    class_idx : int
        Row/column position of the class in the confusion matrix.
    conf_matrix : array-like of shape (n_classes, n_classes), optional
        Confusion matrix with true classes on the rows and predicted classes on the
        columns. When omitted, the module-level ``cm`` is used, which keeps existing
        ``calculate_specificity(idx)`` calls working unchanged.

    Returns
    -------
    float or int
        The specificity, or 0 when the class has no true negatives and no false
        positives (the ratio would otherwise be undefined).
    """
    matrix = np.asarray(cm if conf_matrix is None else conf_matrix)
    true_positives = matrix[class_idx, class_idx]
    predicted_as_class = matrix[:, class_idx].sum()
    actually_class = matrix[class_idx, :].sum()
    # Everything that is neither in the class's row nor in its column; the diagonal cell
    # is subtracted twice above, so it is added back once
    tn = matrix.sum() - predicted_as_class - actually_class + true_positives
    fp = predicted_as_class - true_positives
    return tn / (tn + fp) if (tn + fp) > 0 else 0

# Specificity per cluster, keyed by display name; idx is the cluster's position in
# unique_clusters and is what calculate_specificity receives
specificity_scores = dict(
    (f'Cluster {i}', calculate_specificity(idx))
    for idx, i in enumerate(unique_clusters)
)

# Headline metrics first ...
print(f"\nAccuracy: {accuracy:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")
# ... then one specificity line per cluster
for cluster, spec in specificity_scores.items():
    print(f"Specificity for {cluster}: {spec:.4f}")

print("-----------------------------------------------")

# --- Cross-Validation on Full Dataset ---
# Re-join the train and test partitions (selected columns only); rows and labels are
# concatenated in the same order so they stay aligned
X_full = np.concatenate([X_train_rfe, X_test_rfe], axis=0)
y_full = np.concatenate([y_train, y_test])

# 5-fold cross-validated accuracy of the tuned model
# NOTE(review): X_full is built from the balanced (resampled) partitions, so these scores
# describe the balanced data rather than the raw class distribution.
cv_scores = cross_val_score(
    best_model,
    X_full,
    y_full,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
print("\nCross-Validation Results (5-fold):")
print(f"Mean Accuracy: {cv_scores.mean():.4f}")
print(f"Standard Deviation: {cv_scores.std():.4f}")
print(f"Individual Fold Scores: {cv_scores}")

Initial selected features: ['last_purchase_month', 'promotion_end_month', 'product_manufacture_month', 'month_of_year', 'product_expiry_date_year', 'product_manufacture_year', 'transaction_year', 'product_expiry_date_month', 'transaction_month', 'high_value_purchase', 'week_of_year', 'promotion_start_month', 'day_of_week', 'purchase_frequency', 'customer_city', 'gender', 'weekend', 'store_state', 'email_subscriptions', 'store_location', 'high_value_quantity', 'customer_support_calls', 'product_review_count', 'days_since_last_purchase', 'online_purchases', 'distance_to_store', 'product_rating', 'total_transactions', 'product_weight', 'total_items_purchased', 'unit_price', 'total_returned_value', 'membership_years', 'discount_applied', 'avg_discount_used', 'product_shelf_life', 'total_returned_items', 'transaction_hour', 'min_single_purchase_value', 'number_of_children', 'product_stock', 'avg_purchase_value', 'avg_items_per_transaction', 'website_visits', 'age', 'max_single_purchase_valu