# Decision Tree Classifier for Fashion Recommendations

This notebook builds a Decision Tree classifier, and a Random Forest for comparison, to predict target groups (recommendation categories) from user characteristics and fashion preferences.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings

# Notebook-wide setup: silence every warning category and use seaborn's grid theme
warnings.simplefilter('ignore')
sns.set_style(style='whitegrid')

## Load and Prepare Data

In [None]:
# Read the integrated survey CSV (path is relative to the notebook's folder)
survey_csv_path = '../data/raw/fashion_survey_integrated.csv'
df = pd.read_csv(survey_csv_path)

# Report (rows, columns), then preview the first rows as the cell output
print(f"Original dataset shape: {df.shape}")
df.head()

In [None]:
# Build the feature matrix and the target vector.
# Columns whose name contains 'sock_' are left out of the features
# (they're used to determine target_group), as is the target column itself.
sock_cols = list(filter(lambda name: 'sock_' in name, df.columns))
excluded_cols = set(sock_cols) | {'target_group'}
feature_cols = [name for name in df.columns if name not in excluded_cols]

X = df.loc[:, feature_cols]
y = df.loc[:, 'target_group']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeatures used: {feature_cols}")

## Data Preprocessing

In [None]:
# Normalize features using Min-Max scaling.
#
# The scaler learns each feature's min/max from the TRAINING rows only, so the
# held-out test rows do not leak into preprocessing (or into the scaler that is
# saved for deployment). The training rows are located with the same split
# settings (test_size=0.2, random_state=42, stratify=y) that the train/test
# split of X_normalized uses, so both partitions select identical rows.
# NOTE: keep these settings in sync with that split if either is changed.
row_positions = np.arange(len(X))
train_positions, _ = train_test_split(
    row_positions,
    test_size=0.2,
    random_state=42,
    stratify=y
)

scaler = MinMaxScaler()
scaler.fit(X.iloc[train_positions])

# Transform every row with the training-derived ranges; rows outside the
# training min/max (possible for test rows) can land slightly outside [0, 1].
X_normalized = pd.DataFrame(
    scaler.transform(X),
    columns=X.columns,
    index=X.index
)

print("Normalized features:")
X_normalized.head()

In [None]:
# Hold out 20% of the rows for testing (80-20 split); stratifying on y keeps
# the class distribution the same in both parts, and the seed fixes the shuffle.
split_parts = train_test_split(X_normalized, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# Per-class counts, ordered by class label
train_class_counts = y_train.value_counts().sort_index()
test_class_counts = y_test.value_counts().sort_index()

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"\nTarget distribution in training set:")
print(train_class_counts)
print(f"\nTarget distribution in test set:")
print(test_class_counts)

## Train Decision Tree Model

In [None]:
# Fit a seeded decision tree on the training split (fit returns the estimator)
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predicted labels for the training and test rows, in that order
y_train_pred, y_test_pred = (dt_model.predict(rows) for rows in (X_train, X_test))

# Fraction of correctly predicted labels on each split
test_accuracy = accuracy_score(y_test, y_test_pred)
train_accuracy = accuracy_score(y_train, y_train_pred)

print(f"Decision Tree Results:")
print(f"  Training Accuracy: {train_accuracy:.4f}")
print(f"  Test Accuracy: {test_accuracy:.4f}")

## Train Random Forest Model (for comparison)

In [None]:
# Fit a seeded 100-tree random forest on the same training split
forest_settings = {'n_estimators': 100, 'random_state': 42}
rf_model = RandomForestClassifier(**forest_settings)
rf_model.fit(X_train, y_train)

# Predicted labels for the training and test rows, in that order
y_train_pred_rf, y_test_pred_rf = map(rf_model.predict, (X_train, X_test))

# Fraction of correctly predicted labels on each split
test_accuracy_rf = accuracy_score(y_test, y_test_pred_rf)
train_accuracy_rf = accuracy_score(y_train, y_train_pred_rf)

print(f"Random Forest Results:")
print(f"  Training Accuracy: {train_accuracy_rf:.4f}")
print(f"  Test Accuracy: {test_accuracy_rf:.4f}")

## Model Evaluation: Confusion Matrix

In [None]:
# Plot confusion matrices side by side.
#
# One explicit, sorted label set is used for both models so that
#   * both matrices have the same shape and row/column order, and
#   * the heatmap ticks show the actual target_group values instead of
#     0-based matrix positions.
class_labels = np.unique(y)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

cm_dt = confusion_matrix(y_test, y_test_pred, labels=class_labels)
cm_rf = confusion_matrix(y_test, y_test_pred_rf, labels=class_labels)

# (axes, matrix, colour map, panel title, test accuracy) for each model
panels = [
    (axes[0], cm_dt, 'Blues', 'Decision Tree', test_accuracy),
    (axes[1], cm_rf, 'Greens', 'Random Forest', test_accuracy_rf),
]
for ax, matrix, cmap, model_name, accuracy in panels:
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap=cmap,
        xticklabels=class_labels,
        yticklabels=class_labels,
        ax=ax
    )
    ax.set_title(f'{model_name}\nAccuracy: {accuracy:.3f}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

plt.tight_layout()
plt.show()

## Classification Reports

In [None]:
# Derive the class labels and their display names from the data instead of
# assuming exactly four groups numbered 1-4. Passing `labels` explicitly keeps
# the names aligned with the classes even if one is missing from y_test.
class_labels = np.unique(y)
group_names = [f'Group {label}' for label in class_labels]

print("Decision Tree Classification Report:")
print(classification_report(y_test, y_test_pred, labels=class_labels, target_names=group_names))

print("\n" + "="*60)
print("Random Forest Classification Report:")
print(classification_report(y_test, y_test_pred_rf, labels=class_labels, target_names=group_names))

## Feature Importance Analysis

In [None]:
# Pair each training column with the fitted tree's importance score,
# then order the table from most to least important
importance_table = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': dt_model.feature_importances_
})
feature_importance = importance_table.sort_values('Importance', ascending=False)

# Horizontal bar chart of the 15 highest-ranked features
top_features = feature_importance.head(15)
bar_positions = range(len(top_features))

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(bar_positions, top_features['Importance'])
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['Feature'])
ax.set_xlabel('Importance')
ax.set_title('Top 15 Feature Importances (Decision Tree)')
ax.invert_yaxis()  # put the most important feature at the top
plt.tight_layout()
plt.show()

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10).to_string(index=False))

## Visualize Decision Tree

In [None]:
# Visualize the tree (limited depth for clarity)
#
# Class names come from the fitted model's own `classes_` (sorted order), so
# the labels stay correct whatever the number or values of the target groups.
# Feature names are passed as a plain list rather than a pandas Index, which
# is the safer input form for plot_tree across scikit-learn versions.
display_depth = 3  # Show only top 3 levels

plt.figure(figsize=(20, 10))
plot_tree(
    dt_model,
    max_depth=display_depth,
    feature_names=list(X_train.columns),
    class_names=[f'Group {label}' for label in dt_model.classes_],
    filled=True,
    rounded=True,
    fontsize=10
)
plt.title(f'Decision Tree Visualization (Depth = {display_depth})', fontsize=16, pad=20)
plt.tight_layout()
plt.show()

## Save Models

In [None]:
import joblib
from pathlib import Path

# Make sure the output folder exists (no error if it is already there)
models_dir = Path('../models')
models_dir.mkdir(exist_ok=True)

# Persist both classifiers and the fitted scaler, one file per object
artifacts = {
    'decision_tree_model.pkl': dt_model,
    'random_forest_model.pkl': rf_model,
    'feature_scaler.pkl': scaler,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, models_dir / filename)

print("✓ Models saved successfully!")
print(f"  - Decision Tree: {models_dir / 'decision_tree_model.pkl'}")
print(f"  - Random Forest: {models_dir / 'random_forest_model.pkl'}")
print(f"  - Scaler: {models_dir / 'feature_scaler.pkl'}")

## Summary

### Model Performance

We trained two models to predict fashion recommendation groups:

1. **Decision Tree Classifier**: Good interpretability with decent accuracy
2. **Random Forest Classifier**: Ensemble method with improved generalization

### Key Findings

- Feature importance analysis reveals which characteristics most influence recommendations
- Fashion involvement and MBTI traits play significant roles
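- The saved models can be used for real-time recommendation in a web application, provided incoming data uses the same feature columns and is transformed with the saved scaler before prediction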

### Next Steps

1. Fine-tune hyperparameters for better performance
2. Try other algorithms (SVM, XGBoost, Neural Networks)
3. Deploy the best model in a production environment