# E-commerce Classification: Baseline Selection & Tuning

This notebook uses a **two-stage** model-selection approach (steps 1–2), followed by an ensembling step (step 3):
1. **Baseline Evaluation**: Train all candidate models with default parameters to identify the most promising architectures.
2. **Hyperparameter Tuning**: Use Optuna to optimize ONLY the **Top 5** performing baseline models.
3. **Ensemble**: Build Stacking and Voting ensembles from the tuned top models.

## Candidate Models
- Logistic Regression, Random Forest, Extra Trees, XGBoost, LightGBM, CatBoost, SVM, KNN

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import warnings

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

warnings.filterwarnings('ignore')

## 1. Data Loading & Preprocessing

In [None]:
# Read the raw training and test splits from the local data directory.
train_df = pd.read_csv(filepath_or_buffer='data/train_df.csv')
test_df = pd.read_csv(filepath_or_buffer='data/test_df.csv')

# Summarise the training frame's columns, dtypes and non-null counts.
train_df.info()

In [None]:
def apply_feature_engineering(df, reference_df=None):
    """Return a copy of ``df`` with the engineered shipment features added.

    The input frame is never modified. Group-level statistics (warehouse
    share, mean weight per shipment mode, and the warehouse/shipment combo
    vocabulary) are computed from ``reference_df`` when it is given, and from
    ``df`` itself otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Discount_offered``, ``Weight_in_gms``,
        ``Cost_of_the_Product``, ``Product_importance``, ``Prior_purchases``,
        ``Warehouse_block``, ``Customer_care_calls``, ``Customer_rating`` and
        ``Mode_of_Shipment``.
    reference_df : pandas.DataFrame, optional
        Un-engineered frame providing ``Warehouse_block``,
        ``Mode_of_Shipment`` and ``Weight_in_gms`` for the group statistics.
        Pass the training frame when transforming a test frame so both share
        the same encodings.

    Returns
    -------
    pandas.DataFrame
        A new frame with the added columns ``Discount_Group``,
        ``Weight_Category``, ``log_rel_price``, ``Warehouse_Load``,
        ``Care_Pressure``, ``Priority_Score``, ``Expectation_Gap``,
        ``Shipping_Burden`` and ``Wh_Shipment_Combo``. ``Product_importance``
        and ``Prior_purchases`` are overwritten with their encoded / clipped
        versions.

    Raises
    ------
    KeyError
        If a required column is missing.
    ValueError
        If ``Weight_in_gms`` has values that fall outside the weight bins
        (negative or missing), because the category cannot be cast to int.
    """
    out = df.copy()  # work on a copy so the caller's frame is left untouched
    ref = out if reference_df is None else reference_df
    weight = out['Weight_in_gms']

    # 1. Discount_Group: flag discounts above 10.
    out['Discount_Group'] = (out['Discount_offered'] > 10).astype(int)

    # 2. Weight_Category: ordinal bucket (light: 0, medium: 1, heavy: 2).
    #    include_lowest=True puts a weight of exactly 0 in the first bucket
    #    instead of producing NaN.
    weight_bins = [0, 2000, 4000, float('inf')]
    out['Weight_Category'] = pd.cut(
        weight, bins=weight_bins, labels=False, include_lowest=True
    ).astype(int)

    # 3. log_rel_price: log of price per gram. Non-positive weights are
    #    replaced by 1 g so the ratio stays finite; positive weights are
    #    used unchanged.
    safe_weight = weight.where(weight > 0, 1)
    out['log_rel_price'] = np.log(out['Cost_of_the_Product'] / safe_weight)

    # 4. Product_importance: manual ordinal mapping (the levels are ordered).
    importance_map = {'low': 0, 'medium': 1, 'high': 2}
    out['Product_importance'] = out['Product_importance'].map(importance_map)

    # 5. Prior_purchases: collapse 6 and above into a single level.
    out['Prior_purchases'] = out['Prior_purchases'].clip(upper=6)

    # Warehouse load: share of shipments handled by each warehouse block.
    # A share (rather than a raw row count) does not depend on how many rows
    # the frame has, so train and test values are on the same scale.
    # Blocks absent from the reference get a share of 0.
    block_share = ref['Warehouse_block'].value_counts(normalize=True)
    out['Warehouse_Load'] = out['Warehouse_block'].map(block_share).fillna(0.0)

    # Care pressure: customer-care calls relative to (clipped) prior purchases.
    out['Care_Pressure'] = out['Customer_care_calls'] / (out['Prior_purchases'] + 1)

    # Customer priority index.
    out['Priority_Score'] = out['Product_importance'] / (out['Customer_rating'] + 1)

    # Expected delivery sensitivity.
    out['Expectation_Gap'] = out['Cost_of_the_Product'] * out['Discount_offered']

    # Shipping burden: weight relative to the mean weight of its shipment mode.
    mode_mean_weight = ref.groupby('Mode_of_Shipment')['Weight_in_gms'].mean()
    out['Shipping_Burden'] = weight / out['Mode_of_Shipment'].map(mode_mean_weight)

    # Warehouse x shipment-mode combination (e.g. 'A_Flight', 'F_Ship'),
    # encoded as 1..K in sorted order of the reference vocabulary.
    # Combinations not present in the reference are encoded as 0.
    combo = out['Warehouse_block'].astype(str) + '_' + out['Mode_of_Shipment'].astype(str)
    ref_combo = ref['Warehouse_block'].astype(str) + '_' + ref['Mode_of_Shipment'].astype(str)
    vocabulary = sorted(ref_combo.unique())
    codes = pd.Categorical(combo, categories=vocabulary).codes
    out['Wh_Shipment_Combo'] = codes.astype('int64') + 1

    return out

# Run the same feature-engineering step on the training frame, then the test frame.
train_df, test_df = map(apply_feature_engineering, (train_df, test_df))

In [None]:
# Keep the test identifiers before 'ID' is dropped: the final-prediction cell
# looks for `test_ids` to label its output rows.
test_ids = test_df['ID'].copy()

# Columns excluded from the model inputs.
drop_columns = ["ID", "Gender", "Discount_offered", "Weight_in_gms", 'Warehouse_block']
train_df = train_df.drop(columns=drop_columns)
test_df = test_df.drop(columns=drop_columns)

In [None]:
# 1. One-hot encode Mode_of_Shipment. All dummy levels are kept (drop_first is not used).
train_df = pd.get_dummies(train_df, columns=['Mode_of_Shipment'])
test_df = pd.get_dummies(test_df, columns=['Mode_of_Shipment'])

# get_dummies only creates columns for the levels present in each frame, so make
# the test frame carry exactly the shipment dummies that exist in training:
# levels never seen in training are dropped, levels missing from test are added as zeros.
shipment_prefix = 'Mode_of_Shipment_'
train_dummies = [col for col in train_df.columns if col.startswith(shipment_prefix)]
test_only_dummies = [
    col for col in test_df.columns
    if col.startswith(shipment_prefix) and col not in train_dummies
]
test_df = test_df.drop(columns=test_only_dummies)
for col in train_dummies:
    if col not in test_df.columns:
        test_df[col] = np.zeros(len(test_df), dtype=train_df[col].dtype)

# 2. Columns to scale: every numeric column of the training frame except the target.
scale_cols = train_df.select_dtypes(include=['number']).columns.drop('Reached.on.Time_Y.N').tolist()

scaler = StandardScaler()

# Important: the scaler is fitted on train_df only.
train_df[scale_cols] = scaler.fit_transform(train_df[scale_cols])

# The test frame is only transformed, so no test statistics leak into the scaler.
test_df[scale_cols] = scaler.transform(test_df[scale_cols])

In [None]:
# Split train_df into training and validation sets (X_train, X_val, y_train, y_val).

target = 'Reached.on.Time_Y.N'
X = train_df.drop(columns=[target])
y = train_df[target]
# Stratify on the target so both splits keep the same class balance.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

## 2. Stage 1: Baseline Evaluation
Running all models with default parameters to select the Top 5.

In [None]:
# Candidate estimators; only seeds, verbosity, iteration caps and parallelism are set explicitly.
baseline_models = dict(
    LogisticRegression=LogisticRegression(max_iter=1000, random_state=42),
    RandomForest=RandomForestClassifier(n_jobs=-1, random_state=42),
    ExtraTrees=ExtraTreesClassifier(n_jobs=-1, random_state=42),
    XGBoost=XGBClassifier(n_jobs=-1, eval_metric='logloss', use_label_encoder=False, random_state=42),
    LightGBM=LGBMClassifier(n_jobs=-1, verbose=-1, random_state=42),
    CatBoost=CatBoostClassifier(allow_writing_files=False, verbose=0, random_state=42),
    SVM=SVC(probability=True, random_state=42),  # default RBF kernel
    KNN=KNeighborsClassifier(n_jobs=-1),
)

baseline_results = []

print("--- Running Baseline Evaluation ---")
for name in baseline_models:
    model = baseline_models[name]
    model.fit(X_train, y_train)

    # Score each fitted model on the held-out validation split.
    probs = model.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, probs)
    acc = accuracy_score(y_val, model.predict(X_val))

    row = {'Model': name, 'ROC-AUC': auc, 'Accuracy': acc}
    baseline_results.append(row)
    print(f"{name}: AUC={auc:.4f}, Acc={acc:.4f}")

# Rank by validation ROC-AUC and keep the five best model names for tuning.
baseline_df = pd.DataFrame(baseline_results).sort_values(by='ROC-AUC', ascending=False)
top_5_names = baseline_df['Model'].iloc[:5].tolist()
print(f"\nSelected Top 5 for Tuning: {top_5_names}")

## 3. Stage 2: Optuna Optimization (Top 5)
Tuning only the selected high-potential models.

In [None]:
# Containers filled by the tuning loop: fitted best models and their validation metrics.
tuned_models = {}
final_results = []

def optimize_model(trial, model_name):
    """Optuna objective: sample hyperparameters for ``model_name`` and score them.

    The estimator is built through ``get_model_instance`` so the fixed
    constructor arguments live in one place. It is fitted on the notebook-level
    ``X_train`` / ``y_train`` and scored on ``X_val`` / ``y_val``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    model_name : str
        One of 'LogisticRegression', 'RandomForest', 'ExtraTrees', 'XGBoost',
        'LightGBM', 'CatBoost', 'SVM' or 'KNN'.

    Returns
    -------
    float
        ROC-AUC of the fitted model on the validation split.

    Raises
    ------
    ValueError
        If ``model_name`` has no search space defined.
    """
    # One lazily-evaluated search space per model; only the selected one
    # registers parameters on the trial.
    search_spaces = {
        'LogisticRegression': lambda: {
            'C': trial.suggest_float('C', 1e-4, 100, log=True),
            'solver': trial.suggest_categorical('solver', ['liblinear', 'lbfgs']),
        },
        'RandomForest': lambda: {
            'n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'max_depth': trial.suggest_int('max_depth', 3, 20),
        },
        'ExtraTrees': lambda: {
            'n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'max_depth': trial.suggest_int('max_depth', 3, 20),
        },
        'XGBoost': lambda: {
            'n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
        },
        'LightGBM': lambda: {
            'n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        },
        'CatBoost': lambda: {
            'iterations': trial.suggest_int('iterations', 50, 300),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'depth': trial.suggest_int('depth', 3, 10),
        },
        'SVM': lambda: {
            'C': trial.suggest_float('C', 0.1, 100, log=True),
            'gamma': trial.suggest_categorical('gamma', ['scale', 'auto']),
        },
        'KNN': lambda: {
            'n_neighbors': trial.suggest_int('n_neighbors', 3, 30),
            'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
        },
    }

    # Fail with a clear message instead of an UnboundLocalError further down.
    if model_name not in search_spaces:
        raise ValueError(f"No search space defined for model: {model_name!r}")

    params = search_spaces[model_name]()
    model = get_model_instance(model_name, params)

    model.fit(X_train, y_train)
    return roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])

print("\n--- Starting Optuna Tuning for Top 5 ---")

def get_model_instance(name, params):
    """Rebuild an unfitted estimator for ``name`` from a hyperparameter dict.

    ``params`` is unpacked into the constructor alongside the notebook's fixed
    settings (seed, verbosity, parallelism). Returns ``None`` when ``name`` is
    not one of the known model names.
    """
    # Each factory is a lambda so only the requested estimator is constructed.
    factories = {
        'LogisticRegression': lambda: LogisticRegression(**params, random_state=42, max_iter=1000),
        'RandomForest': lambda: RandomForestClassifier(**params, random_state=42, n_jobs=-1),
        'ExtraTrees': lambda: ExtraTreesClassifier(**params, random_state=42, n_jobs=-1),
        'XGBoost': lambda: XGBClassifier(**params, random_state=42, use_label_encoder=False, eval_metric='logloss', n_jobs=-1),
        'LightGBM': lambda: LGBMClassifier(**params, random_state=42, verbose=-1, n_jobs=-1),
        'CatBoost': lambda: CatBoostClassifier(**params, random_state=42, verbose=0, allow_writing_files=False),
        'SVM': lambda: SVC(**params, random_state=42, probability=True),
        'KNN': lambda: KNeighborsClassifier(**params, n_jobs=-1),
    }
    factory = factories.get(name)
    if factory is None:
        return None
    return factory()

# Number of Optuna trials per model; kept very small for speed.
N_TUNING_TRIALS = 2

for name in top_5_names:
    # Seed the sampler so the sampled hyperparameters are reproducible across runs.
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(
        lambda trial, model_name=name: optimize_model(trial, model_name),
        n_trials=N_TUNING_TRIALS,
    )

    # Refit the best configuration and keep it for the ensemble / prediction cells.
    best_model = get_model_instance(name, study.best_params)
    best_model.fit(X_train, y_train)
    tuned_models[name] = best_model

    # Evaluate the tuned model on the validation split (predict only once).
    probs = best_model.predict_proba(X_val)[:, 1]
    preds = best_model.predict(X_val)
    metrics = {
        'Model': f"{name} (Tuned)",
        'ROC-AUC': roc_auc_score(y_val, probs),
        'Accuracy': accuracy_score(y_val, preds),
        'F1-Score': f1_score(y_val, preds)
    }
    final_results.append(metrics)
    print(f"{name} Tuned: AUC={metrics['ROC-AUC']:.4f}")

## 4. Ensemble Models (Using Tuned Top 5)
Combining the optimized models.

In [None]:
ENSEMBLE_NAMES = ('Vote_Ensemble', 'Stack_Ensemble')

# Base learners are the tuned single models only. Filtering by name keeps this
# cell re-runnable: after a first run `tuned_models` also holds the ensembles,
# which must not become base estimators of themselves.
estimators = [(name, model) for name, model in tuned_models.items() if name not in ENSEMBLE_NAMES]

# Likewise drop ensemble rows left over from a previous run of this cell so the
# leaderboard does not accumulate duplicates.
final_results[:] = [row for row in final_results if row['Model'] not in ENSEMBLE_NAMES]

# Soft voting averages the predicted probabilities of the base learners.
vote = VotingClassifier(estimators=estimators, voting='soft', n_jobs=-1)
# Stacking trains a logistic-regression meta-learner on 5-fold cross-validated base predictions.
stack = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(), cv=5, n_jobs=-1)

for ensemble_name, ensemble in zip(ENSEMBLE_NAMES, (vote, stack)):
    ensemble.fit(X_train, y_train)
    val_pred = ensemble.predict(X_val)
    final_results.append({
        'Model': ensemble_name,
        'ROC-AUC': roc_auc_score(y_val, ensemble.predict_proba(X_val)[:, 1]),
        'Accuracy': accuracy_score(y_val, val_pred),
        'F1-Score': f1_score(y_val, val_pred)
    })
    tuned_models[ensemble_name] = ensemble

# Leaderboard of tuned models and ensembles, best validation ROC-AUC first.
results_df = pd.DataFrame(final_results).sort_values(by='ROC-AUC', ascending=False)
print("\n--- Final Model Leaderboard ---")
print(results_df)

## 5. Final Predictions

In [None]:
# Test feature matrix with exactly the columns (and column order) the models were
# trained on. Columns absent from test_df are filled with 0; extra columns
# (e.g. a target column, if the test file has one) are left out.
X_test_final = test_df.reindex(columns=X_train.columns, fill_value=0)

# Start from the test IDs when an earlier cell kept them; otherwise start from an
# empty frame that shares the test index.
if 'test_ids' in locals():
    final_preds = pd.DataFrame({'ID': np.asarray(test_ids)})
else:
    final_preds = pd.DataFrame(index=X_test_final.index)

# Score the test set with the five best entries of the leaderboard.
for name in results_df['Model'].head(5):
    # Strip '(Tuned)' if present to match the key used in tuned_models.
    key = name.replace(' (Tuned)', '')
    model = tuned_models[key]
    final_preds[f'{name}_Prob'] = model.predict_proba(X_test_final)[:, 1]
    final_preds[f'{name}_Pred'] = model.predict(X_test_final)

final_preds.to_csv('data/final_predictions.csv', index=False)
print("Predictions saved to data/final_predictions.csv")
results_df.to_csv('data/final_performance.csv', index=False)