In [7]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import VotingClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report

## Load & Prepare Training Data

In [None]:
# Read the training table from train.csv (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='train.csv')

In [None]:
# Raw input columns the rest of the notebook relies on (the target column is
# appended separately where the frame is narrowed).
base_cols = [
    "user_id",
    "total_orders",
    "total_returns",
    "days_to_return_avg",
    "high_value_returns",
    "category_return_ratio",
    "exchange_ratio",
    "damaged_returns",
]

In [None]:
# Narrow the frame to the base columns plus the target, then normalise the two
# key columns: user_id is reduced to the last five characters of its string
# form and label is cast to int.
df = df.loc[:, [*base_cols, 'label']].assign(
    user_id=lambda frame: frame['user_id'].astype(str).str.slice(start=-5),
    label=lambda frame: frame['label'].astype(int),
)

## Feature Engineering

In [None]:
# return_rate: returns per order; the small constant keeps the denominator
# non-zero when total_orders is 0.
orders_plus_eps = df['total_orders'] + 1e-5
df['return_rate'] = df['total_returns'].div(orders_plus_eps)

# fast_return_flag: 1 when the average days-to-return is below 3, otherwise 0.
df['fast_return_flag'] = df['days_to_return_avg'].lt(3).astype(int)

In [None]:
# Remove the identifier together with the columns treated as leaky; names that
# are not present in the frame are skipped rather than raising.
df = df.drop(
    columns=['user_id', 'high_value_returns', 'damaged_returns'],
    errors='ignore',
)

In [None]:
# Ordinal-encode whichever of the known categorical columns exist in the frame.
# NOTE(review): an earlier cell narrows df to base_cols + 'label', and neither
# categorical name below is part of that selection, so on a top-to-bottom run
# this list is expected to be empty and no encoder file is written -- confirm
# that this is intended.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
cat_cols = [name for name in ('location', 'device_fingerprint') if name in df.columns]
if len(cat_cols) > 0:
    # The encoder is configured so that categories unseen at fit time map to -1.
    encoded_values = ordinal_encoder.fit_transform(df[cat_cols])
    df[cat_cols] = encoded_values
    joblib.dump(ordinal_encoder, 'ordinal_encoder.pkl')

In [None]:
# Separate the target column from the predictor columns.
y = df['label']
X = df.drop(columns='label')


In [None]:
# Persist the predictor column names, in training order, so that other data can
# be aligned to the same layout later on.
feature_columns = X.columns.tolist()
joblib.dump(feature_columns, 'feature_columns.pkl')

In [None]:
# Min-max scale every predictor and persist the fitted scaler.
# NOTE(review): this fit sees every row of X, including rows that the later
# train/test split sets aside for evaluation.
scaler = MinMaxScaler().fit(X)
X_scaled = scaler.transform(X)
joblib.dump(scaler, 'scaler.pkl')

## Train/test split

In [None]:
# Split the *unscaled* predictors first. The scaler fitted earlier in the
# notebook was fitted on all of X, i.e. it has already seen the rows held out
# here, so splitting its output would leak hold-out statistics (per-feature
# min/max) into the evaluation. Same split parameters as before, so the
# train/hold-out membership of each row is unchanged.
X_train_unscaled, X_test_unscaled, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Refit the scaler on the training rows only and overwrite scaler.pkl, so the
# scaling loaded later for inference is exactly the one the models were
# trained with.
scaler = MinMaxScaler().fit(X_train_unscaled)
joblib.dump(scaler, 'scaler.pkl')

# Hold-out rows are transformed with the training-only statistics; values may
# therefore fall outside [0, 1].
X_train = scaler.transform(X_train_unscaled)
X_test = scaler.transform(X_test_unscaled)

## Training

In [None]:
# Instantiate the three gradient-boosting base learners.
cat = CatBoostClassifier(verbose=0)
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
lgb = LGBMClassifier()

# Artefact path -> estimator, in the order the models are fitted and saved.
base_models = {
    'catboost_model.pkl': cat,
    'xgboost_model.pkl': xgb,
    'lightgbm_model.pkl': lgb,
}

# Fit every learner on the training split first ...
for learner in base_models.values():
    learner.fit(X_train, y_train)

# ... and only then write each fitted model to disk.
for artifact_path, learner in base_models.items():
    joblib.dump(learner, artifact_path)

In [None]:
# Score each base learner on the hold-out split: accuracy plus the full
# per-class report.
fitted_models = {'CatBoost': cat, 'XGBoost': xgb, 'LightGBM': lgb}
for display_name, estimator in fitted_models.items():
    holdout_preds = estimator.predict(X_test)
    holdout_accuracy = accuracy_score(y_test, holdout_preds)
    print(f"\n{display_name} Accuracy: {holdout_accuracy:.4f}")
    holdout_report = classification_report(y_test, holdout_preds)
    print(f"{display_name} Classification Report:\n{holdout_report}")


In [None]:
# Combine the three base learners in a soft-voting ensemble, fit it on the
# training split and persist the result.
ensemble_members = [('cat', cat), ('xgb', xgb), ('lgb', lgb)]
ensemble = VotingClassifier(estimators=ensemble_members, voting='soft').fit(X_train, y_train)
joblib.dump(ensemble, 'voting_ensemble_model.pkl')

In [None]:
# Score the ensemble on the hold-out split.
ensemble_preds = ensemble.predict(X_test)
ensemble_accuracy = accuracy_score(y_test, ensemble_preds)
print(f"\n🧠 Voting Ensemble Accuracy: {ensemble_accuracy:.4f}")
ensemble_report = classification_report(y_test, ensemble_preds)
print(f"Voting Ensemble Classification Report:\n{ensemble_report}")


## Load & Prepare Test Data

In [9]:
# Read the separate test table from test.csv (path is relative to the working directory).
test_df = pd.read_csv(filepath_or_buffer="test.csv")

In [None]:
# Re-apply the training-time preparation steps to the freshly loaded frame.
base_cols = [
    "user_id",
    "total_orders",
    "total_returns",
    "days_to_return_avg",
    "high_value_returns",
    "category_return_ratio",
    "exchange_ratio",
    "damaged_returns",
]

# The target column is optional here; carry it along only when it exists.
has_label = 'label' in test_df.columns
optional_label = ['label'] if has_label else []
test_df = test_df.loc[:, base_cols + optional_label].copy()
test_df['user_id'] = test_df['user_id'].astype(str).str.slice(start=-5)

# Engineered features, using the same formulas as for the training frame.
test_df['return_rate'] = test_df['total_returns'].div(test_df['total_orders'] + 1e-5)
test_df['fast_return_flag'] = test_df['days_to_return_avg'].lt(3).astype(int)

# Set the shortened user IDs aside for the results table (pop also removes the
# column), then drop the remaining non-feature columns; absent names are ignored.
user_ids = test_df.pop('user_id')
test_df = test_df.drop(columns=['high_value_returns', 'damaged_returns'], errors='ignore')

# Encode categorical columns with the persisted encoder, but only when at least
# one of them is actually present in the frame.
cat_cols = [name for name in ('location', 'device_fingerprint') if name in test_df.columns]
if len(cat_cols) > 0:
    ordinal_encoder = joblib.load('ordinal_encoder.pkl')
    test_df[cat_cols] = ordinal_encoder.transform(test_df[cat_cols])

# Keep the ground truth (when supplied) separate from the predictors.
if has_label:
    y_test_true = test_df['label'].astype(int)

X_test_raw = test_df.drop(columns=['label'], errors='ignore')

# Align to the persisted training layout: any expected column that is missing
# is added as a constant 0, then columns are put in the training order.
expected_cols = joblib.load("feature_columns.pkl")
absent_cols = [name for name in expected_cols if name not in X_test_raw.columns]
X_test_raw = X_test_raw.assign(**dict.fromkeys(absent_cols, 0))
X_test_raw = X_test_raw.loc[:, expected_cols]

# Scale with the persisted scaler rather than fitting a new one.
scaler = joblib.load('scaler.pkl')
X_test_scaled = scaler.transform(X_test_raw)


In [None]:
# Reload the persisted voting ensemble and predict a class for every prepared
# test row.
ensemble = joblib.load('voting_ensemble_model.pkl')
preds = ensemble.predict(X=X_test_scaled)

In [None]:
# Pair each shortened user ID with its predicted class.
results_df = pd.DataFrame({'user_id': user_ids, 'prediction': preds})

# When ground truth came with the test file, report metrics and attach the
# true labels next to the predictions.
if has_label:
    acc = accuracy_score(y_test_true, preds)
    print(f"\n Ensemble Accuracy on test.csv: {acc:.4f}")
    test_report = classification_report(y_test_true, preds)
    print(f"Classification Report:\n{test_report}")
    results_df['True_Label'] = y_test_true

# Show the first ten rows of the output table.
print("\n Sample Predictions:")
print(results_df.head(10))