In [None]:
import pandas as pd
import xgboost as xgb
import os
from sklearn.metrics import precision_score, recall_score, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from category_encoders import HashingEncoder
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

def load_data(file_path):
    """Read the CSV file at ``file_path`` into a pandas DataFrame.

    Args:
        file_path: Location of the CSV file on disk.

    Returns:
        The parsed file contents as a ``pandas.DataFrame``.

    Raises:
        FileNotFoundError: If nothing exists at ``file_path``.
    """
    if os.path.exists(file_path):
        return pd.read_csv(file_path)
    raise FileNotFoundError(f"Data file not found at {file_path}")

def preprocess_data(df):
    """Clean the raw coupon DataFrame and split it into features and target.

    Steps, in order:
      1. Drop the ``car``, ``toCoupon_GEQ5min`` and ``direction_opp`` columns,
         fill missing values with each column's mode, and drop duplicate rows.
      2. Replace the ordered categorical columns listed in ``encoding_map``
         with integer ranks. Values absent from a column's map become -1.
      3. Build three combined string features (``passanger_destination``,
         ``marital_hasChildren``, ``temperature_weather``), then drop their
         source columns together with ``gender`` and ``RestaurantLessThan20``.

    The input frame is not modified; every step produces a new frame.

    Args:
        df: DataFrame holding the columns named above plus the target ``Y``.
            Columns of ``encoding_map`` that are absent from ``df`` are skipped.

    Returns:
        A tuple ``(x, y)``: ``x`` is the processed frame without ``Y`` and
        ``y`` is the ``Y`` column.

    Raises:
        KeyError: If a column that is dropped or combined is missing.

    Warns:
        UserWarning: If an encoded column contains values that are not in its
            map. They are still encoded as -1, but a mismatch between the map
            and the data's actual labels would otherwise go unnoticed.
    """
    # Local import keeps this block self-contained.
    import warnings

    # 1. Basic Cleaning
    df = df.drop(columns=['car', 'toCoupon_GEQ5min', 'direction_opp'])
    modes = df.mode()
    # mode() has no rows for an empty (or all-missing) frame; guarding here
    # avoids the IndexError that .iloc[0] would raise in that case.
    if not modes.empty:
        df = df.fillna(modes.iloc[0])
    df = df.drop_duplicates()

    # 2. Manual Encoding (Ensures ordinal relationships are preserved)
    frequency_ranks = {'never': 0, 'less1': 1, '1~3': 2, '4~8': 3, 'gt8': 4}
    encoding_map = {
        'expiration': {'2h': 0, '1d': 1},
        'age': {'<21': 0, '21-30': 1, '31-40': 2, '41-50': 3, '>50': 4},
        'education': {
            'Some High School': 0, 'High School Graduate': 1, 'Some college - no degree': 2,
            'Associates degree': 3, 'Bachelors degree': 4, 'Graduate degree (Masters or Doctorate)': 5
        },
        'Bar': frequency_ranks,
        'CoffeeHouse': frequency_ranks,
        'CarryAway': frequency_ranks,
        'Restaurant20To50': frequency_ranks,
        'income': {
            'Less than $12500': 0, '$12500 - $24999': 1, '$25000 - $37499': 2, '$37500 - $49999': 3,
            '$50000 - $62499': 4, '$62500 - $74999': 5, '$75000 - $87499': 6, '$87500 - $99999': 7,
            '$100000 or More': 8
        },
        'time': {'7AM': 0, '10AM': 1, '2PM': 2, '6PM': 3, '10PM': 4}
    }

    for col, mapping in encoding_map.items():
        if col not in df.columns:
            continue
        encoded = df[col].map(mapping)
        unmapped = encoded.isna()
        if unmapped.any():
            # Surface the fallback instead of silently emitting -1.
            unknown_values = sorted(df.loc[unmapped, col].astype(str).unique())
            warnings.warn(
                f"Column '{col}' contains values missing from its encoding map; "
                f"they are encoded as -1: {unknown_values}",
                stacklevel=2,
            )
        df[col] = encoded.fillna(-1).astype(int)

    # 3. Engineered Features
    df['passanger_destination'] = df['passanger'].astype(str) + '-' + df['destination'].astype(str)
    df['marital_hasChildren'] = df['maritalStatus'].astype(str) + '-' + df['has_children'].astype(str)
    df['temperature_weather'] = df['temperature'].astype(str) + '-' + df['weather'].astype(str)

    # Drop the columns folded into the combined features, plus two that are
    # simply excluded from the feature set (gender, RestaurantLessThan20).
    df = df.drop(columns=['passanger', 'destination', 'maritalStatus', 'has_children',
                          'temperature', 'weather', 'gender', 'RestaurantLessThan20'])

    x = df.drop('Y', axis=1)
    y = df.Y
    return x, y

def train_model(model_name, x_train, y_train):
    """Fit the classifier selected by ``model_name`` on the training data.

    Args:
        model_name: Name of the model to build; only ``'xgboost'`` is accepted.
        x_train: Training features passed to the classifier's ``fit``.
        y_train: Training labels passed to the classifier's ``fit``.

    Returns:
        The fitted ``XGBClassifier`` instance.

    Raises:
        ValueError: If ``model_name`` is anything other than ``'xgboost'``.
    """
    # Reject unsupported names up front so nothing is built for them.
    if model_name != 'xgboost':
        raise ValueError("Invalid model name.")

    hyperparams = {
        'random_state': 42,
        'learning_rate': 0.2,
        'n_estimators': 45,
        'max_depth': 10,
        # Retained as good practice for older xgboost versions.
        'use_label_encoder': False,
    }
    # Explicitly use the Classifier class
    classifier = XGBClassifier(**hyperparams)
    classifier.fit(x_train, y_train)

    # Debug check: report the estimator type attribute, or a placeholder
    # when the fitted object does not expose one.
    estimator_type = getattr(classifier, '_estimator_type', 'UNDEFINED')
    print(f"Estimator type: {estimator_type}")

    return classifier

def save_model(model, file_path):
    """Save the model's underlying booster to disk.

    A trailing ``.pkl`` extension on ``file_path`` is swapped for ``.json``;
    any other path is used unchanged. Missing parent directories are created
    before saving.

    Args:
        model: Fitted estimator exposing ``get_booster()``.
        file_path: Destination path as a string.
    """
    # Swap only the trailing extension: str.replace would also rewrite a
    # ".pkl" that appears inside a directory name or earlier in the file name.
    stem, extension = os.path.splitext(file_path)
    json_path = f"{stem}.json" if extension == ".pkl" else file_path

    # Create the destination directory so the save does not fail when it
    # does not exist yet (dirname is empty for a bare file name).
    parent_dir = os.path.dirname(json_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Access the underlying booster to bypass the scikit-learn wrapper bug
    # This is the "pure" XGBoost model engine
    model.get_booster().save_model(json_path)

    print(f"✅ Model successfully saved to {json_path}")

# --- EXECUTION FLOW ---
# End-to-end run: load the CSV, preprocess, split, hash-encode the remaining
# categorical columns, oversample the training set, train, save and evaluate.

# 1. Load and Preprocess
df = load_data("data/in-vehicle-coupon-recommendation.csv")
x, y = preprocess_data(df)

# 2. Split (70% train / 30% test, fixed seed for reproducibility)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# 3. Handle Remaining Categoricals via Hashing
# These are columns we didn't manually encode above
hash_cols = ['passanger_destination', 'marital_hasChildren', 'occupation', 'coupon', 'temperature_weather']
# The encoder is fitted on the training split only and then applied to both splits.
hashing_enc = HashingEncoder(cols=hash_cols, n_components=27).fit(x_train)

x_train_hashing = hashing_enc.transform(x_train.reset_index(drop=True))
x_test_hashing = hashing_enc.transform(x_test.reset_index(drop=True))

# 4. Final Type Check (Crucial for XGBoost)
non_numeric_cols = x_train_hashing.select_dtypes(exclude=['number']).columns.tolist()
if non_numeric_cols:
    print(f"⚠️ Warning: Still have non-numeric columns: {non_numeric_cols}")
    # Force conversion of any straggler objects to numeric. The same conversion
    # is applied to the test frame so that the features given to predict()
    # have the same dtypes as the features the model was trained on.
    for col in non_numeric_cols:
        x_train_hashing[col] = pd.to_numeric(x_train_hashing[col], errors='coerce').fillna(0)
        x_test_hashing[col] = pd.to_numeric(x_test_hashing[col], errors='coerce').fillna(0)

# 5. Over-sampling with SMOTE (training data only; the test split is left as-is)
sm = SMOTE(random_state=42)
x_sm_train, y_sm_train = sm.fit_resample(x_train_hashing, y_train)

# 6. Train and Save
model = train_model('xgboost', x_sm_train, y_sm_train)
save_model(model, "artifacts/xgboost_coupon_recommendation.json")

# 7. Quick Evaluation on the held-out test split
preds = model.predict(x_test_hashing)
print(f"Accuracy: {accuracy_score(y_test, preds):.4f}")

Estimator type: UNDEFINED


TypeError: `_estimator_type` undefined.  Please use appropriate mixin to define estimator type.

In [None]:
# Show the installed xgboost package details (version, location, dependencies).
# NOTE(review): presumably run to investigate the `_estimator_type` TypeError
# in the output above, which may depend on the installed package versions — confirm.
!pip3 show xgboost