In [1]:
import warnings

In [3]:
# NOTE(review): this silences every warning category for the rest of the session,
# which also hides e.g. convergence or deprecation warnings raised by the
# libraries used below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [5]:
from pathlib import Path

In [6]:
import numpy as np

In [7]:
import pandas as pd

In [8]:
from sklearn.model_selection import StratifiedKFold, cross_validate

In [9]:
from sklearn.pipeline import Pipeline

In [10]:
from sklearn.metrics import (
    roc_auc_score, precision_score, recall_score, f1_score
)

In [11]:
from sklearn.linear_model import LogisticRegression

In [12]:
from sklearn.ensemble import RandomForestClassifier

In [40]:
from sklearn.compose import ColumnTransformer

In [41]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [42]:
from sklearn.impute import SimpleImputer

In [56]:
from xgboost import XGBClassifier

In [13]:
# Single seed reused by the CV splitter and by every model defined below,
# so that reruns of the notebook produce the same splits and fits.
RANDOM_STATE = 42

In [14]:
PROJECT_ROOT = Path('.')

In [15]:
FEATURES_DIR = PROJECT_ROOT / 'data' / 'features'

In [16]:
TARGET_COL = 'revenue'

In [17]:
train_path = FEATURES_DIR / 'train_safe_fe.csv'

In [18]:
valid_path = FEATURES_DIR / 'valid_safe_fe.csv'

In [19]:
test_path = FEATURES_DIR / 'test_safe_fe.csv'

In [20]:
train_df = pd.read_csv(train_path)

In [21]:
valid_df = pd.read_csv(valid_path)

In [22]:
test_df = pd.read_csv(test_path)

In [23]:
train_df.shape, valid_df.shape,test_df.shape

((8543, 31), (2563, 31), (1099, 31))

In [24]:
def split_xy(df:pd.DataFrame, target_col:str = TARGET_COL):
    """Separate a modelling table into a feature frame and a label series.

    Parameters
    ----------
    df : pd.DataFrame
        Table that contains the feature columns plus ``target_col``.
    target_col : str
        Name of the label column (defaults to ``TARGET_COL``).

    Returns
    -------
    tuple
        ``(features, labels)`` where ``features`` is ``df`` without
        ``target_col`` and ``labels`` is ``df[target_col]`` cast to ``int``.
    """
    features = df.drop(columns = target_col)
    labels = df[target_col].astype(int)
    return features, labels

In [25]:
X_train, y_train = split_xy(train_df)

In [26]:
X_valid, y_valid = split_xy(valid_df)

In [27]:
X_test, y_test = split_xy(test_df)

In [28]:
X_train.shape, X_valid.shape, X_test.shape

((8543, 30), (2563, 30), (1099, 30))

In [29]:
assert list(X_train.columns) == list(X_valid.columns) == list(X_test.columns), 'Column mismatch!'

In [31]:
assert y_train.isin([0,1]).all() and y_valid.isin([0,1]).all() and y_test.isin([0,1]).all(), 'Target must be binary'

In [32]:
y_train.value_counts(normalize=True)

revenue
0    0.843615
1    0.156385
Name: proportion, dtype: float64

In [33]:
def evaluate_on_valid(model, X_train, y_train,X_valid,y_valid, threshold = 0.5):
    """Fit ``model`` on the training split and score it on the validation split.

    The estimator is fitted in place (it is not cloned), so after the call
    ``model`` remains fitted on ``X_train`` / ``y_train``.

    Parameters
    ----------
    model
        Estimator exposing ``fit`` and ``predict_proba``.
    X_train, y_train
        Training features and labels passed to ``model.fit``.
    X_valid, y_valid
        Validation features and labels used for scoring.
    threshold : float, default 0.5
        Probability cut-off: rows whose positive-class score is
        ``>= threshold`` are predicted as 1. The default reproduces the
        previously hard-coded 0.5 decision rule.

    Returns
    -------
    dict
        Keys ``'roc_auc'``, ``'precision'``, ``'recall'`` and ``'f1'``.
        ROC-AUC is computed from the raw scores and therefore does not
        depend on ``threshold``; the other three do.
    """
    model.fit(X_train, y_train)
    # Second predict_proba column is used as the positive-class score
    # (class 1 for the 0/1 target this notebook works with).
    proba = model.predict_proba(X_valid)[:, 1]
    pred = (proba >= threshold).astype(int)
    return {
        'roc_auc': roc_auc_score(y_valid, proba),
        # zero_division=0 returns 0 instead of warning when no positives are predicted.
        'precision': precision_score(y_valid, pred, zero_division = 0),
        'recall': recall_score(y_valid, pred, zero_division = 0),
        'f1': f1_score(y_valid, pred, zero_division = 0),
    }

In [34]:
# Shuffled, stratified 5-fold splitter with a fixed seed; cv_scores passes it
# to cross_validate so every model is evaluated on the same folds.
cv = StratifiedKFold(n_splits = 5, shuffle = True,random_state = RANDOM_STATE)

In [35]:
# Metrics requested from cross_validate: each result key maps to the
# scorer of the same name.
scoring = {
    metric: metric
    for metric in ('roc_auc', 'precision', 'recall', 'f1')
}

In [39]:
def cv_scores(model,X,y):
    """Cross-validate ``model`` and return the mean of each test metric.

    Uses the notebook-level ``cv`` splitter and ``scoring`` mapping, running
    the folds sequentially (``n_jobs=1``).

    Returns
    -------
    dict
        One entry per ``cross_validate`` result key that starts with
        ``'test_'``, holding the mean over folds as a plain ``float``.
        Other keys returned by ``cross_validate`` are left out.
    """
    fold_results = cross_validate(model, X, y, scoring=scoring, cv=cv, n_jobs=1)
    averaged = {}
    for key, per_fold in fold_results.items():
        if not key.startswith('test_'):
            continue
        averaged[key] = float(np.mean(per_fold))
    return averaged

In [45]:
def build_preprocess_pipeline(X:pd.DataFrame):
    """Create an unfitted ColumnTransformer matched to the columns of ``X``.

    Columns with ``object`` dtype are treated as categorical; every other
    column of ``X`` is treated as numeric.

    * numeric: median imputation followed by standard scaling
    * categorical: most-frequent imputation followed by one-hot encoding,
      with categories unseen during fit ignored at transform time

    Columns not listed in either group are dropped (``remainder='drop'``).

    Parameters
    ----------
    X : pd.DataFrame
        Frame whose column names and dtypes decide the two column groups.

    Returns
    -------
    ColumnTransformer
        The configured, not yet fitted, transformer.
    """
    categorical = list(X.select_dtypes(include = ['object']).columns)
    numeric = [col for col in X.columns if col not in categorical]

    numeric_steps = [
        ('imputer', SimpleImputer(strategy = 'median')),
        ('scaler', StandardScaler()),
    ]
    categorical_steps = [
        ('imputer', SimpleImputer(strategy = 'most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown = 'ignore')),
    ]

    return ColumnTransformer(
        transformers = [
            ('num', Pipeline(steps = numeric_steps), numeric),
            ('cat', Pipeline(steps = categorical_steps), categorical),
        ],
        remainder = 'drop',
    )

In [46]:
preprocess = build_preprocess_pipeline(X_train)

In [47]:
# Baseline: class-weighted logistic regression on the preprocessed features.
# A dedicated preprocessor is built for this pipeline: Pipeline does not clone
# its steps, so reusing one ColumnTransformer object in several pipelines would
# let fitting one pipeline refit the transformer held by the others.
log_reg = Pipeline(steps = [
    ('prep', build_preprocess_pipeline(X_train)),
    ('model', LogisticRegression(
        max_iter = 2000,
        class_weight = 'balanced',
        random_state = RANDOM_STATE
    ))
])

In [48]:
log_cv = cv_scores(log_reg, X_train,y_train)

In [49]:
log_valid = evaluate_on_valid(log_reg,X_train,y_train,X_valid,y_valid)

In [50]:
log_cv, log_valid

({'test_roc_auc': 0.754038191969903,
  'test_precision': 0.26694272925354423,
  'test_recall': 0.7522416009838448,
  'test_f1': 0.39399472807146213},
 {'roc_auc': 0.7541227461858531,
  'precision': 0.2679509632224168,
  'recall': 0.765,
  'f1': 0.3968871595330739})

In [51]:
# Random forest with per-bootstrap class re-weighting.
# A dedicated preprocessor is built for this pipeline: Pipeline does not clone
# its steps, so reusing one ColumnTransformer object in several pipelines would
# let fitting one pipeline refit the transformer held by the others.
rf = Pipeline(steps = [
    ('prep', build_preprocess_pipeline(X_train)),
    ('model', RandomForestClassifier(
        n_estimators = 400,
        max_depth = None,
        min_samples_split = 2,
        min_samples_leaf = 1,
        random_state = RANDOM_STATE,
        n_jobs = 1,
        class_weight = 'balanced_subsample'
    ))
])

In [52]:
rf_cv = cv_scores(rf, X_train,y_train)

In [53]:
rf_valid = evaluate_on_valid(rf,X_train,y_train,X_valid,y_valid)

In [54]:
rf_cv, rf_valid

({'test_roc_auc': 0.7692212399830352,
  'test_precision': 0.55765516733009,
  'test_recall': 0.08905472636815921,
  'test_f1': 0.15320829661236388},
 {'roc_auc': 0.7785396440129451,
  'precision': 0.5,
  'recall': 0.085,
  'f1': 0.1452991452991453})

In [57]:
# Gradient-boosted trees.
# A dedicated preprocessor is built for this pipeline: Pipeline does not clone
# its steps, so reusing one ColumnTransformer object in several pipelines would
# let fitting one pipeline refit the transformer held by the others.
# NOTE(review): unlike the two scikit-learn models in this notebook, no class
# re-weighting (e.g. scale_pos_weight) is configured here — confirm that this
# is intentional before comparing threshold-dependent metrics.
xgb = Pipeline(steps = [
    ('prep', build_preprocess_pipeline(X_train)),
    ('model', XGBClassifier(
        n_estimators = 500,
        learning_rate = 0.05,
        max_depth = 4,
        subsample = 0.9,
        colsample_bytree = 0.9,
        reg_lambda = 1.0,
        random_state = RANDOM_STATE,
        eval_metric = 'logloss'
    ))
])

In [58]:
xgb_cv = cv_scores(xgb, X_train,y_train)

In [59]:
xgb_valid = evaluate_on_valid(xgb,X_train,y_train,X_valid,y_valid)

In [60]:
xgb_cv, xgb_valid

({'test_roc_auc': 0.7740940373056101,
  'test_precision': 0.506869939271255,
  'test_recall': 0.13095757169210126,
  'test_f1': 0.20736899792028982},
 {'roc_auc': 0.7806426259824318,
  'precision': 0.40625,
  'recall': 0.0975,
  'f1': 0.15725806451612903})

In [61]:
rows = []

In [62]:
def add_row(model_name,cv_dict,valid_dict, store = None):
    """Record one model's CV and validation metrics in the results list.

    The call is idempotent per model name: if ``store`` already holds a row
    for ``model_name`` that row is replaced in place, so re-running a
    notebook cell does not create duplicate entries. Otherwise the new row
    is appended.

    Parameters
    ----------
    model_name : str
        Label stored under the ``'model'`` key.
    cv_dict : dict
        Cross-validation means keyed ``'test_roc_auc'``, ``'test_precision'``,
        ``'test_recall'``, ``'test_f1'``; missing keys become NaN.
    valid_dict : dict
        Validation metrics keyed ``'roc_auc'``, ``'precision'``, ``'recall'``,
        ``'f1'``; missing keys become NaN.
    store : list, optional
        List of row dicts to update. Defaults to the notebook-level ``rows``.

    Returns
    -------
    None
    """
    if store is None:
        store = rows

    record = {
        'model':model_name,
        'cv_roc_auc':cv_dict.get('test_roc_auc', np.nan),
        'cv_precision': cv_dict.get('test_precision',np.nan),
        'cv_recall': cv_dict.get('test_recall', np.nan),
        'cv_f1': cv_dict.get('test_f1',np.nan),
        'valid_roc_auc': valid_dict.get('roc_auc',np.nan),
        'valid_precision': valid_dict.get('precision',np.nan),
        'valid_recall': valid_dict.get('recall', np.nan),
        'valid_f1': valid_dict.get('f1', np.nan),
    }

    # Overwrite an existing entry for the same model instead of duplicating it.
    for position, existing in enumerate(store):
        if existing.get('model') == model_name:
            store[position] = record
            return
    store.append(record)

In [64]:
add_row('LogisticRegression (baseline)', log_cv,log_valid)

In [65]:
add_row('RandomForest', rf_cv,rf_valid)

In [66]:
add_row('XGBoost', xgb_cv, xgb_valid)

In [67]:
# One row per model, ordered from highest to lowest validation ROC-AUC.
# The original row index is kept, so positional access (.iloc) is needed to
# get the top-ranked row.
perf_df = pd.DataFrame(rows).sort_values('valid_roc_auc', ascending = False)

In [68]:
perf_df

Unnamed: 0,model,cv_roc_auc,cv_precision,cv_recall,cv_f1,valid_roc_auc,valid_precision,valid_recall,valid_f1
2,XGBoost,0.774094,0.50687,0.130958,0.207369,0.780643,0.40625,0.0975,0.157258
1,RandomForest,0.769221,0.557655,0.089055,0.153208,0.77854,0.5,0.085,0.145299
0,LogisticRegression (baseline),0.754038,0.266943,0.752242,0.393995,0.754123,0.267951,0.765,0.396887


In [70]:
# First row after the descending sort = model with the highest validation ROC-AUC.
best_model_name = perf_df.iloc[0]['model']

In [71]:
best_model_name

'XGBoost'

In [72]:
# Maps the labels stored in perf_df's 'model' column back to the pipeline
# objects; the keys must match the names passed to add_row exactly.
name_to_model = {
    'LogisticRegression (baseline)': log_reg,
    'RandomForest': rf,
    'XGBoost': xgb
}

In [73]:
best_model = name_to_model[best_model_name]

In [75]:
# Day 26: fit the model that was selected on the validation set

In [77]:
# To fit on the training set only and leave a fitted artifact:
#best_model.fit(X_train,y_train)

## Business Insight & Modeling Summary (Day 25)

The modeling phase focused on comparing a simple baseline model with more expressive tree-based models to understand both predictive performance and business trade-offs.

The baseline Logistic Regression model achieved high recall, indicating strong ability to identify users with potential purchase intent. However, this came at the cost of low precision, meaning a large number of non-purchasing users would be incorrectly targeted. From a business perspective, this approach prioritizes *not missing potential buyers* but may increase marketing or operational costs due to false positives.

Tree-based models (Random Forest and XGBoost) demonstrated stronger ranking capability, reflected in higher validation ROC-AUC scores (about 0.78 versus 0.75 for Logistic Regression). Among them, **XGBoost achieved the best overall discrimination performance**, although its margin over Random Forest is small (0.781 vs. 0.779), making it the most suitable candidate for ranking users by purchase likelihood. At the default decision threshold of 0.5, these models were more conservative, resulting in much lower recall (roughly 0.09–0.10 versus 0.77 for the baseline) but higher precision.

This behavior highlights an important insight: **model quality and decision policy are separate concerns**. While XGBoost ranks users best by purchase likelihood (ROC-AUC measures ranking quality, not how well calibrated the probabilities are), the final business outcome depends on how the decision threshold is set.

Based on these results, XGBoost is selected as the candidate model for the next stage. In Day 26, the focus will shift to threshold optimization, confusion matrix analysis, and cost-sensitive evaluation to align model decisions with business objectives such as conversion uplift, marketing efficiency, and risk tolerance.
