In [1]:
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_transformer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, recall_score, make_scorer, roc_auc_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LogisticRegression

import joblib
import lightgbm as lgb

import warnings
# Suppress all future warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
df = pd.read_csv('../data/Base.csv')

In [3]:
df.head()

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,...,has_other_cards,proposed_credit_limit,foreign_request,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
0,0,0.3,0.986506,-1,25,40,0.006735,102.453711,AA,1059,...,0,1500.0,0,INTERNET,16.224843,linux,1,1,0,0
1,0,0.8,0.617426,-1,89,20,0.010095,-0.849551,AD,1658,...,0,1500.0,0,INTERNET,3.363854,other,1,1,0,0
2,0,0.8,0.996707,9,14,40,0.012316,-1.490386,AB,1095,...,0,200.0,0,INTERNET,22.730559,windows,0,1,0,0
3,0,0.6,0.4751,11,14,30,0.006991,-1.863101,AB,3483,...,0,200.0,0,INTERNET,15.215816,linux,1,1,0,0
4,0,0.9,0.842307,-1,29,40,5.742626,47.152498,AA,2339,...,0,200.0,0,INTERNET,3.743048,other,0,1,0,0


In [4]:
df.shape

(1000000, 32)

In [5]:
SEED = 42
TARGET = 'fraud_bool'

In [6]:
X = df.drop(TARGET,axis=1)
y = df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, train_size=0.6, random_state=SEED)
X_holdout, X_test, y_holdout, y_test = train_test_split(X_test, y_test, stratify=y_test, test_size=0.5, random_state=SEED)


In [7]:
X_train.shape, X_holdout.shape, X_test.shape

((600000, 31), (200000, 31), (200000, 31))

In [8]:
# Create ColumnDropper class
class ColumnDropper(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_transformed = X.copy()
        X_transformed = X_transformed.drop(self.columns_to_drop,axis=1)
        return X_transformed
    
# Create Missing values (-1 or negative) to nan transformer class
class MissingAsNan(BaseEstimator, TransformerMixin):
    def __init__(self, missing_neg1, missing_neg):
        self.missing_neg1 = missing_neg1
        self.missing_neg = missing_neg
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_transformed = X.copy()
        X_transformed[self.missing_neg1] = X_transformed[self.missing_neg1]\
            .replace(-1, np.nan)
        X_transformed[self.missing_neg] = X_transformed[self.missing_neg]\
            .map(lambda x: np.nan if x < 0 else x)
        return X_transformed

# Create MissingFlagger class
class MissingFlagger(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_flag=None):
        self.columns_to_flag = columns_to_flag
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_transformed = X.copy()
        for col in self.columns_to_flag:
            X_transformed[f'MISSING_FLAG_{col}'] = X_transformed[col].isnull().astype(int)
        return X_transformed

# Create MissingValueFiller class
class MissingValueFiller(BaseEstimator, TransformerMixin):
    def __init__(self, fill_value=0):
        self.fill_value = fill_value

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_transformed = X.copy()
        X_transformed = X_transformed.fillna(self.fill_value)
        return X_transformed
    
# Create IncomeRounder class
class IncomeRounder(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_transformed = X.copy()
        X_transformed['income'] = X_transformed['income'].round(1)
        return X_transformed
    
# Create Merger class to merge some of the categories of categorical features
class Merger(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_transformed = X.copy()
        # proposed_credit_limit
        X_transformed['proposed_credit_limit'] = X_transformed['proposed_credit_limit']\
        .apply(lambda x: min(4,max(0,1+x//500))).astype('int')
        # housing_status
        X_transformed['housing_status'] = X_transformed['housing_status']\
        .apply(lambda x: 'other' if x in {'BD','BF','BG'} else x)  
        # device_os
        X_transformed['device_os'] = X_transformed['device_os']\
        .apply(lambda x: 'other' if x == 'x11' else x)
        return X_transformed

# Create CategoricalConverter class, converting dtype of categorical features to 'category'
class CategoricalConverter(BaseEstimator, TransformerMixin):
    def __init__(self, cat_columns):
        self.cat_columns = cat_columns
        self.categories_ = {}

    def fit(self, X, y=None):
        for col in self.cat_columns:
            self.categories_[col] = X[col].astype('category').cat.categories
        return self

    def transform(self, X):
        X_transformed = X.copy()
        for col in self.cat_columns:
            X_transformed[col] = pd.Categorical(X_transformed[col],
                                                categories=self.categories_[col], 
                                                ordered=False)
        return X_transformed    
    
# Create CustomOneHotEncoder class for one-hot-encoding, returning a dataframe
class CustomOneHotEncoder(TransformerMixin, BaseEstimator):
    def __init__(self, ohe_columns):
        self.ohe_columns = ohe_columns
        self.ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.feature_names = None

    def fit(self, X, y=None):
        self.ohe.fit(X[self.ohe_columns].astype('category'))
        self.feature_names = list(X.columns)
        return self

    def transform(self, X):
        # One-hot encode the specified columns
        X_ohe = X[self.ohe_columns].copy()
        X_ohe = self.ohe.transform(X_ohe)
        ohe_column_names = self.ohe.get_feature_names_out(self.ohe_columns)
        X_ohe = pd.DataFrame(X_ohe, columns=ohe_column_names, index=X.index)

        # Concatenate the one-hot-encoded columns with the remaining columns
        X_transformed = pd.concat([X.drop(self.ohe_columns,axis=1), X_ohe], axis=1).copy()

        return X_transformed

# Create CustomScalar class for standardization and column name adjustment
# If no columns given, scales all
class CustomScaler(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_standardize=None):
        self.columns_to_standardize = columns_to_standardize

    def fit(self, X, y=None):
        if self.columns_to_standardize is None:
            self.columns_to_standardize = list(X.columns)
        if self.columns_to_standardize:
            self.scaler = StandardScaler()
            self.scaler.fit(X[self.columns_to_standardize])
        return self
    
    def transform(self, X):
        X_transformed = X.copy()
        if self.columns_to_standardize:
            X_transformed[self.columns_to_standardize] = self.scaler\
                .transform(X_transformed[self.columns_to_standardize])
        return X_transformed

In [9]:
cols_to_drop = ['source', 'device_fraud_count']

cols_missing_neg1 = ['prev_address_months_count', 
                     'current_address_months_count', 
                     'bank_months_count', 
                     'session_length_in_minutes', 
                     'device_distinct_emails_8w']
cols_missing_neg = ['intended_balcon_amount']

cols_to_flag = ['prev_address_months_count', 
                'intended_balcon_amount', 
                'bank_months_count']

fill_value = -1

ohe_cols = ['payment_type', 
            'employment_status', 
            'housing_status', 
            'device_os', 
            'device_distinct_emails_8w']

In [10]:
# Create the preprocessor
logreg_preprocessor = make_pipeline(ColumnDropper(cols_to_drop),
                                    MissingAsNan(cols_missing_neg1,cols_missing_neg),
                                    MissingFlagger(cols_to_flag),
                                    MissingValueFiller(fill_value),
                                    IncomeRounder(),
                                    Merger(),
                                    CustomOneHotEncoder(ohe_cols),
                                    CustomScaler()
                                   )

In [11]:
# Recall @ 5% FPR

def recall_at_5percent_fpr(y_true, y_pred_proba):
    fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
    target_fpr = 0.05
    idx = (np.abs(fpr - target_fpr)).argmin()
    return tpr[idx]

# Create custom scorer using the custom scoring function
custom_scorer = make_scorer(recall_at_5percent_fpr, response_method='predict')

In [12]:
# Define logistic regression model
logreg_model = LogisticRegression(class_weight='balanced', random_state=SEED, n_jobs=-1)

In [13]:
# Create pipeline for logistic regression
logreg_pipeline = Pipeline([
    ('logreg_preprocessor', logreg_preprocessor),
    ('logisticregression', LogisticRegression())
])

In [14]:
# Define hyperparameter grid
param_grid = {'logisticregression__C': [0.01, 0.1, 1, 10]}
# Create StratifiedKFold object
stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# Create GridSearchCV object
grid_search = GridSearchCV(logreg_pipeline, param_grid, cv=stratified_cv, scoring=custom_scorer, n_jobs=2)

In [15]:
# Perform grid search
grid_search.fit(X_train, y_train)

In [16]:
best_model = grid_search.best_estimator_
# Print best model parameters
print(f"Best Model Parameters:\
{best_model.named_steps['logisticregression'].get_params()}")

Best Model Parameters:{'C': 1, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [17]:
# Make probability predictions on train and test
y_pred_train_proba = best_model.predict_proba(X_train)[:, 1]
y_pred_test_proba = best_model.predict_proba(X_test)[:, 1]

In [18]:
# Cross-validation performance
cv_results = grid_search.cv_results_
print(f"\nCross-Validation Recall @ 5% FPR: {cv_results['mean_test_score'][0]:.4f}")
for i in range(stratified_cv.get_n_splits()):
    print(f"  fold_{i+1}: {cv_results[f'split{i}_test_score'][0]:.4f}")


Cross-Validation Recall @ 5% FPR: 0.0112
  fold_1: 0.0098
  fold_2: 0.0113
  fold_3: 0.0144
  fold_4: 0.0091
  fold_5: 0.0113


In [19]:
# Declic Evaluation

# def declic_eval(y_true,y_pred_proba,num_sets=10):
#     df_y = pd.DataFrame(data= {'true': y_true, 'predicted': y_pred_proba})
#     df_y = df_y.sort_values(by='predicted',ascending = False).reset_index()
#     step_size = 100 / num_sets
#     labels = [f"{i*step_size:.0f}-{(i+1)*step_size:.0f}%" for i in range(num_sets)]
#     df_y['set'] = pd.qcut(df_y.index, num_sets, labels=labels)
    
#     fig_width, fig_height = 6.4*1.5, 4.8
#     fig = plt.figure(figsize=(fig_width, fig_height))
#     ax = (df_y.groupby('set')['true'].mean()*100).plot\
#         (kind='bar',
#          title='Declic Evaluation: Fraud Rate for Deciles of Sorted Predictions'
#         )
    
#     ax.axhline(y=y_true.mean()*100, color='red', linestyle='--')
    
#     for p in ax.patches:
#         ax.annotate(f'{p.get_height():.2f}%',
#                     xy=(p.get_x() + p.get_width() / 2., p.get_height()),
#                     ha='center', va='bottom', xytext=(0,0),
#                     textcoords='offset points')

#     ax.annotate(f'Mean Fraud Rate: {y_true.mean()*100:.3f}%',
#                 xy=(fig_width*.5, y.mean()*100), xytext=(0,2),
#                 ha='left', va='bottom', color='red',
#                 textcoords='offset points')

#     ax.set_ylabel('Fraud %')
#     ax.set_xlabel('Decile of Sorted Predictions')
#     ax.xaxis.grid(False)
    
#     plt.show()
    
#     return

In [20]:
# Test performance
recall_test = recall_at_5percent_fpr(y_test, y_pred_test_proba)
print(f'\nTest Recall @ 5% FPR: {recall_test}')
roc_auc_test = roc_auc_score(y_test, y_pred_test_proba)
print(f'\nTest roc_auc_score: {roc_auc_test}')
# print(f'\nTest Declic Evaluation:')
# declic_eval(y_test,y_pred_test_proba)


Test Recall @ 5% FPR: 0.5167724388032638

Test roc_auc_score: 0.8800818334479535


In [21]:
lgbm_preprocessor = make_pipeline(ColumnDropper(cols_to_drop),
                                  MissingAsNan(cols_missing_neg1,cols_missing_neg),
                                  MissingFlagger(cols_to_flag),
                                  MissingValueFiller(fill_value),
                                  IncomeRounder(),
                                  Merger(),
                                  CategoricalConverter(ohe_cols)
                                 )

In [22]:
# Preprocess the data
X_train_processed = lgbm_preprocessor.fit_transform(X_train)
X_test_processed = lgbm_preprocessor.transform(X_test)

In [23]:
# Train/validation split with stratified sampling
X_train_lgbm, X_val_lgbm, y_train_lgbm, y_val_lgbm = train_test_split\
    (X_train_processed, y_train, test_size=0.2, random_state=SEED, stratify=y_train)

In [24]:
# Define LGBM parameters
params = {
    'objective': 'binary',
    'metric': 'binary_error',
    'num_leaves':17,
    'learning_rate': 0.05,
    'verbose': -1,
    'early_stopping_rounds': 250
}

# Create dataset for LGBM
lgb_train = lgb.Dataset(X_train_lgbm, y_train_lgbm)
lgb_val = lgb.Dataset(X_val_lgbm, y_val_lgbm)

# Train the model
model = lgb.train(params, lgb_train, num_boost_round=1000,
                  valid_sets=[lgb_train, lgb_val])

In [25]:
# # Make probability predictions on train and test
y_pred_train_proba = model.predict\
    (X_train_lgbm, num_iteration=model.best_iteration, raw_score=True)
y_pred_val_proba = model.predict\
    (X_val_lgbm, num_iteration=model.best_iteration, raw_score=True)
y_pred_test_proba = model.predict\
    (X_test_processed, num_iteration=model.best_iteration)

# Training performance
recall_train = recall_at_5percent_fpr(y_train_lgbm, y_pred_train_proba)
roc_auc_train = roc_auc_score(y_train_lgbm, y_pred_train_proba)

# Validation performance
recall_val = recall_at_5percent_fpr(y_val_lgbm, y_pred_val_proba)
roc_auc_val = roc_auc_score(y_val_lgbm, y_pred_val_proba)

# Test performance
recall_test = recall_at_5percent_fpr(y_test, y_pred_test_proba)
roc_auc_test = roc_auc_score(y_test, y_pred_test_proba)

In [26]:
# Print Recall @ 5% FPR for train, validation and test
print(f'\nRecall @ 5% FPR:\n- Validation: {recall_val}\n- Test: {recall_test}')

# Print roc_auc_score for train, validation and test
print(f'\nroc_auc_score:\n- Validation: \
    {roc_auc_val}\n- Test: {roc_auc_test}')


Recall @ 5% FPR:
- Validation: 0.5366591080876795
- Test: 0.5321849501359928

roc_auc_score:
- Validation:     0.8930685104104193
- Test: 0.8890331468976794


In [27]:
joblib.dump(best_model, 'logreg_model.pkl')
joblib.dump(lgbm_preprocessor, 'lgbm_preprocessor.pkl')
model.save_model('lgbm_model.txt')


<lightgbm.basic.Booster at 0x7f0c56cd0850>

In [28]:
loaded_logreg_model = joblib.load('logreg_model.pkl')
loaded_lgbm_preprocessor = joblib.load('lgbm_preprocessor.pkl')
loaded_lgbm_model = lgb.Booster(model_file='lgbm_model.txt')

In [29]:
log_reg_pred = loaded_logreg_model.predict_proba(X_holdout)[:,1]

In [30]:
lgbm_pred = loaded_lgbm_model.predict(loaded_lgbm_preprocessor.transform(X_holdout))

In [31]:
# Test performance
recall_test = recall_at_5percent_fpr(y_holdout, log_reg_pred)
roc_auc_test = roc_auc_score(y_holdout, log_reg_pred)
print(f'\nLogistic Regression Test Recall @ 5% FPR: {recall_test}')
print(f'\nLogistic Regression Test roc_auc_score: {roc_auc_test}')


Logistic Regression Test Recall @ 5% FPR: 0.5185856754306437

Logistic Regression Test roc_auc_score: 0.8843274477046648


In [32]:
# Test performance
recall_test = recall_at_5percent_fpr(y_holdout, lgbm_pred)
roc_auc_test = roc_auc_score(y_holdout, lgbm_pred)
print(f'\nLGBM Test Recall @ 5% FPR: {recall_test}')
print(f'\nLGBM Test roc_auc_score: {roc_auc_test}')


LGBM Test Recall @ 5% FPR: 0.5367180417044425

LGBM Test roc_auc_score: 0.8916300683666866
