In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import xgboost as xgb

In [2]:
def get_transaction_data(df):
    """Clean the raw transaction table and derive date/amount features.

    The passed frame is modified in place (its column names are normalized
    and feature columns are added) before a re-ordered subset is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw transactions. After header normalization it must expose the
        columns ``user``, ``card``, ``year``, ``month``, ``day``, ``time``,
        ``amount`` (strings with a leading ``$``), ``use_chip``,
        ``merchant_name``, ``merchant_city``, ``merchant_state``, ``zip``,
        ``mcc``, ``errors`` and ``is_fraud`` (``'Yes'`` marks fraud).

    Returns
    -------
    pandas.DataFrame
        The cleaned transactions with the derived columns
        ``transaction_date``, ``year_month``, ``day_of_week``, ``is_weekend``,
        ``hour_of_day``, ``amount_bin`` and ``us_state``.
    """

    # Normalize column names: lower-case, drop "- ", spaces -> "_", drop "?".
    # regex=False keeps "?" a literal character on every pandas version.
    df.columns = (
        df.columns.str.lower()
        .str.replace('- ', '', regex=False)
        .str.replace(' ', '_', regex=False)
        .str.replace('?', '', regex=False)
    )

    # Add a date column
    df['transaction_date'] = pd.to_datetime(df[['year', 'month', 'day']])
    df['year_month'] = df['year'].astype(str) + '-' + df['month'].astype(str)

    # Remove $ from amount ("$" is matched literally, not as a regex anchor)
    df['amount'] = df['amount'].str.replace('$', '', regex=False).astype(float)

    # NOTE: the bins are right-closed and start at 0, so amounts <= 0 fall
    # outside every bin and receive NaN here.
    df['amount_bin'] = pd.cut(df['amount'], bins=[0, 50, 100, 500, 1000, float('inf')], labels=[1, 2, 3, 4, 5])

    # Fill missing values
    df['zip'] = df['zip'].fillna(-1).astype(int)
    df['merchant_state'] = df['merchant_state'].fillna('No State')
    df['errors'] = df['errors'].fillna('No Errors')

    # Change fraud flag to numeric
    df['is_fraud'] = np.where(df['is_fraud'] == 'Yes', 1, 0)

    # Two-character merchant_state values are flagged as US states
    df['us_state'] = np.where(df['merchant_state'].str.len() == 2, 1, 0)

    # Monday=0 ... Sunday=6
    df['day_of_week'] = df['transaction_date'].dt.dayofweek

    # Test the weekday integers directly. Passing them through
    # pd.to_datetime() would turn 0-6 into epoch timestamps, which never
    # match 5 or 6, leaving the flag at 0 for every row.
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

    # Hour is the part of the time string before the first ":"
    df['hour_of_day'] = df['time'].apply(lambda x: int(x.split(':')[0]))

    # Rearrange columns
    df = df[['user', 'card', 'transaction_date', 'year_month', 'year', 'month', 'day', 'time',
             'day_of_week', 'is_weekend', 'hour_of_day', 'amount', 'amount_bin', 'use_chip',
             'merchant_name', 'merchant_city', 'merchant_state', 'zip', 'us_state', 'mcc',
             'errors', 'is_fraud']]

    return df

def get_user_data(df):
    """Clean the raw user table and add income/debt ratio features.

    The caller's frame has its column names normalized and gains a
    ``user_id`` column (a copy of its index); all later edits happen on a
    separate frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw user records. The three money columns are strings with a ``$``.

    Returns
    -------
    pandas.DataFrame
        One row per user with ``user_id``, demographic columns, the numeric
        money columns, ``dti`` (total debt / yearly income) and ``itpci``
        (yearly income / per-capita income of the zipcode).
    """
    # Normalize columns: lower-case, drop "- ", spaces -> underscores
    lowered = df.columns.str.lower()
    df.columns = lowered.str.replace('- ','').str.replace(' ','_')

    # The row index doubles as the user identifier
    df['user_id'] = df.index

    # Drop the person and address columns
    users = df.drop(columns=['person', 'address'])

    # Dollar strings -> floats
    for money_col in ('per_capita_income_zipcode', 'yearly_income_person', 'total_debt'):
        users[money_col] = users[money_col].str.replace('$','').astype(float)

    # Debt-to-income and income-to-per-capita-income ratios
    users['dti'] = users['total_debt'].div(users['yearly_income_person'])
    users['itpci'] = users['yearly_income_person'].div(users['per_capita_income_zipcode'])

    # 1 when an apartment value is present, 0 when it is missing
    users['apartment'] = users['apartment'].notna().astype(int)

    # Final column order
    ordered = [
        'user_id', 'current_age', 'gender', 'apartment', 'city', 'state', 'zipcode',
        'per_capita_income_zipcode', 'yearly_income_person', 'total_debt',
        'fico_score', 'num_credit_cards', 'dti', 'itpci',
    ]
    return users[ordered]

def get_cards_data(df):
    """Clean the raw card table and keep the columns used downstream.

    Column names of the passed frame are normalized in place, the credit
    limit is converted from a ``$`` string to float, and ``has_chip`` becomes
    1 for ``'YES'`` and 0 otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw card records.

    Returns
    -------
    pandas.DataFrame
        Columns ``user``, ``card_index``, ``card_brand``, ``card_type``,
        ``has_chip``, ``cards_issued``, ``credit_limit`` and
        ``year_pin_last_changed``.
    """
    # Lower-case the headers, drop "- ", spaces -> underscores
    lowered = df.columns.str.lower()
    df.columns = lowered.str.replace('- ','').str.replace(' ','_')

    # "$1234" -> 1234.0
    limit_text = df['credit_limit'].str.replace('$','')
    df['credit_limit'] = limit_text.astype(float)

    # Binary chip indicator
    df['has_chip'] = (df['has_chip'] == 'YES').astype(int)

    keep = [
        'user', 'card_index', 'card_brand',
        'card_type', 'has_chip', 'cards_issued',
        'credit_limit', 'year_pin_last_changed',
    ]
    return df[keep]

def get_final_df(transactions,users,cards):
    """Join transactions with user and card data and add aggregate features.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Cleaned transactions (joined on ``user`` and ``card``).
    users : pandas.DataFrame
        Cleaned users (joined on ``user_id``; supplies ``state``).
    cards : pandas.DataFrame
        Cleaned cards (joined on ``user`` and ``card_index``).

    Returns
    -------
    pandas.DataFrame
        One row per transaction that matched both a user and a card (inner
        joins), restricted to the modelling columns. Added features:

        * ``same_state_flag`` - 1 when merchant state equals the user's state.
        * ``user_transaction_count``, ``total_amount``, ``average_amount`` -
          per-user aggregates broadcast to every row of that user.
        * ``merchant_count`` - number of rows for the row's merchant.
        * ``days_since_last_transaction`` - days between the row's date and
          the latest ``transaction_date`` in the merged frame (a global
          reference date, not a per-user one).
        * ``amount_change`` - relative change versus the same user's previous
          row, in the merged frame's row order; always finite.
    """

    final_df = pd.merge(left=transactions, right=users, how='inner', left_on='user', right_on='user_id')
    final_df = pd.merge(left=final_df, right=cards, how='inner', left_on=['user', 'card'], right_on=['user', 'card_index'])

    final_df['same_state_flag'] = np.where(final_df['merchant_state'] == final_df['state'], 1, 0)

    final_df['user_transaction_count'] = final_df.groupby('user')['is_fraud'].transform('count')

    final_df['merchant_count'] = final_df.groupby('merchant_name')['is_fraud'].transform('count')

    final_df['total_amount'] = final_df.groupby('user')['amount'].transform('sum')
    final_df['average_amount'] = final_df.groupby('user')['amount'].transform('mean')

    final_df['days_since_last_transaction'] = (final_df['transaction_date'].max() - final_df['transaction_date']).dt.days

    # First row of each user has no predecessor -> NaN -> 0.
    amount_change = final_df.groupby('user_id')['amount'].pct_change().fillna(0)

    # A zero previous amount yields +inf (positive current amount) or -inf
    # (negative current amount). Cap both at the extreme finite values so no
    # infinity reaches the model; previously only +inf was handled.
    finite_change = amount_change[np.isfinite(amount_change)]
    final_df['amount_change'] = amount_change.clip(lower=finite_change.min(), upper=finite_change.max())

    final_cols = ['user', 'year', 'month', 'day_of_week', 'is_weekend', 'hour_of_day', 'amount', 'amount_bin', 'use_chip',
                  'merchant_name', 'merchant_city', 'merchant_state',
                  'zip', 'mcc', 'card_type', 'has_chip', 'errors', 'same_state_flag', 'us_state',
                  'user_transaction_count', 'merchant_count', 'total_amount', 'average_amount',
                  'days_since_last_transaction',
                  'amount_change', 'is_fraud']

    final_df = final_df[final_cols]

    return final_df


In [3]:
# Read each raw CSV and hand it straight to its cleaning function, then join
# the three cleaned tables into the modelling frame.
cards_df = pd.read_csv('./data/sd254_cards.csv').pipe(get_cards_data)
users_df = pd.read_csv('./data/sd254_users.csv').pipe(get_user_data)
transaction_df = pd.read_csv('./data/credit_card_transactions-ibm_v2.csv').pipe(get_transaction_data)
final_df = get_final_df(transaction_df, users_df, cards_df)

In [5]:
# Feature columns selected from X when the train/test samples are drawn.
# The "*_encoded" names only exist after the label-encoding cell has run.
final_cols = [
    'mcc_encoded', 'merchant_name', 'amount', 'merchant_city_encoded', 'year',
    'user', 'hour_of_day', 'merchant_state_encoded', 'month', 'zip_encoded',
]

# Columns of final_df kept before encoding (target 'is_fraud' included).
final_columns = [
    'user', 'year', 'month', 'day_of_week', 'is_weekend', 'hour_of_day',
    'amount', 'amount_bin', 'use_chip',
    'merchant_name', 'merchant_city', 'merchant_state', 'zip', 'mcc',
    'errors', 'same_state_flag', 'is_fraud',
]

In [6]:
# NOTE: this cell overwrites final_df. Once 'use_chip' has been one-hot
# encoded, selecting final_columns again raises KeyError, so re-run the cell
# that builds final_df before re-running this one.

# Keep only the modelling columns, then one-hot encode the transaction channel.
final_df = final_df[final_columns]
final_df = pd.get_dummies(final_df, columns=['use_chip'], dtype=int)

# Integer-encode the remaining categorical columns. One fitted encoder is kept
# per column so the codes can be mapped back to the original values later.
encode_columns = ['merchant_city', 'merchant_state', 'zip', 'mcc', 'errors']
label_encoders = {}

for col in encode_columns:
    label_encoders[col] = LabelEncoder()
    final_df[col + '_encoded'] = label_encoders[col].fit_transform(final_df[col])

# The raw columns are replaced by their "*_encoded" counterparts.
final_df = final_df.drop(columns=encode_columns)

# Features are every remaining column except the target.
X = final_df.drop(columns='is_fraud')
y = final_df['is_fraud']

In [10]:
# Fraction of the rows that corresponds to 1,000,000 rows; used as both the
# train and the test fraction when the samples are drawn.
samples = 1_000_000 / len(final_df)

In [11]:
# Draw two stratified samples (same fraction each) of the selected features.
X_train_small, X_test_small, y_train_small, y_test_small = train_test_split(
    X[final_cols],
    y,
    train_size=samples,
    test_size=samples,
    stratify=y,
    random_state=42,
)

# Scaling statistics come from the training sample only and are then applied
# unchanged to the test sample.
scaler = StandardScaler()
X_train_small_scaled = scaler.fit_transform(X_train_small)
X_test_small_scaled = scaler.transform(X_test_small)

In [12]:
# Candidate values sampled by the randomized search.
param_grid = {
    'max_depth': [3, 4, 5, 6, 7, 8],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'n_estimators': [100, 200, 300, 400, 500],
    'min_child_weight': [1, 5, 10],
}

# `use_label_encoder` is no longer passed: the installed xgboost ignores it
# and only reports 'Parameters: { "use_label_encoder" } are not used.'
xgb_model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)

# 100 random parameter combinations, 3-fold CV each, ranked by ROC AUC.
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring='roc_auc',
)
random_search.fit(X_train_small_scaled, y_train_small)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


  _data = np.array(data, dtype=dtype, copy=copy,
Parameters: { "use_label_encoder" } are not used.



In [13]:
# Report the winning configuration from the search.
best_xgb = random_search.best_estimator_
print("Best parameters found: ", random_search.best_params_)

# Score the held-out sample with the refit best estimator.
y_pred = best_xgb.predict(X_test_small_scaled)

print("\nClassification Report:")
print(classification_report(y_test_small, y_pred))
print(confusion_matrix(y_test_small, y_pred))

Best parameters found:  {'n_estimators': 300, 'min_child_weight': 1, 'max_depth': 7, 'learning_rate': 0.1}

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    998780
           1       0.98      0.68      0.80      1220

    accuracy                           1.00   1000000
   macro avg       0.99      0.84      0.90   1000000
weighted avg       1.00      1.00      1.00   1000000

[[998762     18]
 [   391    829]]
