In [171]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

In [120]:
def get_transaction_data(df):
    """Clean the raw card-transaction table and derive date/time features.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw transactions. After column normalisation it must provide
        ``user``, ``card``, ``year``, ``month``, ``day``, ``time``,
        ``amount`` (text such as ``'$12.34'``), ``use_chip``,
        ``merchant_name``, ``merchant_city``, ``merchant_state``, ``zip``,
        ``mcc``, ``errors`` and ``is_fraud``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the cleaned and derived columns in a fixed order.
        The frame passed in is left unmodified.
    """
    # Work on a copy so the caller's frame keeps its original column names.
    df = df.copy()

    #Normalize columns names
    # regex=False: '?' is a regex metacharacter, so strip it as literal text
    # instead of depending on the pandas-version-specific default.
    df.columns = (df.columns.str.lower()
                  .str.replace('- ', '', regex=False)
                  .str.replace(' ', '_', regex=False)
                  .str.replace('?', '', regex=False))

    #Add a date column
    df['transaction_date'] = pd.to_datetime(df[['year', 'month', 'day']])
    # Month is not zero-padded here (e.g. '2002-9').
    df['year_month'] = df['year'].astype(str) + '-' + df['month'].astype(str)

    #Remove $ from amount
    df['amount'] = df['amount'].str.replace('$', '', regex=False).astype(float)

    #Fill missing values
    df['zip'] = df['zip'].fillna(-1).astype(int)  # -1 marks "no zip"
    df['merchant_state'] = df['merchant_state'].fillna('No State')
    df['errors'] = df['errors'].fillna('No Errors')

    #Change fraud flag to numeric
    df['is_fraud'] = np.where(df['is_fraud'] == 'Yes', 1, 0)

    # 1 when merchant_state is exactly two characters long, else 0
    df['us_state'] = np.where(df['merchant_state'].str.len() == 2, 1, 0)

    df['day_of_week'] = df['transaction_date'].dt.dayofweek

    # Hour is the text before the first ':' of the time string.
    df['hour_of_day'] = df['time'].str.split(':').str[0].astype(int)

    #Rearrange columns
    ordered_cols = ['user', 'card', 'transaction_date', 'year_month', 'year',
                    'month', 'day', 'time', 'day_of_week', 'hour_of_day',
                    'amount', 'use_chip', 'merchant_name', 'merchant_city',
                    'merchant_state', 'zip', 'us_state', 'mcc', 'errors',
                    'is_fraud']
    df = df[ordered_cols]

    return df

def get_user_data(df):
    """Clean the raw user table and derive income/debt ratios.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw users. After column normalisation it must provide ``person``,
        ``address``, ``current_age``, ``gender``, ``apartment``, ``city``,
        ``state``, ``zipcode``, ``per_capita_income_zipcode``,
        ``yearly_income_person``, ``total_debt`` (the three money columns as
        text such as ``'$1234'``), ``fico_score`` and ``num_credit_cards``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``user_id`` (taken from the row index), the retained
        attributes, and the ratios ``dti`` and ``itpci``. The frame passed in
        is left unmodified.
    """
    # Work on a copy so the caller's frame keeps its original column names.
    df = df.copy()

    #Normalize Columns
    df.columns = (df.columns.str.lower()
                  .str.replace('- ', '', regex=False)
                  .str.replace(' ', '_', regex=False))

    #Create a user id
    df['user_id'] = df.index

    #Remove person and address columns
    df = df.drop(['person', 'address'], axis=1)

    #Make dollar amounts floats
    # regex=False: '$' is a regex anchor, so strip it as literal text instead
    # of depending on the pandas-version-specific default.
    for money_col in ('per_capita_income_zipcode', 'yearly_income_person', 'total_debt'):
        df[money_col] = df[money_col].str.replace('$', '', regex=False).astype(float)

    #Calculate debt to income and income to per capita income
    # A zero denominator produces inf (or NaN for 0/0) in these ratios.
    df['dti'] = df['total_debt'] / df['yearly_income_person']
    df['itpci'] = df['yearly_income_person'] / df['per_capita_income_zipcode']

    # 1 when an apartment value is present, 0 when it is missing
    df['apartment'] = np.where(df['apartment'].isna(), 0, 1)

    #Rearrange columns
    ordered_cols = ['user_id', 'current_age', 'gender', 'apartment', 'city',
                    'state', 'zipcode', 'per_capita_income_zipcode',
                    'yearly_income_person', 'total_debt', 'fico_score',
                    'num_credit_cards', 'dti', 'itpci']
    df = df[ordered_cols]

    return df

def get_cards_data(df):
    """Clean the raw card table and keep the columns used downstream.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw cards. After column normalisation it must provide ``user``,
        ``card_index``, ``card_brand``, ``card_type``, ``has_chip``,
        ``cards_issued``, ``credit_limit`` (text such as ``'$5000'``) and
        ``year_pin_last_changed``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``credit_limit`` as float and ``has_chip`` as 1/0.
        The frame passed in is left unmodified.
    """
    # Work on a copy so the caller's frame keeps its original column names.
    df = df.copy()

    df.columns = (df.columns.str.lower()
                  .str.replace('- ', '', regex=False)
                  .str.replace(' ', '_', regex=False))

    # regex=False: '$' is a regex anchor, so strip it as literal text instead
    # of depending on the pandas-version-specific default.
    df['credit_limit'] = df['credit_limit'].str.replace('$', '', regex=False).astype(float)

    # Only the exact text 'YES' maps to 1; every other value maps to 0.
    df['has_chip'] = np.where(df['has_chip'] == 'YES', 1, 0)

    df = df[['user', 'card_index', 'card_brand',
             'card_type', 'has_chip', 'cards_issued',
             'credit_limit', 'year_pin_last_changed']]

    return df

def get_final_df(transactions, users, cards, final_cols=None):
    """Join transactions, users and cards and add engineered features.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Cleaned transactions; joined to ``users`` on ``user`` == ``user_id``
        and to ``cards`` on (``user``, ``card``) == (``user``, ``card_index``).
    users : pandas.DataFrame
        Cleaned users; must provide ``user_id`` and ``state``.
    cards : pandas.DataFrame
        Cleaned cards; must provide ``user`` and ``card_index``.
    final_cols : sequence of str, optional
        Columns to return, in order. Defaults to the original fixed list,
        which keeps ``same_state_flag`` as the only engineered feature. Pass
        an explicit list to keep the others (``user_transaction_count``,
        ``merchant_count``, ``total_amount``, ``average_amount``,
        ``days_since_last_transaction``, ``amount_change``).

    Returns
    -------
    pandas.DataFrame
        An independent copy containing exactly ``final_cols``.

    Notes
    -----
    The per-user and per-merchant aggregates are computed over every row
    passed in. If they are selected as model features, compute them after
    the train/test split so they do not carry test-set information.
    """
    if final_cols is None:
        final_cols = ['year', 'month', 'day_of_week', 'hour_of_day', 'amount', 'use_chip',
                      'merchant_name', 'merchant_city', 'merchant_state', 'zip', 'mcc', 'errors',
                      'same_state_flag', 'is_fraud']

    final_df = pd.merge(left=transactions, right=users, how='inner',
                        left_on='user', right_on='user_id')
    final_df = pd.merge(left=final_df, right=cards, how='inner',
                        left_on=['user', 'card'], right_on=['user', 'card_index'])

    # 1 when the merchant's state equals the user's state
    final_df['same_state_flag'] = np.where(final_df['merchant_state'] == final_df['state'], 1, 0)

    # Row counts per user and per merchant
    final_df['user_transaction_count'] = final_df.groupby('user')['is_fraud'].transform('count')
    final_df['merchant_count'] = final_df.groupby('merchant_name')['is_fraud'].transform('count')

    final_df['total_amount'] = final_df.groupby('user')['amount'].transform('sum')
    final_df['average_amount'] = final_df.groupby('user')['amount'].transform('mean')

    # Days between each transaction and the latest transaction in the frame
    latest_date = final_df['transaction_date'].max()
    final_df['days_since_last_transaction'] = (latest_date - final_df['transaction_date']).dt.days

    # Relative change versus the same user's previous row. A previous amount
    # of 0 gives +inf or -inf depending on the sign of the current amount;
    # cap both at the extreme finite values (previously only +inf was capped).
    amount_change = final_df.groupby('user_id')['amount'].pct_change().fillna(0)
    finite_change = amount_change[np.isfinite(amount_change)]
    final_df['amount_change'] = amount_change.replace(
        [np.inf, -np.inf], [finite_change.max(), finite_change.min()])

    # Return a copy so callers can assign columns on the result without
    # pandas chained-assignment warnings.
    return final_df[list(final_cols)].copy()

In [121]:

# Load and clean the three raw tables, then join them into one modelling frame.
cards_df = get_cards_data(pd.read_csv('./data/sd254_cards.csv'))
users_df = get_user_data(pd.read_csv('./data/sd254_users.csv'))
transaction_df = get_transaction_data(pd.read_csv('./data/small_data.csv'))
final_df = get_final_df(transaction_df, users_df, cards_df)

# Replace each text column with integer codes, using a fresh encoder per column.
for text_col in ('use_chip', 'merchant_city', 'merchant_state', 'errors'):
    final_df[text_col] = LabelEncoder().fit_transform(final_df[text_col])

In [122]:
# Preview the first rows and the overall size of the modelling frame.
print(final_df.head())

print(f"The shape of the dataframe is {final_df.shape}")

   year  month  day_of_week  hour_of_day  amount  use_chip  \
0  2002      9            6            6  134.09         2   
1  2002      9            6            6   38.48         2   
2  2002      9            0            6  120.34         2   
3  2002      9            0           17  128.95         2   
4  2002      9            1            6  104.71         2   

         merchant_name  merchant_city  merchant_state    zip   mcc  errors  \
0  3527213246127876953           1117               6  91750  5300      16   
1  -727612092139916043           1379               6  91754  5411      16   
2  -727612092139916043           1379               6  91754  5411      16   
3  3414527459579106770           1379               6  91754  5651      16   
4  5817218446178736267           1117               6  91750  5912      16   

   same_state_flag  is_fraud  
0                1         0  
1                1         0  
2                1         0  
3                1         0  
4  

In [143]:
# Number of rows to draw for EACH of the train and test subsets.
samples = 40000
total_obs = final_df.shape[0]
assert 2 * samples <= total_obs, (
    f"need at least {2 * samples} rows for the split, got {total_obs}")

# Select the target by name rather than by position, so reordering the
# columns of final_df cannot silently turn a feature into the label.
X = final_df.drop(columns='is_fraud')
y = final_df['is_fraud']

# Integer sizes are absolute row counts. The earlier samples/total_obs
# fractions were converted back to counts inside train_test_split and could
# land one row off after floating-point rounding.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=y,
                                                    test_size=samples,
                                                    train_size=samples,
                                                    random_state=42)

In [146]:
# Resample the training split only; X_test / y_test are left as drawn.
# NOTE(review): use_chip, merchant_city, merchant_state and errors are integer
# category codes at this point, and merchant_name / zip / mcc are identifiers —
# confirm that SMOTE's synthetic rows are meaningful for such columns
# (imblearn also provides SMOTENC for mixed data).
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Compare row counts before and after resampling.
print(f"Original training set shape: {X_train.shape}, {y_train.shape}")
print(f"Resampled training set shape: {X_train_smote.shape}, {y_train_smote.shape}")

Original training set shape: (40000, 13), (40000,)
Resampled training set shape: (79918, 13), (79918,)


In [148]:
# Random forest fitted on the SMOTE-resampled training data.
model = RandomForestClassifier(random_state=42)
model.fit(X_train_smote, y_train_smote)

In [149]:
# Predict on the test split (which was not resampled).
# y_pred is reassigned by every later model cell.
y_pred = model.predict(X_test)

In [150]:
# Per-class report for whichever model last assigned y_pred
# (the SMOTE-trained forest when cells are run top to bottom).
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     39959
           1       0.50      0.27      0.35        41

    accuracy                           1.00     40000
   macro avg       0.75      0.63      0.67     40000
weighted avg       1.00      1.00      1.00     40000



In [152]:
# Confusion matrix for the same y_pred as the report above.
print(confusion_matrix(y_test, y_pred))

[[39948    11]
 [   30    11]]


In [157]:
# Pair each importance score of the SMOTE-trained forest with its feature name.
importances = model.feature_importances_
feat_importances = pd.Series(importances, index=X_train.columns)

In [160]:
# Display the most recently assigned importances; the name feat_importances is
# reused for model2 further down, so run the matching cell first.
feat_importances

year               0.035315
month              0.020601
day_of_week        0.020234
hour_of_day        0.042671
amount             0.066146
use_chip           0.103008
merchant_name      0.085191
merchant_city      0.065276
merchant_state     0.067380
zip                0.253097
mcc                0.071775
errors             0.000815
same_state_flag    0.168490
dtype: float64

In [153]:
# Same forest settings, fitted on the original (non-resampled) training data.
model2 = RandomForestClassifier(random_state=42)
model2.fit(X_train, y_train)

In [154]:
# Overwrites y_pred with the predictions of the non-resampled forest.
y_pred = model2.predict(X_test)

In [155]:
# Per-class report for whichever model last assigned y_pred
# (model2 when cells are run top to bottom).
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     39959
           1       1.00      0.12      0.22        41

    accuracy                           1.00     40000
   macro avg       1.00      0.56      0.61     40000
weighted avg       1.00      1.00      1.00     40000



In [156]:
# Confusion matrix for the same y_pred as the report above.
print(confusion_matrix(y_test, y_pred))

[[39959     0]
 [   36     5]]


In [161]:
# Pair each importance score of model2 with its feature name.
# This rebinds importances / feat_importances from the first forest.
importances = model2.feature_importances_
feat_importances = pd.Series(importances, index=X_train.columns)

In [162]:
# Display the most recently assigned importances (model2 when run in order).
feat_importances

year               0.099020
month              0.061914
day_of_week        0.047722
hour_of_day        0.081025
amount             0.198140
use_chip           0.026179
merchant_name      0.142368
merchant_city      0.098336
merchant_state     0.061968
zip                0.034643
mcc                0.143747
errors             0.000513
same_state_flag    0.004424
dtype: float64

In [165]:
# Gaussian naive Bayes fitted on the SMOTE-resampled training data,
# then used to predict the test split (overwrites y_pred).
nb_model = GaussianNB()
nb_model.fit(X_train_smote,y_train_smote)
y_pred = nb_model.predict(X_test)

In [166]:
# Per-class report for whichever model last assigned y_pred
# (nb_model when cells are run top to bottom).
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.55      0.71     39959
           1       0.00      0.61      0.00        41

    accuracy                           0.55     40000
   macro avg       0.50      0.58      0.36     40000
weighted avg       1.00      0.55      0.71     40000



In [167]:
# Confusion matrix for the same y_pred as the report above.
print(confusion_matrix(y_test, y_pred))

[[21925 18034]
 [   16    25]]


In [169]:
# Gaussian naive Bayes fitted on the original (non-resampled) training data.
nb2_model = GaussianNB()
nb2_model.fit(X_train,y_train)
y_pred = nb2_model.predict(X_test)
# This model predicts no positive rows, so class-1 precision is 0/0.
# zero_division=0 reports it as 0.0 without an UndefinedMetricWarning per metric.
print(classification_report(y_test, y_pred, zero_division=0))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     39959
           1       0.00      0.00      0.00        41

    accuracy                           1.00     40000
   macro avg       0.50      0.50      0.50     40000
weighted avg       1.00      1.00      1.00     40000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [170]:
# Confusion matrix for the same y_pred as the report above.
print(confusion_matrix(y_test, y_pred))


[[39959     0]
 [   41     0]]


In [175]:
# Logistic regression with default settings, fitted on the SMOTE-resampled
# training data and evaluated on the test split (overwrites y_pred).
# NOTE(review): no scaling step is applied to the features in the visible cells
# although StandardScaler is imported, and the recorded report below is
# identical to the one for the non-resampled fit — verify the fit is not
# degenerate before trusting these numbers.
lr_model = LogisticRegression()
lr_model.fit(X_train_smote,y_train_smote)
y_pred = lr_model.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.58      0.74     39959
           1       0.00      0.59      0.00        41

    accuracy                           0.58     40000
   macro avg       0.50      0.58      0.37     40000
weighted avg       1.00      0.58      0.73     40000

[[23246 16713]
 [   17    24]]


In [176]:
# Logistic regression with default settings, fitted on the original
# (non-resampled) training data and evaluated on the test split.
# NOTE(review): the recorded output is identical to that of the SMOTE-trained
# logistic regression even though the training sets differ; the features are
# unscaled in the visible cells — verify the fit is not degenerate.
lr_model2 = LogisticRegression()
lr_model2.fit(X_train,y_train)
y_pred = lr_model2.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.58      0.74     39959
           1       0.00      0.59      0.00        41

    accuracy                           0.58     40000
   macro avg       0.50      0.58      0.37     40000
weighted avg       1.00      0.58      0.73     40000

[[23246 16713]
 [   17    24]]
