In [5]:
import pandas as pd
import lightgbm as lgb
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE

In [6]:
df_transactions = pd.read_csv("Data/credit_card_transactions-ibm_v2.csv",nrows=15500000)
df_transactions.head()

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
0,0,0,2002,9,1,06:21,$134.09,Swipe Transaction,3527213246127876953,La Verne,CA,91750.0,5300,,No
1,0,0,2002,9,1,06:42,$38.48,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No
2,0,0,2002,9,2,06:22,$120.34,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No
3,0,0,2002,9,2,17:45,$128.95,Swipe Transaction,3414527459579106770,Monterey Park,CA,91754.0,5651,,No
4,0,0,2002,9,3,06:23,$104.71,Swipe Transaction,5817218446178736267,La Verne,CA,91750.0,5912,,No


In [7]:
column_names = {'Is Fraud?':'is_fraud', 'Use Chip':'use_chip', 
                'Zip':'merchant_zip', 'Errors?':'transaction_error',
                }
df_transactions.rename(columns=column_names, inplace=True)
df_transactions.columns

Index(['User', 'Card', 'Year', 'Month', 'Day', 'Time', 'Amount', 'use_chip',
       'Merchant Name', 'Merchant City', 'Merchant State', 'merchant_zip',
       'MCC', 'transaction_error', 'is_fraud'],
      dtype='object')

In [8]:
df_transactions.is_fraud.value_counts()

No     15481335
Yes       18665
Name: is_fraud, dtype: int64

In [9]:
df_transactions.User.value_counts()

486     82355
396     80749
332     70010
262     68089
1249    65644
        ...  
397        29
312        28
810        26
457        25
231        21
Name: User, Length: 1263, dtype: int64

In [10]:
df_transactions.drop(['Card', 'Year','Month','Day', 'Merchant Name','Merchant City', 'Merchant State', 'merchant_zip'], axis=1, inplace=True)
df_transactions.head()

Unnamed: 0,User,Time,Amount,use_chip,MCC,transaction_error,is_fraud
0,0,06:21,$134.09,Swipe Transaction,5300,,No
1,0,06:42,$38.48,Swipe Transaction,5411,,No
2,0,06:22,$120.34,Swipe Transaction,5411,,No
3,0,17:45,$128.95,Swipe Transaction,5651,,No
4,0,06:23,$104.71,Swipe Transaction,5912,,No


In [11]:
df_transactions.is_fraud = df_transactions.is_fraud.replace({'No':0,'Yes':1}) 
df_transactions.is_fraud.value_counts()

0    15481335
1       18665
Name: is_fraud, dtype: int64

In [12]:
df_transactions.transaction_error=df_transactions.transaction_error.fillna('NAN')
# df_transactions.transaction_error=df_transactions.transaction_error.apply(lambda value:value=='NAN')
df_transactions.transaction_error.value_counts()

NAN                                                    15251673
Insufficient Balance                                     154648
Bad PIN                                                   37590
Technical Glitch                                          30748
Bad Card Number                                            8778
Bad Expiration                                             7095
Bad CVV                                                    7078
Bad Zipcode                                                1333
Bad PIN,Insufficient Balance                                339
Insufficient Balance,Technical Glitch                       291
Bad PIN,Technical Glitch                                     83
Bad Card Number,Insufficient Balance                         77
Bad CVV,Insufficient Balance                                 58
Bad Expiration,Insufficient Balance                          45
Bad Card Number,Bad Expiration                               40
Bad Card Number,Bad CVV                 

In [13]:
df_transactions[df_transactions.is_fraud==1].transaction_error.value_counts()

NAN                                     17852
Insufficient Balance                      254
Bad PIN                                   195
Bad CVV                                   174
Bad Expiration                             74
Bad Card Number                            58
Technical Glitch                           43
Bad CVV,Insufficient Balance                4
Bad PIN,Insufficient Balance                4
Bad Card Number,Insufficient Balance        2
Bad Expiration,Bad CVV                      1
Bad Expiration,Insufficient Balance         1
Bad Expiration,Technical Glitch             1
Bad PIN,Technical Glitch                    1
Bad CVV,Technical Glitch                    1
Name: transaction_error, dtype: int64

In [14]:
def convert(value):
  conv_dict = {'NAN':0, 'Insufficient Balance':1, 'Bad PIN':2, 'Bad CVV':3, 'Bad Expiration':4, 'Bad Card Number':5, 'Technical Glitch':6}
  try:
    return conv_dict[value]
  except:
    return 7
df_transactions.transaction_error = df_transactions.transaction_error.apply(convert)
df_transactions.transaction_error.value_counts()

0    15251673
1      154648
2       37590
6       30748
5        8778
4        7095
3        7078
7        2390
Name: transaction_error, dtype: int64

In [15]:
df_transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15500000 entries, 0 to 15499999
Data columns (total 7 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   User               int64 
 1   Time               object
 2   Amount             object
 3   use_chip           object
 4   MCC                int64 
 5   transaction_error  int64 
 6   is_fraud           int64 
dtypes: int64(4), object(3)
memory usage: 827.8+ MB


In [16]:
df_transactions.Time = df_transactions.Time.apply(lambda value:int(value.replace(':','')))
df_transactions.Amount = df_transactions.Amount.apply(lambda value:float(value.replace('$','')))
df_transactions.head()

Unnamed: 0,User,Time,Amount,use_chip,MCC,transaction_error,is_fraud
0,0,621,134.09,Swipe Transaction,5300,0,0
1,0,642,38.48,Swipe Transaction,5411,0,0
2,0,622,120.34,Swipe Transaction,5411,0,0
3,0,1745,128.95,Swipe Transaction,5651,0,0
4,0,623,104.71,Swipe Transaction,5912,0,0


In [17]:
df_transactions.use_chip = df_transactions.use_chip.apply(lambda value:{'Swipe Transaction':0,
                                                                        'Chip Transaction':1, 'Online Transaction':3}[value])
df_transactions.use_chip.value_counts()

0    9738279
1    3978325
3    1783396
Name: use_chip, dtype: int64

In [18]:
df_transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15500000 entries, 0 to 15499999
Data columns (total 7 columns):
 #   Column             Dtype  
---  ------             -----  
 0   User               int64  
 1   Time               int64  
 2   Amount             float64
 3   use_chip           int64  
 4   MCC                int64  
 5   transaction_error  int64  
 6   is_fraud           int64  
dtypes: float64(1), int64(6)
memory usage: 827.8 MB


In [19]:
df_user = pd.read_csv('Data/sd254_users.csv')
df_user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Person                       2000 non-null   object 
 1   Current Age                  2000 non-null   int64  
 2   Retirement Age               2000 non-null   int64  
 3   Birth Year                   2000 non-null   int64  
 4   Birth Month                  2000 non-null   int64  
 5   Gender                       2000 non-null   object 
 6   Address                      2000 non-null   object 
 7   Apartment                    528 non-null    float64
 8   City                         2000 non-null   object 
 9   State                        2000 non-null   object 
 10  Zipcode                      2000 non-null   int64  
 11  Latitude                     2000 non-null   float64
 12  Longitude                    2000 non-null   float64
 13  Per Capita Income 

In [20]:
df_user.drop(['Person','Birth Year','Birth Month','Address','Apartment',
              'City','State','Latitude','Longitude','Per Capita Income - Zipcode','Num Credit Cards'], axis=1, inplace=True)
df_user.columns

Index(['Current Age', 'Retirement Age', 'Gender', 'Zipcode',
       'Yearly Income - Person', 'Total Debt', 'FICO Score'],
      dtype='object')

In [21]:
df_user.rename(columns={'Current Age':'age', 'Retirement Age':'ret_age', 'Yearly Income - Person':'yearly_income',
                        'Total Debt':'total_debt','FICO Score':'fico_score'}, inplace=True)

In [22]:
df_user.head()

Unnamed: 0,age,ret_age,Gender,Zipcode,yearly_income,total_debt,fico_score
0,53,66,Female,91750,$59696,$127613,787
1,53,68,Female,11363,$77254,$191349,701
2,81,67,Female,91792,$33483,$196,698
3,63,63,Female,10069,$249925,$202328,722
4,43,70,Male,94117,$109687,$183855,675


In [23]:
df_user.Gender = df_user.Gender.apply(lambda value:{'Male':0,'Female':1}[value])
df_user.yearly_income = df_user.yearly_income.apply(lambda value:int(value.replace('$','')))
df_user.total_debt = df_user.total_debt.apply(lambda value:int(value.replace('$','')))
df_user.head()

Unnamed: 0,age,ret_age,Gender,Zipcode,yearly_income,total_debt,fico_score
0,53,66,1,91750,59696,127613,787
1,53,68,1,11363,77254,191349,701
2,81,67,1,91792,33483,196,698
3,63,63,1,10069,249925,202328,722
4,43,70,0,94117,109687,183855,675


In [24]:
df_user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   age            2000 non-null   int64
 1   ret_age        2000 non-null   int64
 2   Gender         2000 non-null   int64
 3   Zipcode        2000 non-null   int64
 4   yearly_income  2000 non-null   int64
 5   total_debt     2000 non-null   int64
 6   fico_score     2000 non-null   int64
dtypes: int64(7)
memory usage: 109.5 KB


In [25]:
df_user['Index'] = df_user.index

In [26]:
merged_inner = pd.merge(left=df_transactions, right=df_user, left_on='User', right_on='Index')

In [27]:
merged_inner.head()

Unnamed: 0,User,Time,Amount,use_chip,MCC,transaction_error,is_fraud,age,ret_age,Gender,Zipcode,yearly_income,total_debt,fico_score,Index
0,0,621,134.09,0,5300,0,0,53,66,1,91750,59696,127613,787,0
1,0,642,38.48,0,5411,0,0,53,66,1,91750,59696,127613,787,0
2,0,622,120.34,0,5411,0,0,53,66,1,91750,59696,127613,787,0
3,0,1745,128.95,0,5651,0,0,53,66,1,91750,59696,127613,787,0
4,0,623,104.71,0,5912,0,0,53,66,1,91750,59696,127613,787,0


In [28]:
df = merged_inner.drop(['User', 'Index'], axis=1)

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15500000 entries, 0 to 15499999
Data columns (total 13 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Time               int64  
 1   Amount             float64
 2   use_chip           int64  
 3   MCC                int64  
 4   transaction_error  int64  
 5   is_fraud           int64  
 6   age                int64  
 7   ret_age            int64  
 8   Gender             int64  
 9   Zipcode            int64  
 10  yearly_income      int64  
 11  total_debt         int64  
 12  fico_score         int64  
dtypes: float64(1), int64(12)
memory usage: 1.6 GB


In [None]:
y=df.is_fraud 
df.drop('is_fraud', axis=1, inplace=True)
X=df

In [None]:
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)

In [None]:
y.value_counts()

is_fraud
0           15481335
1           15481335
dtype: int64

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0, stratify=y)

In [None]:
model = CatBoostClassifier()
model.fit(X_train, y_train,plot=True) 
y_pred = model.predict(X_test)   
print(classification_report(y_test, y_pred))  

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.5
0:	learn: 0.4663000	total: 6.37s	remaining: 1h 46m 7s
1:	learn: 0.4107490	total: 11.3s	remaining: 1h 33m 48s
2:	learn: 0.3610566	total: 16s	remaining: 1h 28m 44s
3:	learn: 0.3284013	total: 22.9s	remaining: 1h 35m 10s
4:	learn: 0.3029885	total: 27.2s	remaining: 1h 30m 9s
5:	learn: 0.2804722	total: 31.8s	remaining: 1h 27m 54s
6:	learn: 0.2656735	total: 38.1s	remaining: 1h 30m 1s
7:	learn: 0.2485244	total: 42.2s	remaining: 1h 27m 14s
8:	learn: 0.2425178	total: 46.5s	remaining: 1h 25m 15s
9:	learn: 0.2376778	total: 53.2s	remaining: 1h 27m 43s
10:	learn: 0.2323761	total: 57.4s	remaining: 1h 26m 5s
11:	learn: 0.2289995	total: 1m 2s	remaining: 1h 25m 6s
12:	learn: 0.2147502	total: 1m 8s	remaining: 1h 26m 25s
13:	learn: 0.2112499	total: 1m 12s	remaining: 1h 25m 5s
14:	learn: 0.2007850	total: 1m 18s	remaining: 1h 25m 48s
15:	learn: 0.1968905	total: 1m 23s	remaining: 1h 25m 51s
16:	learn: 0.1903314	total: 1m 28s	remaining: 1h 25m 3s
17:	learn: 0.1846520	total: 1m 35s	rem

In [None]:
from sklearn.metrics import f1_score,roc_auc_score
print(f1_score(y_test, y_pred)) 
print(roc_auc_score(y_test, y_pred))

0.9879249924684743
0.9879351489130821


In [None]:
model.save_model('model/model.h5')