# Library

In [65]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import xgboost as xgb
from tqdm import tqdm
import matplotlib.pyplot as plt

# Data Preprocessing

In [2]:
train_transaction = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv')
train_identity = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_identity.csv')

test_transaction = pd.read_csv('/kaggle/input/ieee-fraud-detection/test_transaction.csv')
test_identity = pd.read_csv('/kaggle/input/ieee-fraud-detection/test_identity.csv')

In [3]:
train_transaction.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
train_identity.head()

Unnamed: 0,TransactionID,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987004,0.0,70787.0,,,,,,,,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
1,2987008,-5.0,98945.0,,,0.0,-5.0,,,,...,mobile safari 11.0,32.0,1334x750,match_status:1,T,F,F,T,mobile,iOS Device
2,2987010,-5.0,191631.0,0.0,0.0,0.0,0.0,,,0.0,...,chrome 62.0,,,,F,F,T,T,desktop,Windows
3,2987011,-5.0,221832.0,,,0.0,-6.0,,,,...,chrome 62.0,,,,F,F,T,T,desktop,
4,2987016,0.0,7460.0,0.0,0.0,1.0,0.0,,,0.0,...,chrome 62.0,24.0,1280x800,match_status:2,T,F,T,T,desktop,MacOS


In [5]:
train_data = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
train_data.shape

(590540, 434)

In [6]:
test_data = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')
test_data.shape

(506691, 433)

In [7]:
train_data.describe()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,...,id_17,id_18,id_19,id_20,id_21,id_22,id_24,id_25,id_26,id_32
count,590540.0,590540.0,590540.0,590540.0,590540.0,581607.0,588975.0,586281.0,524834.0,524834.0,...,139369.0,45113.0,139318.0,139261.0,5159.0,5169.0,4747.0,5132.0,5163.0,77586.0
mean,3282270.0,0.03499,7372311.0,135.027176,9898.734658,362.555488,153.194925,199.278897,290.733794,86.80063,...,189.451377,14.237337,353.128174,403.882666,368.26982,16.002708,12.800927,329.608924,149.070308,26.508597
std,170474.4,0.183755,4617224.0,239.162522,4901.170153,157.793246,11.336444,41.244453,101.741072,2.690623,...,30.37536,1.561302,141.095343,152.160327,198.847038,6.897665,2.372447,97.461089,32.101995,3.737502
min,2987000.0,0.0,86400.0,0.251,1000.0,100.0,100.0,100.0,100.0,10.0,...,100.0,10.0,100.0,100.0,100.0,10.0,11.0,100.0,100.0,0.0
25%,3134635.0,0.0,3027058.0,43.321,6019.0,214.0,150.0,166.0,204.0,87.0,...,166.0,13.0,266.0,256.0,252.0,14.0,11.0,321.0,119.0,24.0
50%,3282270.0,0.0,7306528.0,68.769,9678.0,361.0,150.0,226.0,299.0,87.0,...,166.0,15.0,341.0,472.0,252.0,14.0,11.0,321.0,149.0,24.0
75%,3429904.0,0.0,11246620.0,125.0,14184.0,512.0,150.0,226.0,330.0,87.0,...,225.0,15.0,427.0,533.0,486.5,14.0,15.0,371.0,169.0,32.0
max,3577539.0,1.0,15811130.0,31937.391,18396.0,600.0,231.0,237.0,540.0,102.0,...,229.0,29.0,671.0,661.0,854.0,44.0,26.0,548.0,216.0,32.0


In [8]:
def standardize_column_names(dataframe):
    dataframe.columns = dataframe.columns.str.replace('-', '_', regex=False)
    return dataframe

train_data_standardized = standardize_column_names(train_data)
test_data_standardized = standardize_column_names(test_data)

In [59]:
train_data['P_emaildomain'].value_counts()

P_emaildomain
gmail.com           228355
yahoo.com           100934
hotmail.com          45250
anonymous.com        36998
aol.com              28289
comcast.net           7888
icloud.com            6267
outlook.com           5096
msn.com               4092
att.net               4033
live.com              3041
sbcglobal.net         2970
verizon.net           2705
ymail.com             2396
bellsouth.net         1909
yahoo.com.mx          1543
me.com                1522
cox.net               1393
optonline.net         1011
charter.net            816
live.com.mx            749
rocketmail.com         664
mail.com               559
earthlink.net          514
gmail                  496
outlook.es             438
mac.com                436
juno.com               322
aim.com                315
hotmail.es             305
roadrunner.com         305
windstream.net         305
hotmail.fr             295
frontier.com           280
embarqmail.com         260
web.de                 240
netzero.com   

In [63]:
train_data['card5'].value_counts()

card5
226.0    296546
224.0     81513
166.0     57140
102.0     29105
117.0     25941
          ...  
173.0         1
201.0         1
221.0         1
234.0         1
196.0         1
Name: count, Length: 119, dtype: int64

In [9]:
# Check for missing values
missing_values = train_data_standardized.isnull().sum()
print(missing_values[missing_values > 0])

card2           8933
card3           1565
card4           1577
card5           4259
card6           1571
               ...  
id_36         449555
id_37         449555
id_38         449555
DeviceType    449730
DeviceInfo    471874
Length: 414, dtype: int64


## Feature Selection

In [10]:
# A lot of columns have missing values nearly for all rows, will remove features with >90% missing values
def high_missing_values(dataframe, threshold=0.9):
    return [
        col for col in dataframe.columns 
        if dataframe[col].isnull().sum() / dataframe.shape[0] > threshold
    ]

remove_cols_train = high_missing_values(train_data_standardized)
remove_cols_test = high_missing_values(test_data_standardized)

merge_rm_cols = list(set(remove_cols_train + remove_cols_test))

print("Removed columns:", merge_rm_cols)

Removed columns: ['id_21', 'id_26', 'id_18', 'id_27', 'dist2', 'id_24', 'id_08', 'id_25', 'id_23', 'id_22', 'D7', 'id_07']


In [11]:
train_data_cleaned = train_data_standardized.drop(merge_rm_cols, axis=1)
test_data_cleaned = test_data_standardized.drop(merge_rm_cols, axis=1)

In [12]:
y = train_data_cleaned['isFraud']
train_data_cleaned = train_data_cleaned.drop(columns=['isFraud'])

In [13]:
# Impute missing values
# Separate numerical and categorical columns
numerical_cols = train_data_cleaned.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = train_data_cleaned.select_dtypes(include=['object']).columns

# Impute numerical columns with mean and categorical columns with mode
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

In [14]:
train_data_cleaned[numerical_cols] = num_imputer.fit_transform(train_data_cleaned[numerical_cols])
train_data_cleaned[categorical_cols] = cat_imputer.fit_transform(train_data_cleaned[categorical_cols])

test_data_cleaned[numerical_cols] = num_imputer.transform(test_data_cleaned[numerical_cols])
test_data_cleaned[categorical_cols] = cat_imputer.transform(test_data_cleaned[categorical_cols])

In [15]:
for col in tqdm(train_data_cleaned.columns): 
    if col in categorical_cols:
        le = LabelEncoder()
        le.fit(list(train_data_cleaned[col].astype(str).values) + list(test_data_cleaned[col].astype(str).values))
        train_data_cleaned[col] = le.transform(train_data_cleaned[col].astype(str).values)
        test_data_cleaned[col] = le.transform(test_data_cleaned[col].astype(str).values)

100%|██████████| 421/421 [00:41<00:00, 10.04it/s]


# Modeling

In [42]:
X = train_data_cleaned 
X_test = test_data_cleaned

In [28]:
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "LightGBM": lgb.LGBMClassifier(n_estimators=100, random_state=42, device='gpu'),
    "XGBoost": xgb.XGBClassifier(n_estimators=100, random_state=42, tree_method = "hist", device = "cuda")  
}


In [29]:
y.value_counts()

isFraud
0    569877
1     20663
Name: count, dtype: int64

In [33]:
X_sampled = X.sample(frac=0.15, random_state=42)
y_sampled = y.sample(frac=0.15, random_state=42)

y_sampled.value_counts()

isFraud
0    85430
1     3151
Name: count, dtype: int64

In [34]:
cv_results = {}

for model_name, model in tqdm(models.items(), desc="Evaluating Models"):
    cv_scores = cross_val_score(model, X_sampled, y_sampled, cv=5, scoring='roc_auc')
    
    cv_results[model_name] = cv_scores
    
    print(f'{model_name} - Cross-Validation ROC AUC Scores: {cv_scores}')
    print(f'{model_name} - Mean Cross-Validation ROC AUC: {cv_scores.mean():.4f}\n')

Evaluating Models:  33%|███▎      | 1/3 [03:37<07:14, 217.35s/it]

Random Forest - Cross-Validation ROC AUC Scores: [0.89188682 0.88975222 0.88133708 0.90101053 0.89591107]
Random Forest - Mean Cross-Validation ROC AUC: 0.8920

[LightGBM] [Info] Number of positive: 2520, number of negative: 68344
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 32343
[LightGBM] [Info] Number of data points in the train set: 70864, number of used features: 419
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 72 dense feature groups (4.87 MB) transferred to GPU in 0.005766 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.035561 -> initscore=-3.300295
[LightGBM] [Info] Start training from score -3.300295
[LightGBM] [Info] Number of positive: 2521, number of negative: 68344
[LightGBM] [Info] This is the GPU trainer!!
[LightG

Evaluating Models:  67%|██████▋   | 2/3 [04:04<01:45, 105.46s/it]

LightGBM - Cross-Validation ROC AUC Scores: [0.90667784 0.9145149  0.90698353 0.9255018  0.91878239]
LightGBM - Mean Cross-Validation ROC AUC: 0.9145



Evaluating Models: 100%|██████████| 3/3 [04:18<00:00, 86.16s/it] 

XGBoost - Cross-Validation ROC AUC Scores: [0.9088031  0.91340947 0.90800776 0.92559099 0.91669305]
XGBoost - Mean Cross-Validation ROC AUC: 0.9145






In [37]:
for model_name, scores in cv_results.items():
    print(f'{model_name} - Mean ROC AUC: {scores.mean():.7f}')


Random Forest - Mean ROC AUC: 0.8919795
LightGBM - Mean ROC AUC: 0.9144921
XGBoost - Mean ROC AUC: 0.9145009


## train all data

In [45]:
models2 = {
    "LightGBM": lgb.LGBMClassifier(n_estimators=100, random_state=42, device='gpu', max_bin=255, n_jobs=-1),
    "XGBoost": xgb.XGBClassifier(n_estimators=100, random_state=42, tree_method = "hist", device = "cuda",n_jobs=-1)  
}

cv_results2 = {}

for model_name, model in tqdm(models2.items(), desc="Evaluating Models"):
    cv_scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')
    
    cv_results[model_name] = cv_scores
    
    print(f'{model_name} - Cross-Validation ROC AUC Scores: {cv_scores}')
    print(f'{model_name} - Mean Cross-Validation ROC AUC: {cv_scores.mean():.4f}\n')

Evaluating Models:   0%|          | 0/2 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 16531, number of negative: 455901
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 35991
[LightGBM] [Info] Number of data points in the train set: 472432, number of used features: 420
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 60 dense feature groups (27.03 MB) transferred to GPU in 0.026701 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.034991 -> initscore=-3.317038
[LightGBM] [Info] Start training from score -3.317038
[LightGBM] [Info] Number of positive: 16531, number of negative: 455901
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 37563
[LightGBM] [Info] Number of data points in the train set: 472432, number of used features: 421
[LightGBM] [Info] Using GPU Devi

[LightGBM] [Fatal] bin size 257 cannot run on GPU


[LightGBM] [Info] Number of positive: 16530, number of negative: 455902
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 32952
[LightGBM] [Info] Number of data points in the train set: 472432, number of used features: 420
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 77 dense feature groups (36.04 MB) transferred to GPU in 0.033081 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.034989 -> initscore=-3.317101
[LightGBM] [Info] Start training from score -3.317101


1 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 1187, in fit
    super().fit(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/sklearn.py", line 885, in fit
    self._Booster = train(
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/engine.py", line 255, in train
    booster = Booster(params=params, train_set=train_set)
  File "/opt/conda/lib/python3.10/site-packages/lightgbm/

LightGBM - Cross-Validation ROC AUC Scores: [0.88036597 0.82397569 0.86492276        nan 0.89394311]
LightGBM - Mean Cross-Validation ROC AUC: nan



Evaluating Models: 100%|██████████| 2/2 [02:52<00:00, 86.46s/it] 

XGBoost - Cross-Validation ROC AUC Scores: [0.84184598 0.76636162 0.88105533 0.92493564 0.90544278]
XGBoost - Mean Cross-Validation ROC AUC: 0.8639






# Submission

In [48]:
sample_submission = pd.read_csv('/kaggle/input/ieee-fraud-detection/sample_submission.csv')

In [49]:
sample_submission

Unnamed: 0,TransactionID,isFraud
0,3663549,0.5
1,3663550,0.5
2,3663551,0.5
3,3663552,0.5
4,3663553,0.5
...,...,...
506686,4170235,0.5
506687,4170236,0.5
506688,4170237,0.5
506689,4170238,0.5


In [51]:
# lgb_model = lgb.LGBMClassifier(n_estimators=100, random_state=42, device='gpu', max_bin=255, n_jobs=-1)
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42, tree_method = "hist", device = "cuda", n_jobs=-1)

xgb_model.fit(X, y)

# lgb_preds = lgb_model.predict_proba(X_test)[:, 1]  
xgb_preds = xgb_model.predict_proba(X_test)[:, 1]

In [54]:
sample_submission['isFraud'] = xgb_preds
sample_submission


Unnamed: 0,TransactionID,isFraud
0,3663549,0.003574
1,3663550,0.015365
2,3663551,0.017868
3,3663552,0.002045
4,3663553,0.002323
...,...,...
506686,4170235,0.018111
506687,4170236,0.011219
506688,4170237,0.007742
506689,4170238,0.013825


In [57]:
sample_submission.to_csv('submission.csv', index=False)

In [70]:
import pickle

with open('xgb_model.pkl', 'wb') as model_file:
    pickle.dump(xgb_model, model_file)