In [1]:
import cudf
import os

## Download data

For this example we use the Tabformer credit card transactions dataset from IBM. You can download the Tabformer credit card transaction data from this link: https://ibm.ent.box.com/v/tabformer-data/folder/130747715605

## Data Preprocessing

In [2]:
data_path = './'

In [3]:
gdf = cudf.read_csv(os.path.join(data_path, 'card_transaction.v1.csv'))

In [4]:
gdf.head()

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
0,0,0,2002,9,1,06:21,$134.09,Swipe Transaction,3527213246127876953,La Verne,CA,91750.0,5300,,No
1,0,0,2002,9,1,06:42,$38.48,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No
2,0,0,2002,9,2,06:22,$120.34,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No
3,0,0,2002,9,2,17:45,$128.95,Swipe Transaction,3414527459579106770,Monterey Park,CA,91754.0,5651,,No
4,0,0,2002,9,3,06:23,$104.71,Swipe Transaction,5817218446178736267,La Verne,CA,91750.0,5912,,No


In [5]:
gdf.shape

(24386900, 15)

In [6]:
 # drop irrelevant columns
gdf = gdf.drop(columns=['User', 'Card', 'Errors?', 'Year'])
gdf["Merchant City"] = gdf["Merchant City"].str.lstrip()

In [7]:
gdf['Zip'] = gdf['Zip'].astype('object')
gdf['MCC'] = gdf['MCC'].astype('object')
gdf["Merchant Name"] = gdf["Merchant Name"].astype("object")

In [8]:
#  # drop irrelevant columns
# gdf = gdf.drop(columns=['User', 'Card', 'Errors?', 'Year'])
# gdf['Zip'] = gdf['Zip'].astype('category')
# gdf['MCC'] = gdf['MCC'].astype('category')
# gdf["Merchant Name"] = gdf["Merchant Name"].astype("category")
# gdf["Merchant State"] = gdf["Merchant State"].astype("category")

In [9]:
gdf.dtypes

Month              int64
Day                int64
Time              object
Amount            object
Use Chip          object
Merchant Name     object
Merchant City     object
Merchant State    object
Zip               object
MCC               object
Is Fraud?         object
dtype: object

In [10]:
N = 500_000
SEED=42
subset_df = gdf.sample(n=N, random_state=SEED)

### Encode labels


In [11]:
subset_df["Is Fraud?"] = (subset_df["Is Fraud?"] == "Yes").astype(int)

###  Train Test Split

In [12]:
subset_df.head()

Unnamed: 0,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Is Fraud?
9528451,5,12,23:34,$-87.00,Swipe Transaction,-5162038175624867091,Alpine,UT,84004.0,5541,0
19199242,11,15,11:19,$4.37,Chip Transaction,4722913068560264812,Des Moines,IA,50317.0,5411,0
18728798,5,1,18:00,$18.85,Chip Transaction,-5162038175624867091,Las Vegas,NV,89118.0,5541,0
21451881,10,27,13:55,$19.77,Swipe Transaction,-7146670748125200898,Bellwood,IL,60104.0,5970,0
15483620,6,27,06:22,$1.35,Chip Transaction,6666504894937430109,Indianapolis,IN,46256.0,5499,0


In [13]:
from cuml.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X=subset_df, y="Is Fraud?", shuffle=True, test_size=0.2, random_state=SEED)

### Handle Missing Values

In [14]:
X_train.isna().sum()/len(X_train) * 100

Month              0.0000
Day                0.0000
Time               0.0000
Amount             0.0000
Use Chip           0.0000
Merchant Name      0.0000
Merchant City      0.0000
Merchant State    11.1525
Zip               11.8080
MCC                0.0000
dtype: float64

In [15]:
X_train.loc[X_train["Merchant City"]=="ONLINE", "Merchant State"] = "ONLINE" 
X_train.loc[X_train["Merchant City"]=="ONLINE", "Zip"] = "ONLINE" 

X_test.loc[X_test["Merchant City"]=="ONLINE", "Merchant State"] = "ONLINE" 
X_test.loc[X_test["Merchant City"]=="ONLINE", "Zip"] = "ONLINE" 

In [16]:
us_states_plus_online = ['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA',
           'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME',
           'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM',
           'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
           'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'ONLINE']

In [17]:
X_test.loc[~X_test["Merchant State"].isin(us_states_plus_online), "Zip"] = "FOREIGN"
X_test.loc[~X_test["Merchant State"].isin(us_states_plus_online), "Zip"] = "FOREIGN"

In [18]:
X_train.isna().sum()/len(X_train) * 100

Month             0.0000
Day               0.0000
Time              0.0000
Amount            0.0000
Use Chip          0.0000
Merchant Name     0.0000
Merchant City     0.0000
Merchant State    0.0000
Zip               0.6555
MCC               0.0000
dtype: float64

In [19]:
X_test.isna().sum()/len(X_test) * 100

Month             0.0
Day               0.0
Time              0.0
Amount            0.0
Use Chip          0.0
Merchant Name     0.0
Merchant City     0.0
Merchant State    0.0
Zip               0.0
MCC               0.0
dtype: float64

In [20]:
X_train.head()

Unnamed: 0,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC
17345379,8,21,20:21,$32.64,Swipe Transaction,-7052069146128772826,Plattsburgh,NY,12901.0,5814
5867631,9,5,13:16,$0.44,Swipe Transaction,2027553650310142703,Latrobe,PA,15650.0,5541
23316032,3,31,13:32,$2.49,Chip Transaction,-6571010470072147219,Richmond,CA,94804.0,5499
1669285,1,21,17:28,$46.46,Swipe Transaction,-4500542936415012428,Reisterstown,MD,21136.0,5814
8009537,6,22,12:47,$109.48,Swipe Transaction,-2191856220998903211,Grand Junction,CO,81506.0,5310


### Handle Amount and Time

In [21]:
X_train['Amount'] = X_train['Amount'].str.slice(1).astype('float32')
X_test['Amount'] = X_test['Amount'].str.slice(1).astype('float32')

In [22]:
X_train['Hour'] = X_train['Time'].str.slice(stop=2).astype('int32')
X_train['Minute'] = X_train['Time'].str.slice(start=3).astype('int32')
X_train.drop(columns=['Time'], inplace=True)

In [23]:
X_test['Hour'] = X_test['Time'].str.slice(stop=2).astype('int32')
X_test['Minute'] = X_test['Time'].str.slice(start=3).astype('int32')
X_test.drop(columns=['Time'], inplace=True)

In [24]:
X_train.head()

Unnamed: 0,Month,Day,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Hour,Minute
17345379,8,21,32.639999,Swipe Transaction,-7052069146128772826,Plattsburgh,NY,12901.0,5814,20,21
5867631,9,5,0.44,Swipe Transaction,2027553650310142703,Latrobe,PA,15650.0,5541,13,16
23316032,3,31,2.49,Chip Transaction,-6571010470072147219,Richmond,CA,94804.0,5499,13,32
1669285,1,21,46.459999,Swipe Transaction,-4500542936415012428,Reisterstown,MD,21136.0,5814,17,28
8009537,6,22,109.480003,Swipe Transaction,-2191856220998903211,Grand Junction,CO,81506.0,5310,12,47


### Target Encoding
 We have high cardinality categorical columns in our data like merchant_city, merchant_state, zip, mcc (stands for merchant category code) with large number of unique categories. If we were to one-hot encode these, our feature set would blow up and we would be hit hard with the curse of dimensionality. Additionally it would lead to either huge memory consumption and very sparse data.  For Categorical Columns with lots of levels instead of One-Hot Encoding we can use TargetEncoding where each category in the column is replaced with the mean target value for that category. This way we can still effectively represent a categorical column and it only takes up the space of one feature. cuML's implementation of TargetEncoding uses several optimizations to prevent label leakage and parallelize the execution. To learn more about Target Encoding in cuML check out this target encoder walkthrough.

In [25]:
from cuml.preprocessing import TargetEncoder

In [26]:
high_cardinality_cols = ["Merchant Name", "Merchant City", "Merchant State", "Zip", "MCC"]
for col in high_cardinality_cols:
    # we append TE to column name to indicate we have target encoded it
    out_col = f'{col}_TE'
    tgt_encoder = TargetEncoder(smooth=0.001)
    X_train[out_col] = tgt_encoder.fit_transform(X_train[col], y_train)
    X_test[out_col] = tgt_encoder.transform(X_test[col])
# drop old columns
X_train.drop(columns=high_cardinality_cols, inplace=True)
X_test.drop(columns=high_cardinality_cols, inplace=True)

In [27]:
X_train.head()

Unnamed: 0,Month,Day,Amount,Use Chip,Hour,Minute,Merchant Name_TE,Merchant City_TE,Merchant State_TE,Zip_TE,MCC_TE
17345379,8,21,32.639999,Swipe Transaction,20,21,2.584383e-09,1.038127e-08,0.0002842525,1.038127e-08,0.000273
5867631,9,5,0.44,Swipe Transaction,13,16,7.844992e-05,8.333277e-09,1.192562e-10,8.333277e-09,9.2e-05
23316032,3,31,2.49,Chip Transaction,13,32,1.71185e-10,0.001094092,0.0002205072,3.319774e-09,6e-05
1669285,1,21,46.459999,Swipe Transaction,17,28,5.508091e-10,1.012388e-08,2.608603e-10,1.012388e-08,0.000544
8009537,6,22,109.480003,Swipe Transaction,12,47,6.121939e-07,8.566374e-09,0.0003053438,1.113626e-08,0.004292


### One Hot Encoding
We will now one-hot encode rest of the low cardinality categorical columns like `Use Chip` which has 3 unique categories. We can easily accomplish this through cudf's get_dummies function (just like in Pandas).

In [28]:
oneh_enc_cols = ["Use Chip"]
X_train = cudf.get_dummies(X_train)
X_test = cudf.get_dummies(X_test)

In [29]:
X_train.head()

Unnamed: 0,Month,Day,Amount,Hour,Minute,Merchant Name_TE,Merchant City_TE,Merchant State_TE,Zip_TE,MCC_TE,Use Chip_Chip Transaction,Use Chip_Online Transaction,Use Chip_Swipe Transaction
17345379,8,21,32.639999,20,21,2.584383e-09,1.038127e-08,0.0002842525,1.038127e-08,0.000273,0,0,1
5867631,9,5,0.44,13,16,7.844992e-05,8.333277e-09,1.192562e-10,8.333277e-09,9.2e-05,0,0,1
23316032,3,31,2.49,13,32,1.71185e-10,0.001094092,0.0002205072,3.319774e-09,6e-05,1,0,0
1669285,1,21,46.459999,17,28,5.508091e-10,1.012388e-08,2.608603e-10,1.012388e-08,0.000544,0,0,1
8009537,6,22,109.480003,12,47,6.121939e-07,8.566374e-09,0.0003053438,1.113626e-08,0.004292,0,0,1


In [30]:
X_test.head()

Unnamed: 0,Month,Day,Amount,Hour,Minute,Merchant Name_TE,Merchant City_TE,Merchant State_TE,Zip_TE,MCC_TE,Use Chip_Chip Transaction,Use Chip_Online Transaction,Use Chip_Swipe Transaction
23091900,2,26,33.139999,10,24,1.605502e-09,0.006971531,0.006971531,0.006971531,6.663765e-11,0,1,0
19137996,6,11,67.110001,19,38,7.655772e-08,3.310802e-09,0.0002131994,3.310802e-09,0.001242545,0,0,1
5510331,3,6,70.599998,19,53,5.642558e-10,5.125502e-09,2.514367e-10,5.125502e-09,0.0002715455,0,0,1
1107242,7,17,0.69,7,41,3.183471e-10,1.065208e-08,9.847377e-05,1.611821e-08,9.092565e-05,0,0,1
15098562,12,24,58.200001,18,49,0.0001652621,2.378636e-09,0.0002488491,2.406675e-09,0.00064555,0,0,1


In [31]:
# from cuml.preprocessing import SimpleImputer
# mode_cols = ["Merchant State", "Zip"]

# for col in mode_cols:
#     mode_imputer = SimpleImputer(sarategy="most_frequent")
#     X_train[[col]] = mode_imputer.fit_transform(X_train[[col]])
#     X_test[[col]] = mode_imputer.transform(X_test[[col]])
    
# for col in mode_cols:
#     mode_imputer = SimpleImputer(strategy="constant", fill_value='UNKNOWN')
#     X_train[[col]] = mode_imputer.fit_transform(X_train[[col]])
#     X_test[[col]] = mode_imputer.transform(X_test[[col]])

## Train XGBoost

In [32]:
import xgboost as xgb

In [33]:
xgb.__version__

'1.5.2'

In [34]:
# Free up some room on the GPU by explicitly deleting dataframes
import gc
del gdf
del subset_df
gc.collect()

115

In [35]:
# Define model training function
def train_model(num_trees, max_depth):
    model = xgb.XGBClassifier(
        tree_method='gpu_hist',
        predictor='gpu_predictor',
        eval_metric='aucpr',
        use_label_encoder=False,
        objective='binary:logistic',
        max_depth=max_depth,
        n_estimators=num_trees
    )
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)]
    )
    return model

In [60]:
# Train a large model with 5000 trees and a maximum depth of 12
model = train_model(5000, 12)

[0]	validation_0-aucpr:0.29707
[1]	validation_0-aucpr:0.28728
[2]	validation_0-aucpr:0.28588
[3]	validation_0-aucpr:0.27819
[4]	validation_0-aucpr:0.29361
[5]	validation_0-aucpr:0.30496
[6]	validation_0-aucpr:0.32412
[7]	validation_0-aucpr:0.32911
[8]	validation_0-aucpr:0.33057
[9]	validation_0-aucpr:0.32777
[10]	validation_0-aucpr:0.33703
[11]	validation_0-aucpr:0.34766
[12]	validation_0-aucpr:0.35306
[13]	validation_0-aucpr:0.35520
[14]	validation_0-aucpr:0.35772
[15]	validation_0-aucpr:0.36572
[16]	validation_0-aucpr:0.36640
[17]	validation_0-aucpr:0.36771
[18]	validation_0-aucpr:0.36767
[19]	validation_0-aucpr:0.36000
[20]	validation_0-aucpr:0.35838
[21]	validation_0-aucpr:0.36165
[22]	validation_0-aucpr:0.36252
[23]	validation_0-aucpr:0.36133
[24]	validation_0-aucpr:0.36218
[25]	validation_0-aucpr:0.36154
[26]	validation_0-aucpr:0.36016
[27]	validation_0-aucpr:0.36311
[28]	validation_0-aucpr:0.36323
[29]	validation_0-aucpr:0.36522
[30]	validation_0-aucpr:0.36607
[31]	validation_0-

In [61]:
# aucpr baseline
y_test.sum()/len(y_test) 

0.00136

In [68]:
y_score = model.predict_proba(X_test)[:, 1]



In [69]:
from sklearn.metrics import average_precision_score

y_true = y_test.to_numpy()
ap = average_precision_score(y_true, y_score)
print(f'Test Area Under Precision Recall Curve: {ap: 0.4f}')

Test Area Under Precision Recall Curve:  0.3452


In [64]:
model_path = "./xgboost_model.json"

In [66]:
model.save_model(model_path)

In [70]:
xgb = xgb.XGBClassifier()
xgb.load_model(model_path)


In [None]:
xgb