In [1]:
!pip install -q kaggle

[0m

## Get Data

For this example, we use a [synthetic credit card transactions dataset](https://arxiv.org/abs/1910.03033) available on [Kaggle](https://www.kaggle.com/datasets/ealtman2019/credit-card-transactions). You can either download the dataset directly from this [Kaggle link](https://www.kaggle.com/datasets/ealtman2019/credit-card-transactions) and upload it to your SageMaker notebook instance, or fetch it with the Kaggle command-line client using the following commands.

**NOTE:** You will need to make sure that your Kaggle credentials are [available](https://github.com/Kaggle/kaggle-api#api-credentials) either through a kaggle.json file or via environment variables.

In [2]:
# Download the dataset archive with the Kaggle CLI. This needs Kaggle API
# credentials (a kaggle.json file or environment variables); without them the
# CLI aborts with "Could not find kaggle.json".
!kaggle datasets download -d ealtman2019/credit-card-transactions

Traceback (most recent call last):
  File "/opt/conda/envs/rapids/bin/kaggle", line 5, in <module>
    from kaggle.cli import main
  File "/opt/conda/envs/rapids/lib/python3.8/site-packages/kaggle/__init__.py", line 23, in <module>
    api.authenticate()
  File "/opt/conda/envs/rapids/lib/python3.8/site-packages/kaggle/api/kaggle_api_extended.py", line 164, in authenticate
    raise IOError('Could not find {}. Make sure it\'s located in'
OSError: Could not find kaggle.json. Make sure it's located in /root/.kaggle. Or use the environment method.


In [3]:
# Unpack the downloaded archive next to the notebook.
# NOTE(review): `-u` presumably only extracts files that are missing or newer
# than the copies already on disk - confirm against the unzip manual.
!unzip -u credit-card-transactions.zip

Archive:  credit-card-transactions.zip


## Data Preprocessing

In [66]:
import cudf
import cuml
from cuml.preprocessing import LabelEncoder
import numpy as np
import pickle
import os

In [5]:
# Directory that holds the extracted dataset (the notebook's working directory).
data_path = "./"

In [6]:
# File name of the transactions CSV inside data_path.
data_csv = "credit_card_transactions-ibm_v2.csv"

In [7]:
# Read the transactions CSV into a cuDF DataFrame.
csv_file = os.path.join(data_path, data_csv)
data = cudf.read_csv(csv_file)

In [8]:
# Preview the first five transactions.
data.head(n=5)

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
0,0,0,2002,9,1,06:21,$134.09,Swipe Transaction,3527213246127876953,La Verne,CA,91750.0,5300,,No
1,0,0,2002,9,1,06:42,$38.48,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No
2,0,0,2002,9,2,06:22,$120.34,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No
3,0,0,2002,9,2,17:45,$128.95,Swipe Transaction,3414527459579106770,Monterey Park,CA,91754.0,5651,,No
4,0,0,2002,9,3,06:23,$104.71,Swipe Transaction,5817218446178736267,La Verne,CA,91750.0,5912,,No


In [9]:
# (rows, columns) of the dataset as loaded from the CSV.
data.shape

(24386900, 15)

In [10]:
# Zip, MCC and Merchant Name are handled as categorical features further down,
# so cast them to the 'object' dtype here.
for text_column in ("Zip", "MCC", "Merchant Name"):
    data[text_column] = data[text_column].astype("object")

In [11]:
SEED = 42
# Keep a reproducible 60% random sample and renumber its rows from zero.
data = data.sample(frac=0.6, random_state=SEED).reset_index(drop=True)

In [12]:
# (rows, columns) after down-sampling to 60% of the rows.
data.shape

(14632140, 15)

### Encode labels


In [13]:
# Build the binary target (1 for "Yes", 0 otherwise), then remove the label
# column from the feature frame.
y = (data['Is Fraud?'] == "Yes").astype(int)
data.drop(columns=['Is Fraud?'], inplace=True)

### Save subset for inference

In [14]:
# Draw 100 raw (not yet preprocessed) rows, stringify every column and write
# them to data_infer.csv.
infer_sample = data.sample(n=100, random_state=SEED)
data_infer = infer_sample.reset_index(drop=True).astype(str)
data_infer.to_csv('data_infer.csv', index=False)

### Handle Missing Values

In [15]:
# Percentage of missing values in each column.
missing_per_column = data.isna().sum()
missing_per_column / len(data) * 100

User               0.000000
Card               0.000000
Year               0.000000
Month              0.000000
Day                0.000000
Time               0.000000
Amount             0.000000
Use Chip           0.000000
Merchant Name      0.000000
Merchant City      0.000000
Merchant State    11.160726
Zip               11.806721
MCC                0.000000
Errors?           98.408162
dtype: float64

In [16]:
# Rows whose Merchant City is "ONLINE" get "ONLINE" written into both
# Merchant State and Zip.
for location_column in ("Merchant State", "Zip"):
    data.loc[data["Merchant City"] == "ONLINE", location_column] = "ONLINE"

In [17]:
# Collapse the sparse Errors? column into a 0/1 flag: 1 when any error text
# is present, 0 when the value is missing.
data['Errors?'] = (~data['Errors?'].isna()).astype(int)

In [18]:
# Two-letter US state codes (including DC) plus the "ONLINE" marker.
us_states_plus_online = (
    "AK AL AR AZ CA CO CT DC DE FL GA "
    "HI IA ID IL IN KS KY LA MA MD ME "
    "MI MN MO MS MT NC ND NE NH NJ NM "
    "NV NY OH OK OR PA RI SC SD TN TX "
    "UT VA VT WA WI WV WY ONLINE"
).split()

# Any row whose Merchant State is not in the list above gets the Zip
# placeholder "FOREIGN".
data.loc[~data["Merchant State"].isin(us_states_plus_online), "Zip"] = "FOREIGN"

### Handle Amount and Time

In [19]:
# Amount is text such as "$134.09": drop the first character and parse the
# remainder as float32.
data['Amount'] = data['Amount'].str.slice(start=1).astype('float32')

# Time is "HH:MM" text: characters before index 2 form the hour, characters
# from index 3 onward form the minute.
for time_part, slice_bounds in (('Hour', {'stop': 2}), ('Minute', {'start': 3})):
    data[time_part] = data['Time'].str.slice(**slice_bounds).astype('int64')

# The raw Time column is no longer needed once Hour and Minute exist.
data.drop(columns=['Time'], inplace=True)

###  Train Test Split

In [20]:
from cuml.model_selection import train_test_split

# 70/30 split, stratified on the label and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    y,
    test_size=0.3,
    random_state=SEED,
    stratify=y,
)

In [21]:
# The pre-split frame and labels are no longer used below; drop the references
# and run a garbage-collection pass to free up room on the GPU.
import gc

del data, y
gc.collect()

410

### Encoding Categorical Columns

In [22]:
categorial_columns = ['Zip', 'MCC', 'Merchant Name', 'Use Chip', 'Merchant City', 'Merchant State']
encoders = {}

for column in categorial_columns:
    # Categories are learned from the training split only.
    train_categories = X_train[column].unique().values_host

    # Test values never seen in training are folded into an 'UNKNOWN' bucket.
    unseen_in_test = ~X_test[column].isin(train_categories)
    X_test.loc[unseen_in_test, column] = 'UNKNOWN'

    # Fit the encoder on the training categories plus the 'UNKNOWN' bucket
    # (wrapped in a cuDF Series before fitting).
    vocabulary = cudf.Series(np.append(train_categories, ['UNKNOWN']))
    encoder = LabelEncoder().fit(vocabulary)

    X_train[column] = encoder.transform(X_train[column])
    X_test[column] = encoder.transform(X_test[column])

    # Keep the fitted classes as a host array, keyed by column name.
    encoders[column] = encoder.classes_.values_host

In [23]:
# Persist the per-column class arrays (presumably so the same encoding can be
# rebuilt at inference time).
with open('label_encoders.pkl', mode='wb') as encoder_file:
    pickle.dump(encoders, encoder_file)

In [24]:
# Cast every feature column to float32. The two casts stay sequential so the
# old training frame can be released before the test frame is converted.
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)

## Train XGBoost

In [25]:
import xgboost as xgb

# Wrap the feature frames and their labels in XGBoost DMatrix objects.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# GPU histogram training of a binary classifier, with area under the
# precision-recall curve as the evaluation metric.
xgb_params = dict(
    max_depth=8,
    tree_method='gpu_hist',
    objective='binary:logistic',
    eval_metric='aucpr',
    predictor='gpu_predictor',
)
n_rounds = 2000

In [26]:
import time

# Train for n_rounds boosting rounds and report the wall-clock duration.
start = time.time()
model = xgb.train(xgb_params, dtrain, num_boost_round=n_rounds)
elapsed = time.time() - start
print("Training Time", elapsed, "seconds")

Training Time 173.77175045013428 seconds


In [27]:
# Score the held-out split with the trained booster.
y_score = model.predict(dtest)

# Decision cut-off applied to the scores. The comparison below reads this
# variable (it was previously a hard-coded 0.5), so changing the value here
# actually changes the predicted labels.
threshold = 0.5
y_pred = (y_score >= threshold).astype(int)

In [28]:
from sklearn.metrics import f1_score

# Copy the test labels to host memory and compare them with the thresholded
# predictions.
y_true = y_test.values_host
f1 = f1_score(y_true=y_true, y_pred=y_pred)
print(f'Test F1-Score: {f1: 0.4f}')

Test F1-Score:  0.8588


### Save Trained Model

In [60]:
# Destination file for the trained model.
model_path = './xgboost.json'

In [61]:
# Write the trained booster to model_path; the FIL section below loads the
# model back from this file.
model.save_model(model_path)

## CPU GPU FIL Benchmarks

In [82]:
%%time
model.set_param({"predictor": "cpu_predictor"})
cpu_preds = model.predict(dtrain)

CPU times: user 25 ms, sys: 7.96 ms, total: 33 ms
Wall time: 63.6 ms


In [83]:
%%time
model.set_param({"predictor": "gpu_predictor"})
gpu_preds = model.predict(dtrain)

CPU times: user 18.2 ms, sys: 3.9 ms, total: 22.1 ms
Wall time: 21.5 ms


###  Prediction using Forest Inference Library (FIL)

In [76]:
# Load the model file saved above into cuML's Forest Inference Library,
# declaring it as an XGBoost JSON model with class output enabled.
fil = cuml.ForestInference.load(
    filename=model_path,
    model_type='xgboost_json',
    output_class=True,
)

In [81]:
%%time
# Time FIL probability prediction over the full training feature frame.
fil_preds = fil.predict_proba(X_train)

CPU times: user 5.57 s, sys: 2.76 s, total: 8.32 s
Wall time: 8.44 s


In [28]:
import IPython

# Shut the kernel down and ask for a restart (the True argument); presumably
# done to release the GPU memory held by this session.
ipython_app = IPython.Application.instance()
ipython_app.kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}