In [1]:
# !pip install -q kaggle

## Get Data

For this example, we use a [synthetic credit card transactions dataset](https://arxiv.org/abs/1910.03033) available on [Kaggle](https://www.kaggle.com/datasets/ealtman2019/credit-card-transactions). You can either download the dataset directly from this [Kaggle link](https://www.kaggle.com/datasets/ealtman2019/credit-card-transactions) and then upload it to your SageMaker notebook instance, or fetch it with the Kaggle command-line client using the commands below.

**NOTE:** You will need to make sure that your Kaggle credentials are [available](https://github.com/Kaggle/kaggle-api#api-credentials) either through a kaggle.json file or via environment variables.

In [2]:
# !kaggle datasets download -d ealtman2019/credit-card-transactions

In [3]:
# !unzip -u credit-card-transactions.zip

## Data Preprocessing

In [4]:
import cudf
from cuml.preprocessing import LabelEncoder
import numpy as np
import pickle
import os

In [5]:
# Directory that holds the downloaded dataset; './' is the notebook's working directory.
data_path = './'

In [6]:
# File name of the transactions CSV; it is joined with data_path when the data is read.
data_csv = 'credit_card_transactions-ibm_v2.csv'

In [7]:
# Read the transactions CSV into a cuDF DataFrame.
csv_path = os.path.join(data_path, data_csv)
data = cudf.read_csv(csv_path)

In [8]:
# (rows, columns) of the dataset as loaded.
data.shape

(24386900, 15)

In [9]:
# Zip: cast to string and drop the last two characters
# (presumably the trailing '.0' left by a float parse -- TODO confirm).
zip_as_text = data['Zip'].astype('object')
data['Zip'] = zip_as_text.str.slice(0, -2)

# MCC and Merchant Name are cast to 'object' so they are treated as
# categorical identifiers rather than numbers.
for id_col in ('MCC', 'Merchant Name'):
    data[id_col] = data[id_col].astype('object')

In [10]:
# NOTE(review): N is defined here but is not referenced by the sampling below.
N = 500_000
SEED = 42
SAMPLE_FRAC = 0.6  # fraction of rows kept for the rest of the notebook

# Keep a reproducible random subset of the rows (seeded with SEED).
data = data.sample(frac=SAMPLE_FRAC, random_state=SEED)

In [11]:
# (rows, columns) after sampling.
data.shape

(14632140, 15)

### Encode labels


In [12]:
# Turn the "Is Fraud?" column into an integer target: 1 where the value is "Yes", else 0.
is_fraud = data["Is Fraud?"] == "Yes"
data["Is Fraud?"] = is_fraud.astype(int)

### Handle Missing Values

In [13]:
# Percentage of missing values in each column.
null_counts = data.isna().sum()
null_counts / len(data) * 100

User               0.000000
Card               0.000000
Year               0.000000
Month              0.000000
Day                0.000000
Time               0.000000
Amount             0.000000
Use Chip           0.000000
Merchant Name      0.000000
Merchant City      0.000000
Merchant State    11.160726
Zip               11.806721
MCC                0.000000
Errors?           98.408162
Is Fraud?          0.000000
dtype: float64

In [14]:
# Rows whose Merchant City is "ONLINE" get "ONLINE" written into both
# Merchant State and Zip, so those fields are no longer missing for them.
is_online = data["Merchant City"] == "ONLINE"
for location_col in ("Merchant State", "Zip"):
    data.loc[is_online, location_col] = "ONLINE"

In [15]:
# Reduce 'Errors?' to a 0/1 flag: 1 when a value is present, 0 when it is missing.
has_error = data['Errors?'].notna()
data['Errors?'] = has_error.astype(int)

In [16]:
# Two-letter codes for the US states and DC, followed by the 'ONLINE' marker
# used for online merchants. Built by splitting a whitespace-separated string.
us_states_plus_online = (
    'AK AL AR AZ CA CO CT DC DE FL GA '
    'HI IA ID IL IN KS KY LA MA MD ME '
    'MI MN MO MS MT NC ND NE NH NJ NM '
    'NV NY OH OK OR PA RI SC SD TN TX '
    'UT VA VT WA WI WV WY ONLINE'
).split()

# Every row whose Merchant State is not in us_states_plus_online gets its Zip set to "FOREIGN".
is_us_or_online = data["Merchant State"].isin(us_states_plus_online)
data.loc[~is_us_or_online, "Zip"] = "FOREIGN"

### Handle Amount and Time

In [17]:
# Amount: drop the first character (presumably a leading currency symbol --
# TODO confirm) and parse the remainder as float32.
amount_text = data['Amount'].str.slice(1)
data['Amount'] = amount_text.astype('float32')

# Time: the first two characters become Hour and everything from position 3
# onward becomes Minute (presumably 'HH:MM' strings -- verify against the raw data).
time_text = data['Time']
data['Hour'] = time_text.str.slice(stop=2).astype('int64')
data['Minute'] = time_text.str.slice(start=3).astype('int64')

# The raw Time column is redundant once Hour and Minute exist.
data.drop(columns=['Time'], inplace=True)

In [18]:
# Preview the first rows after the cleaning steps above.
data.head()

Unnamed: 0,User,Card,Year,Month,Day,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?,Hour,Minute
9528451,808,1,2010,5,12,-87.0,Swipe Transaction,-5162038175624867091,Alpine,UT,84004,5541,0,0,23,34
19199242,1555,0,2016,11,15,4.37,Chip Transaction,4722913068560264812,Des Moines,IA,50317,5411,0,0,11,19
18728798,1518,0,2016,5,1,18.85,Chip Transaction,-5162038175624867091,Las Vegas,NV,89118,5541,0,0,18,0
21451881,1753,4,1997,10,27,19.77,Swipe Transaction,-7146670748125200898,Bellwood,IL,60104,5970,0,0,13,55
15483620,1261,0,2017,6,27,1.35,Chip Transaction,6666504894937430109,Indianapolis,IN,46256,5499,0,0,6,22


###  Train Test Split

In [19]:
# Separate the target column from the features.
y = data['Is Fraud?']
X = data.drop(['Is Fraud?'], axis=1)

In [20]:
from cuml.model_selection import train_test_split

# Hold out 30% of the rows for testing. The split is seeded with SEED and
# stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X=X,
    y=y,
    test_size=0.3,
    random_state=SEED,
    stratify=y,
)

In [21]:
# Free up room on the GPU by explicitly deleting the dataframes that are no
# longer needed. Once the train/test splits exist, the full `data` frame and
# the pre-split `X` / `y` are not referenced by any later cell, so drop all
# three (previously only `data` was deleted, leaving X and y resident).
import gc
del data, X, y
gc.collect()

210

### Encoding Categorical Columns

In [22]:
categorial_columns = [
    'Zip', 'MCC', 'Merchant Name', 'Use Chip', 'Merchant City', 'Merchant State',
]
encoders = {}  # column name -> fitted LabelEncoder

for col in categorial_columns:
    # Distinct values seen in the training split, copied to host memory.
    train_categories = X_train[col].unique().values_host

    # Test values that never occur in training are replaced by 'UNKNOWN'
    # so the encoder fitted below can transform them.
    unseen_in_train = ~X_test[col].isin(train_categories)
    X_test.loc[unseen_in_train, col] = 'UNKNOWN'

    # Fit on the training categories plus the 'UNKNOWN' placeholder, then
    # encode both splits with the same encoder.
    encoder = LabelEncoder().fit(np.append(train_categories, ['UNKNOWN']))
    X_train[col] = encoder.transform(X_train[col])
    X_test[col] = encoder.transform(X_test[col])
    encoders[col] = encoder

In [23]:
# Save the per-column encoders to 'label_encoders.pkl' so the same encoding
# can be reloaded later.
with open('label_encoders.pkl', 'wb') as encoder_file:
    pickle.dump(encoders, encoder_file)

In [24]:
# Inspect column dtypes after label encoding.
X_train.dtypes

User                int64
Card                int64
Year                int64
Month               int64
Day                 int64
Amount            float32
Use Chip            uint8
Merchant Name      uint32
Merchant City      uint16
Merchant State      uint8
Zip                uint16
MCC                 uint8
Errors?             int64
Hour                int64
Minute              int64
dtype: object

In [27]:
# Cast every column of both splits to a single dtype. The casts run one after
# the other, as before.
FEATURE_DTYPE = 'float32'
X_train = X_train.astype(FEATURE_DTYPE)
X_test = X_test.astype(FEATURE_DTYPE)

In [28]:
# Replace each split's index with a fresh default index; drop=True discards
# the old index instead of keeping it as a column.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

In [29]:
# Attach the target to each split as a 'label' column.
#
# Assigning a Series as a DataFrame column aligns on index. X_train / X_test
# were given a fresh default index by reset_index(drop=True), while
# y_train / y_test were not reset. Assigning the Series directly would
# therefore pair feature rows with the wrong labels, or with nulls where no
# index matches. Features and labels come out of the same split call, so
# assign the underlying values positionally instead.
X_train['label'] = y_train.values
X_test['label'] = y_test.values

In [30]:
# Write both splits to Parquet files in the working directory.
for split_frame, parquet_name in (
    (X_train, 'X_train.parquet'),
    (X_test, 'X_test.parquet'),
):
    split_frame.to_parquet(parquet_name)

In [31]:
import IPython

# Shut the kernel down and ask for a restart (the True argument).
# Presumably done to release the GPU memory held by this session -- TODO confirm.
kernel_app = IPython.Application.instance()
kernel_app.kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}