In [1]:
!pip install -q kaggle

[0m

## Get Data

For this example, we use a [synthetic credit card transactions dataset](https://arxiv.org/abs/1910.03033) available on [Kaggle](https://www.kaggle.com/datasets/ealtman2019/credit-card-transactions). You can either download the dataset directly from this [Kaggle link](https://www.kaggle.com/datasets/ealtman2019/credit-card-transactions) and upload it to your SageMaker notebook instance, or fetch it with the Kaggle command-line client using the following commands.

**NOTE:** You will need to make sure that your Kaggle credentials are [available](https://github.com/Kaggle/kaggle-api#api-credentials) either through a kaggle.json file or via environment variables.

In [2]:
# Download the dataset archive with the Kaggle CLI. This requires Kaggle API
# credentials (a kaggle.json file or environment variables); without them the
# command fails with "Could not find kaggle.json".
!kaggle datasets download -d ealtman2019/credit-card-transactions

Traceback (most recent call last):
  File "/opt/conda/envs/rapids/bin/kaggle", line 5, in <module>
    from kaggle.cli import main
  File "/opt/conda/envs/rapids/lib/python3.8/site-packages/kaggle/__init__.py", line 23, in <module>
    api.authenticate()
  File "/opt/conda/envs/rapids/lib/python3.8/site-packages/kaggle/api/kaggle_api_extended.py", line 164, in authenticate
    raise IOError('Could not find {}. Make sure it\'s located in'
OSError: Could not find kaggle.json. Make sure it's located in /root/.kaggle. Or use the environment method.


In [3]:
# Extract the dataset archive into the current directory. `-u` only writes
# files that are missing or older than the archived copy, so re-running is cheap.
!unzip -u credit-card-transactions.zip

Archive:  credit-card-transactions.zip


## Data Preprocessing

In [4]:
import cudf
from cuml.preprocessing import LabelEncoder
import numpy as np
import pickle
import os

In [5]:
# Directory that contains the extracted dataset (here: the working directory).
data_path = './'

In [6]:
# File name of the transactions CSV, resolved relative to `data_path`.
data_csv = 'credit_card_transactions-ibm_v2.csv'

In [7]:
# Load the raw transactions into a GPU dataframe.
csv_path = os.path.join(data_path, data_csv)
data = cudf.read_csv(csv_path)

In [8]:
# (rows, columns) of the dataset as loaded.
data.shape

(24386900, 15)

In [9]:
# Store these identifier-like columns with the 'object' dtype so that string
# placeholders ("ONLINE", "FOREIGN", 'UNKNOWN') can be written into them later.
for id_col in ('Zip', 'MCC', "Merchant Name"):
    data[id_col] = data[id_col].astype('object')

In [10]:
SEED = 42
SAMPLE_FRACTION = 0.6  # share of rows kept for the rest of the notebook

# Draw a reproducible random subset, then renumber the rows 0..n-1.
# Kept as two statements so the full frame is released before re-indexing.
data = data.sample(frac=SAMPLE_FRACTION, random_state=SEED)
data = data.reset_index(drop=True)

In [11]:
# (rows, columns) after subsampling.
data.shape

(14632140, 15)

### Encode labels


In [12]:
# Binary target: 1 where the transaction is flagged "Yes", 0 otherwise.
y = (data['Is Fraud?'] == "Yes").astype(int)

# Remove the target from the feature frame (in place, to avoid copying it).
data.drop(columns=['Is Fraud?'], inplace=True)

### Save subset for inference

In [13]:
# Set aside 100 raw (not yet preprocessed) rows, stringified, for inference.
infer_rows = data.sample(n=100, random_state=SEED)
data_infer = infer_rows.reset_index(drop=True).astype(str)
data_infer.to_csv('data_infer.csv', index=False)

### Handle Missing Values

In [14]:
# Percentage of missing values per column.
n_rows = len(data)
data.isna().sum() / n_rows * 100

User               0.000000
Card               0.000000
Year               0.000000
Month              0.000000
Day                0.000000
Time               0.000000
Amount             0.000000
Use Chip           0.000000
Merchant Name      0.000000
Merchant City      0.000000
Merchant State    11.160726
Zip               11.806721
MCC                0.000000
Errors?           98.408162
dtype: float64

In [15]:
# Online transactions get an explicit "ONLINE" marker for state and zip.
for col in ("Merchant State", "Zip"):
    data.loc[data["Merchant City"]=="ONLINE", col] = "ONLINE"

In [16]:
# Collapse the error description into a 0/1 flag: 1 when any value is present.
data['Errors?'] = (~data['Errors?'].isna()).astype(int)

In [17]:
# Two-letter codes treated as domestic (including DC), plus the "ONLINE"
# marker assigned to online transactions.
us_states_plus_online = [
    'AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'IA',
    'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS',
    'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA',
    'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY',
    'ONLINE',
]

# Every row whose merchant state is not in the list gets the zip "FOREIGN".
data.loc[
    ~data["Merchant State"].isin(us_states_plus_online), "Zip"
] = "FOREIGN"

### Handle Amount and Time

In [18]:
# Drop the first character of each amount (presumably a currency symbol —
# confirm against the raw CSV) and parse the remainder as float32.
data['Amount'] = data['Amount'].str.slice(1).astype('float32')

# Split the time string into integer parts: the first two characters become
# the hour, everything from position 3 onward the minute (assumes "HH:MM").
for part, bounds in (('Hour', {'stop': 2}), ('Minute', {'start': 3})):
    data[part] = data['Time'].str.slice(**bounds).astype('int64')

data.drop(columns=['Time'], inplace=True)

### Train Test Split

In [19]:
from cuml.model_selection import train_test_split

# 70/30 split, stratified on the target so both parts keep the same fraud rate.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    y,
    test_size=0.3,
    random_state=SEED,
    stratify=y,
)

In [20]:
import gc

# The unsplit frame and target are no longer needed; drop the references and
# force a collection so their GPU memory can be reclaimed.
del data, y
gc.collect()

294

### Encoding Categorical Columns

In [21]:
# Column dtypes before encoding; the 'object' columns are the categorical
# features that are label-encoded in the next step.
X_train.dtypes

User                int64
Card                int64
Year                int64
Month               int64
Day                 int64
Amount            float32
Use Chip           object
Merchant Name      object
Merchant City      object
Merchant State     object
Zip                object
MCC                object
Errors?             int64
Hour                int64
Minute              int64
dtype: object

In [22]:
# Label-encode each categorical column. The encoder vocabulary is built from
# the training split plus an 'UNKNOWN' bucket; test values never seen during
# training are mapped to that bucket before encoding.
categorial_columns = ['Zip', 'MCC', 'Merchant Name', 'Use Chip', 'Merchant City', 'Merchant State']
encoders = {}

for col in categorial_columns:
    # Distinct training values, copied to host memory.
    train_values = X_train[col].unique().values_host

    # Redirect unseen test categories to the shared placeholder.
    unseen_in_train = ~X_test[col].isin(train_values)
    X_test.loc[unseen_in_train, col] = 'UNKNOWN'

    # Fit on the training vocabulary extended with the placeholder.
    vocabulary = np.append(train_values, ['UNKNOWN'])
    encoder = LabelEncoder().fit(vocabulary)

    X_train[col] = encoder.transform(X_train[col])
    X_test[col] = encoder.transform(X_test[col])

    # Keep only the class list (on host) so it can be pickled.
    encoders[col] = encoder.classes_.values_host

In [23]:
# Persist the per-column class lists so the same encoding can be reapplied later.
with open('label_encoders.pkl', 'wb') as encoder_file:
    pickle.dump(encoders, encoder_file)

In [24]:
# Replace whatever index the split left on the feature frames with a fresh
# 0..n-1 index; row order is unchanged. Done one frame at a time so the old
# frame can be released before the next one is processed.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

In [25]:
# Cast every feature column (including the integer-encoded categoricals) to
# float32. Done one frame at a time to limit peak GPU memory.
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

In [26]:
# Attach the target as a 'label' column. The feature frames were re-indexed
# to 0..n-1 above, so give the label series the same fresh index first.
# Otherwise the result depends on which index `train_test_split` left on
# `y_*` and on index alignment during column assignment, which could
# silently pair rows with the wrong labels (or nulls).
X_train['label'] = y_train.reset_index(drop=True)
X_test['label'] = y_test.reset_index(drop=True)

In [27]:
# Write both splits (features + label) as CSV, without the index column.
for frame, csv_name in ((X_train, 'X_train.csv'), (X_test, 'X_test.csv')):
    frame.to_csv(csv_name, index=False)

In [28]:
# Also store both splits as Parquet.
for frame, parquet_name in ((X_train, 'X_train.parquet'), (X_test, 'X_test.parquet')):
    frame.to_parquet(parquet_name)

In [29]:
# Preview the first rows of the final training frame (encoded features + label).
X_train.head()

Unnamed: 0,User,Card,Year,Month,Day,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Hour,Minute,label
0,677.0,1.0,2010.0,12.0,18.0,30.129999,2.0,8386.0,10990.0,30.0,17993.0,56.0,0.0,6.0,29.0,0
1,1396.0,0.0,2004.0,8.0,20.0,32.16,2.0,42418.0,5274.0,189.0,20372.0,69.0,0.0,14.0,7.0,0
2,910.0,1.0,2017.0,12.0,7.0,100.0,0.0,15337.0,8078.0,148.0,2772.0,43.0,0.0,13.0,49.0,0
3,1085.0,2.0,2014.0,2.0,21.0,21.780001,2.0,77370.0,5608.0,143.0,10504.0,56.0,0.0,16.0,22.0,0
4,1674.0,1.0,2011.0,1.0,3.0,120.0,2.0,15337.0,10939.0,189.0,20467.0,43.0,0.0,21.0,9.0,0


In [28]:
import IPython

# Ask the running kernel to shut down and restart (the argument is the
# `restart` flag), which discards all notebook state held by this process.
ipython_app = IPython.Application.instance()
ipython_app.kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}