# 1. Load Dataset



In [1]:
import pandas as pd

## Load transaction data from CSV 


In [2]:
%%time
headers = ['cc_num','category','amt','is_fraud','merchant','transaction_date','transaction_weekday','transaction_month',
           'transaction_hour','gender','zip','city_pop','job','age','setting','age_group','distance_from_home']
dtypes = {'cc_num': 'category', 'category': 'category', 'amt': 'float64', 'is_fraud': 'int',
         'merchant':'category', 'transaction_weekday':'category','transaction_month':'category',
         'transaction_hour':'category','gender':'category','zip':'category','city_pop':'int',
         'job':'category','age': 'int', 'setting':'category','age_group':'category','distance_from_home':'float'}
parse_dates = ['transaction_date']

data = pd.read_csv('transactions.csv',header=0, names=headers, dtype=dtypes, parse_dates=parse_dates,skiprows=[1])


CPU times: user 19.7 s, sys: 1.89 s, total: 21.6 s
Wall time: 21.7 s


In [3]:
 data.dtypes

cc_num                       category
category                     category
amt                           float64
is_fraud                        int64
merchant                     category
transaction_date       datetime64[ns]
transaction_weekday          category
transaction_month            category
transaction_hour             category
gender                       category
zip                          category
city_pop                        int64
job                          category
age                             int64
setting                      category
age_group                    category
distance_from_home            float64
dtype: object

In [4]:
data.head()

Unnamed: 0,cc_num,category,amt,is_fraud,merchant,transaction_date,transaction_weekday,transaction_month,transaction_hour,gender,zip,city_pop,job,age,setting,age_group,distance_from_home
0,3517182278248964,gas_transport,771.21,1,fraud_Berge LLC,2021-12-27 05:38:44,0,12,5,F,37411,198659,Community arts worker,35,urban,26-35,134.025936
1,3517182278248964,home,257.39,1,fraud_Gerhold LLC,2021-12-28 23:19:40,1,12,23,F,37411,198659,Community arts worker,35,urban,26-35,47.677531
2,3517182278248964,misc_pos,91.43,0,"fraud_Boehm, Predovic and Reinger",2021-06-03 07:06:42,3,6,7,F,37411,198659,Community arts worker,35,urban,26-35,124.769004
3,3517182278248964,grocery_pos,101.76,0,fraud_Wolf Inc,2021-03-20 01:49:49,5,3,1,F,37411,198659,Community arts worker,35,urban,26-35,75.359127
4,3517182278248964,grocery_pos,67.26,0,fraud_Heidenreich PLC,2021-02-12 03:07:30,4,2,3,F,37411,198659,Community arts worker,35,urban,26-35,57.378602


## Confirm 17M+ transactions

In [5]:
data.shape

(17297323, 17)

# 2. Apply LightGBM Feature Pre-Processing 

LightGBM offers good accuracy with integer-encoded categorical features. LightGBM applies Fisher (1958) to find the optimal split over categories. This often performs better than one-hot encoding.

https://lightgbm.readthedocs.io/en/latest/Advanced-Topics.html#categorical-feature-support


## 2.1 Mappings
So let's create a Map (Dictionary) for every categorical feature. 
For each feature, a python dictionary holds the transformation from feature value to integer

For example, for the categorical feature "gender", we will have a gender_dict with values
{'F': 0, 'M': 1} 

In [6]:
%%time 
cc_num_dict = dict (zip(data['cc_num'], data['cc_num'].cat.codes)) 
category_dict = dict (zip(data['category'], data['category'].cat.codes)) 
merchant_dict = dict (zip(data['merchant'], data['merchant'].cat.codes))
transaction_weekday_dict = dict (zip(data['transaction_weekday'], data['transaction_weekday'].cat.codes)) 
transaction_month_dict = dict (zip(data['transaction_month'], data['transaction_month'].cat.codes))
transaction_hour_dict = dict (zip(data['transaction_hour'], data['transaction_hour'].cat.codes))
gender_dict = dict (zip(data['gender'], data['gender'].cat.codes))
age_group_dict = dict (zip(data['age_group'], data['age_group'].cat.codes)) 
job_dict = dict (zip(data['job'], data['job'].cat.codes)) 
setting_dict = dict (zip(data['setting'], data['setting'].cat.codes)) 
zip_dict = dict (zip(data['zip'], data['zip'].cat.codes)) 


CPU times: user 12.1 s, sys: 436 ms, total: 12.5 s
Wall time: 12.5 s


### Let's check the generated gender_dict

In [7]:
gender_dict

{'F': 0, 'M': 1}

## 2.2 Save Mappings to JSON files 
We will use these later...


In [8]:
import json

def save_dict (dictionary, filename):
    with open(filename, 'w') as convert_file:
        convert_file.write(json.dumps(dictionary))
    

In [9]:
save_dict(cc_num_dict,'cc_num_dict.json')
save_dict(category_dict,'category_dict.json')
save_dict(merchant_dict,'merchant_dict.json')
save_dict(transaction_weekday_dict,'transaction_weekday_dict.json')
save_dict(transaction_month_dict,'transaction_month_dict.json')
save_dict(transaction_hour_dict,'transaction_hour_dict.json')
save_dict(gender_dict,'gender_dict.json')
save_dict(age_group_dict,'age_group_dict.json')
save_dict(job_dict,'job_dict.json')
save_dict(setting_dict,'setting_dict.json')
save_dict(zip_dict,'zip_dict.json')

### Notes for Inference


#### Converting Categorical values to integer values used during training

At inference time, your LightGBM model requires the integer associated with a categorical feature value.
Therefore, the above dictionaries will be used to pre-process transaction data as it streams through Hazelcast

We will load these dictionaries into a Hazelcast later...

#### Dealing with unknown categorical values at inference time
In production, there needs to be an elegant way to deal with unknown values. 
For example imagine a new "job" is added after the model is deployed to production.
The new value for job has not seen during training and a result there is no mapping to an integer value. 

The data scientists will need to ensure the model is robust to unseen/unknown categorical values...


For now, let's focus on training and deploy the model assuming the no unseen/unknown categorical values.





In [8]:
## Save minimal test dataset - We will stream these data through a Hazelcast Fraud detection pipeline later
test_data = data[data['transaction_date'] > '2022-06-30 23:59:59']
test_data = test_data[['cc_num','amt','merchant','transaction_date','is_fraud']]


In [9]:
test_data.to_csv('transaction_data_stream.csv', index=False)

In [10]:
test_data.head()

Unnamed: 0,cc_num,amt,merchant,transaction_date,is_fraud
864,3517182278248964,327.63,fraud_Erdman-Schaden,2022-09-16 18:35:45,0
869,3517182278248964,3.55,"fraud_Roberts, Daniel and Macejkovic",2022-07-28 23:31:46,0
870,3517182278248964,42.47,"fraud_Medhurst, Cartwright and Ebert",2022-09-20 22:00:58,0
872,3517182278248964,10.21,fraud_Baumbach Ltd,2022-09-28 17:22:23,0
876,3517182278248964,62.99,fraud_Wilkinson LLC,2022-10-01 17:49:11,0


In [11]:
test_data.shape

(2981444, 5)

## 2.1 Converting Categorical values to integer values
At inference time, The LightGBM model will require the integer associated with any categorical feature value. Therefore, these dictionaries will now be used to pre-process transaction data.

In production, the same exact conversions will be needed to process transactions as they stream through Hazelcast



In [12]:
%%time 
data['category'] = data['category'].replace(category_dict)
data['category'] = data['category'].astype(int)
data['merchant'] = data['merchant'].replace(merchant_dict)
data['merchant'] = data['merchant'].astype(int)



CPU times: user 9.2 s, sys: 4.82 s, total: 14 s
Wall time: 14.4 s


In [13]:
%%time
data['transaction_weekday'] = data['transaction_weekday'].replace(transaction_weekday_dict)
data['transaction_weekday'] = data['transaction_weekday'].astype(int)
data['transaction_month'] = data['transaction_month'].replace(transaction_month_dict)
data['transaction_month'] = data['transaction_month'].astype(int)
data['transaction_hour'] = data['transaction_hour'].replace(transaction_hour_dict)
data['transaction_hour'] = data['transaction_hour'].astype(int)
data['gender'] = data['gender'].replace(gender_dict)
data['gender'] = data['gender'].astype(int)
data['zip'] = data['zip'].replace(zip_dict)
data['zip'] = data['zip'].astype(int) 


CPU times: user 1min 30s, sys: 38.6 s, total: 2min 8s
Wall time: 2min 15s


In [14]:
%%time
data['job'] = data['job'].replace(job_dict)
data['job'] = data['job'].astype(int)
data['setting'] = data['setting'].replace(setting_dict)
data['setting'] = data['setting'].astype(int)
data['age_group'] = data['age_group'].replace(age_group_dict)
data['age_group'] = data['age_group'].astype(int)


CPU times: user 7.62 s, sys: 2.67 s, total: 10.3 s
Wall time: 10.4 s


In [15]:
%%time
data['cc_num'] = data['cc_num'].replace(cc_num_dict)
data['cc_num'] = data['cc_num'].astype(int)


CPU times: user 2min 15s, sys: 53 s, total: 3min 8s
Wall time: 3min 18s


In [17]:
data.dtypes

cc_num                          int64
category                        int64
amt                           float64
is_fraud                        int64
merchant                        int64
transaction_date       datetime64[ns]
transaction_weekday             int64
transaction_month               int64
transaction_hour                int64
gender                          int64
zip                             int64
city_pop                        int64
job                             int64
age                             int64
setting                         int64
age_group                       int64
distance_from_home            float64
dtype: object

In [18]:
%%time
data = data.sort_values(by='transaction_date')


CPU times: user 6.28 s, sys: 854 ms, total: 7.13 s
Wall time: 7.13 s


In [19]:
data['transaction_date'].min()

Timestamp('2020-11-01 00:00:00')

In [20]:
data['transaction_date'].max()

Timestamp('2022-11-01 23:59:56')

# 3. Train LightGBM Model

Let's train our model on "historical" transactions (transactions before 2022-06-30)

We will test our model on more "recent" data (transactions after 2022-06-30)

## Split data into training and test sets

Remember to use the last 2 months of transaction data as test set!

In [21]:
train_data  = data[data['transaction_date'] <= '2022-06-30 23:59:59']
test_data = data[data['transaction_date'] > '2022-06-30 23:59:59']


#training set
X_train = train_data.drop(['is_fraud'], axis=1)
y_train = train_data['is_fraud']

#test set
X_test = test_data.drop(['is_fraud'], axis=1)
y_test = test_data['is_fraud']

# Drop transaction date (it was only needed to split train/test sets)
X_train = X_train.drop(['transaction_date'], axis=1)
X_test  = X_test.drop(['transaction_date'], axis=1)


In [22]:
print (f'training model on {X_train.shape[0]} historical transactions')
print (f'testing  model on {X_test.shape[0]} recent transactions')

training model on 14315879 historical transactions
testing  model on 2981444 recent transactions


## 3.1 LightGBM Hyper Parameters


In [23]:
import lightgbm as lgbm
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score


In [24]:
# LightGBM Hyper Parameters

SEARCH_PARAMS = {'learning_rate': 0.2,
                 'max_depth': 15,
                 'num_leaves': 20,
                 'feature_fraction': 0.8,
                 'subsample': 0.2}

FIXED_PARAMS={'objective': 'binary',
              'metric': 'auc',
              'scale_pos_weight':130,
              'boosting':'gbdt',
              'early_stopping_rounds':30}

TRAINING_RUN_PARAMS = {**FIXED_PARAMS, **SEARCH_PARAMS}

## 3.2 Model Training time

Train LightGBM model with an initial set of hyperparameters

The full hyper-parameter tuning exercise (with GridSearchCV or tool of choice) is left as an exercise!

In [25]:
%%time
# Create a LightGBM Binary Classifier (note objective parameter is set to ='binary')
clf = lgbm.LGBMClassifier(**TRAINING_RUN_PARAMS)
# Train
clf.fit(X=X_train, y=y_train, eval_set = (X_test, y_test))

[1]	valid_0's auc: 0.623191
[2]	valid_0's auc: 0.499737
[3]	valid_0's auc: 0.781455
[4]	valid_0's auc: 0.832385
[5]	valid_0's auc: 0.814505
[6]	valid_0's auc: 0.814657
[7]	valid_0's auc: 0.81595
[8]	valid_0's auc: 0.838918
[9]	valid_0's auc: 0.760899
[10]	valid_0's auc: 0.761601
[11]	valid_0's auc: 0.758864
[12]	valid_0's auc: 0.758399
[13]	valid_0's auc: 0.789298
[14]	valid_0's auc: 0.789426
[15]	valid_0's auc: 0.789082
[16]	valid_0's auc: 0.799909
[17]	valid_0's auc: 0.821618
[18]	valid_0's auc: 0.826271
[19]	valid_0's auc: 0.827163
[20]	valid_0's auc: 0.826817
[21]	valid_0's auc: 0.848776
[22]	valid_0's auc: 0.852605
[23]	valid_0's auc: 0.866402
[24]	valid_0's auc: 0.888746
[25]	valid_0's auc: 0.888698
[26]	valid_0's auc: 0.889564
[27]	valid_0's auc: 0.897757
[28]	valid_0's auc: 0.869168
[29]	valid_0's auc: 0.873105
[30]	valid_0's auc: 0.893196
[31]	valid_0's auc: 0.892546
[32]	valid_0's auc: 0.896737
[33]	valid_0's auc: 0.896012
[34]	valid_0's auc: 0.895816
[35]	valid_0's auc: 0.89

# 4. Check Model Accuracy

Generate Predictions for Training and Test sets

In [26]:
%%time
y_pred_train = clf.predict(X_train)


CPU times: user 20.1 s, sys: 978 ms, total: 21.1 s
Wall time: 4.29 s


In [27]:
%%time
y_pred_test  = clf.predict(X_test)

CPU times: user 4.02 s, sys: 97.1 ms, total: 4.12 s
Wall time: 710 ms


In [28]:
import numpy as np
np.unique(train_data['is_fraud'].to_numpy(),return_counts=True)

(array([0, 1]), array([14236757,    79122]))

In [29]:
np.unique(y_pred_train,return_counts=True)

(array([0, 1]), array([13255834,  1060045]))

### Measure balanced accuracy score


In the binary classification, balanced accuracy is equal to the arithmetic mean of sensitivity (true positive rate) and specificity (true negative rate), or the area under the ROC curve with binary predictions rather than scores

https://scikit-learn.org/stable/modules/model_evaluation.html#balanced-accuracy-score

In [30]:
# Training set score
print('LightGBM Model training-set accuracy score: {0:0.4f}'. format(balanced_accuracy_score(y_train, y_pred_train)))
# Testing set score
print('LightGBM Model testing-set accuracy score: {0:0.4f}'.format(balanced_accuracy_score(y_test, y_pred_test)))

LightGBM Model training-set accuracy score: 0.9144
LightGBM Model testing-set accuracy score: 0.8911


## 4.1 Generate a Prediction for a Transaction

We will use this later ....

In [31]:
print("prediction", clf.predict(X_test[1:2]))
print("predict_proba", clf.predict_proba(X_test[1:2]))

prediction [1]
predict_proba [[0.35324393 0.64675607]]


## In the real world, additional model experimentation is needed
Clearly, the above accuracy metrics indicate that the model may be overfitting to the training set.

The data scientists need to continue experimenting to improve this model

For now, let's assume the model performance is good enough to be shipped to production




# 5. Export LightGBM Model to ONNX


## 5.1 Register the LightGBM converter

In [33]:
from skl2onnx import convert_sklearn, update_registered_converter
from skl2onnx.common.shape_calculator import calculate_linear_classifier_output_shapes  # noqa
from onnxmltools.convert.lightgbm.operator_converters.LightGbm import convert_lightgbm  # noqa
from skl2onnx.common.data_types import FloatTensorType
import numpy
import onnxruntime as rt

#register converter
update_registered_converter(
    lgbm.LGBMClassifier, 'LightGbmLGBMClassifier',
    calculate_linear_classifier_output_shapes, convert_lightgbm,
    options={'nocl': [True, False], 'zipmap': [True, False, 'columns']})


## 5.2 Convert the model to ONNX 
Saving it to a file named <b>"lightgbm_fraud_detection_onnx"</b>

In [34]:
model_onnx = convert_sklearn(
    clf, 'lightgbm',
    [('input', FloatTensorType([None, 15]))],
    target_opset={'': 12, 'ai.onnx.ml': 2})

# And save.
with open("lightgbm_fraud_detection_onnx", "wb") as f:
    f.write(model_onnx.SerializeToString())

## 5.3 Check ONNX model predictions

We'd like to test that: 
* a) The ONNX model is able to generate predictions AND
* b) A prediction with the ONNX model matches the prediction made by the origial lightGBM model

Let's load the ONNX model and generate a prediction for the same transaction from the test set

In [35]:
sess = rt.InferenceSession("lightgbm_fraud_detection_onnx")
transaction_row_start = 1
    
pred_onx = sess.run(None, {"input": X_test[transaction_row_start:transaction_row_start+1].astype(numpy.float32).to_numpy()})
print("ONNX MODEL PREDICTION")
print("predict", pred_onx[0])
print("predict_proba",  pred_onx[1][:1])

ONNX MODEL PREDICTION
predict [1]
predict_proba [{0: 0.3532440662384033, 1: 0.6467559337615967}]


### Check that the predictions match
and now let's check the prediction by the original LightGBM model

In [36]:
print("LIGHTGBM MODEL PREDICTION")
print("prediction", clf.predict(X_test[transaction_row_start:transaction_row_start+1]))
print("predict_proba", clf.predict_proba(X_test[transaction_row_start:transaction_row_start+1]))

LIGHTGBM MODEL PREDICTION
prediction [1]
predict_proba [[0.35324393 0.64675607]]
