In [None]:
import numpy as np
import pandas as pd
import math
import torch
import io
import torch.nn as nn
import torch.optim as optim
import torch.utils as utils
from google.colab import files

# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
print(f'Using {device} device')

Using cpu device


# Upload and Parse the Excel File



In [None]:
# Ask the user for the training spreadsheet through Colab's upload helper
# (`files` comes from `google.colab`). The returned object is consumed by the
# read cell below, which wraps the uploaded content in a BytesIO for pandas.
trainset_file = files.upload()

Saving trainset.xlsx to trainset (2).xlsx


Read the file with pandas

In [None]:
# Read whichever workbook was uploaded instead of hard-coding its file name:
# the lookup key then no longer has to be exactly 'trainset.xlsx', so a
# differently named or renamed upload does not raise a KeyError.
if not trainset_file:
  raise ValueError('No file was uploaded; re-run the upload cell and pick the training spreadsheet.')

# Only the first uploaded file is used; any additional uploads are ignored.
uploaded_name, uploaded_bytes = next(iter(trainset_file.items()))
df = pd.read_excel(io.BytesIO(uploaded_bytes))
df

Unnamed: 0,TRAN_AMT,ACCT_PRE_TRAN_AVAIL_BAL,CUST_AGE,OPEN_ACCT_CT,WF_dvc_age,PWD_UPDT_TS,CARR_NAME,RGN_NAME,STATE_PRVNC_TXT,ALERT_TRGR_CD,DVC_TYPE_TXT,AUTHC_PRIM_TYPE_CD,AUTHC_SCNDRY_STAT_TXT,CUST_ZIP,CUST_STATE,PH_NUM_UPDT_TS,CUST_SINCE_DT,TRAN_TS,TRAN_DT,ACTN_CD,ACTN_INTNL_TXT,TRAN_TYPE_CD,ACTVY_DT,FRAUD_NONFRAUD
0,5.38,23619.91,47,4,2777,1/16/2018 11:3:58,cox communications inc.,southwest,nevada,MOBL,,UN_PWD,ALLOW,89002,NV,2/24/2021 15:55:10,1993-01-06,5/3/2021 18:3:58,5/3/2021,SCHPMT,P2P_COMMIT,P2P,5/3/2021,Non-Fraud
1,65.19,0.00,45,5,2721,,charter communications,southwest,california,MOBL,,FACE_ID,ALLOW,94541,CA,,1971-01-07,1/13/2021 19:19:37,1/13/2021,SCHPMT,P2P_COMMIT,P2P,1/13/2021,Non-Fraud
2,54.84,34570.63,36,8,1531,12/22/2021 10:42:51,utah broadband llc,mountain,utah,ONLN,DESKTOP,UN_PWD,ALLOW,21811,MD,5/5/2019 1:8:39,1994-02-01,4/8/2021 9:42:51,4/8/2021,SCHPMT,P2P_COMMIT,P2P,4/8/2021,Fraud
3,0.01,0.00,62,3,835,2/8/2020 7:28:31,t-mobile usa inc.,southwest,california,MOBL,MOBILE,UN_PWD,ALLOW,89822,NV,2/16/2019 6:45:37,2001-11-01,8/10/2021 15:28:31,8/10/2021,SCHPMT,P2P_COMMIT,P2P,8/10/2021,Non-Fraud
4,497.08,12725.18,81,2,1095,12/28/2020 12:12:44,cogent communications,south central,texas,MOBL,MOBILE,UN_PWD,CHALLENGE_SUCCESS,84108,UT,5/8/2020 10:27:6,1987-02-07,6/27/2021 11:12:44,6/27/2021,SCHPMT,P2P_COMMIT,P2P,6/27/2021,Fraud
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13995,1937.21,230.75,55,4,142,,cellco partnership dba verizon wireless,southwest,california,MOBL,MOBILE,UN_PWD,ALLOW,92503,CA,7/15/2017 6:58:59,2001-06-05,3/12/2021 12:11:59,3/12/2021,SCHPMT,P2P_COMMIT,P2P,3/12/2021,Non-Fraud
13996,114.38,0.00,44,10,272,11/2/2017 4:28:20,t-mobile usa inc.,southwest,california,MOBL,MOBILE,FACE_ID,ALLOW,80478,CO,,2010-06-03,6/11/2021 9:28:20,6/11/2021,SCHPMT,P2P_COMMIT,P2P,6/11/2021,Non-Fraud
13997,493.00,2848.63,54,3,517,6/3/2021 19:31:15,att services inc,southwest,california,MOBL,DESKTOP,UN_PWD,ALLOW,33579,FL,5/25/2021 8:50:5,1984-10-27,5/16/2021 12:31:15,5/16/2021,SCHPMT,P2P_COMMIT,P2P,5/16/2021,Fraud
13998,491.64,3163.25,21,3,0,3/2/2020 11:34:54,,,,ONLN,DESKTOP,UN_PWD,ALLOW,91702,CA,,2021-03-01,5/11/2021 12:34:54,5/11/2021,SCHPMT,P2P_COMMIT,P2P,5/11/2021,Fraud


# Format the Data into Training/Testing Tensors

Let's go ahead and duplicate the dataframe before we make any changes to it.

In [None]:
# Work on a copy: the normalisation and category-encoding helpers below
# overwrite columns in place, and `df` should keep the raw values.
df_cleaned = df.copy()

Select the column of the dataframe that the model should predict (the output column), and name its two possible labels.

In [None]:
# Name of the label column the network is trained to predict.
output_column = 'FRAUD_NONFRAUD'
# The two label strings; index 1 ('Fraud') is treated as the positive class
# by the training and evaluation cells.
output_labels = ['Non-Fraud', 'Fraud']

List the numerical fields, then min-max normalize each of them.

In [None]:
# Numeric columns that are min-max scaled below and later fed to the network
# as plain float inputs (one input per column).
number_fields = [
  'TRAN_AMT',
  'ACCT_PRE_TRAN_AVAIL_BAL',
  'CUST_AGE',
  'OPEN_ACCT_CT',
  'WF_dvc_age',
]

def normalizeNumberFields(df, fields):
  """Min-max scale the given numeric columns of `df` in place.

  Each column is rewritten as (value - min) / (max - min), so its smallest
  value becomes 0 and its largest becomes 1. Missing values stay missing.

  A column whose values are all identical has a zero range; dividing by it
  would turn the whole column into NaN, so such a column is set to 0.0
  instead.

  Args:
    df: DataFrame to modify in place.
    fields: iterable of column names to scale.

  Returns:
    None. `df` is mutated.
  """
  for field in fields:
    column = df[field]
    low = column.min()
    span = column.max() - low
    if span == 0:
      # Constant column: avoid 0/0 -> NaN. Subtracting keeps NaN cells NaN.
      df[field] = (column - low).astype('float64')
    else:
      df[field] = (column - low) / span

# Scales the numeric columns of the working copy in place.
# NOTE(review): this runs before the train/test split further down, so each
# column's min and max are computed over rows that later land in the test set.
normalizeNumberFields(df_cleaned, number_fields)

Map every value of the categorical string fields in the dataframe to an integer category id, and store the lookup tables. The embeddings themselves are created later, inside the model.

In [None]:
# Columns treated as categorical: every distinct value is replaced by an
# integer id below and later looked up in a per-field embedding table.
# Note that CUST_ZIP is handled as a category here, not as a number.
string_fields = [
  'DVC_TYPE_TXT', 
  'CARR_NAME', 
  'RGN_NAME', 
  'STATE_PRVNC_TXT', 
  'ALERT_TRGR_CD', 
  'AUTHC_PRIM_TYPE_CD', 
  'AUTHC_SCNDRY_STAT_TXT', 
  'CUST_ZIP',
  'CUST_STATE',
  'ACTN_CD',
  'ACTN_INTNL_TXT',
  'TRAN_TYPE_CD',
]

def categorizeStringFields(df, fields):
  """Replace categorical values with integer ids, in place, and return the lookup.

  For every column in `fields`, each distinct value receives an id in order
  of first appearance, starting at 1. Missing values (NaN / None) always map
  to id 0. The column in `df` is overwritten with those ids.

  Missing values are detected by pandas rather than by dictionary membership
  of `np.nan`, which only matches the very same NaN object and would
  otherwise hand out a fresh category to other NaN objects or to None.

  Args:
    df: DataFrame to modify in place.
    fields: iterable of column names to encode.

  Returns:
    A two-level dict: field -> {category value -> id}. Every inner dict
    contains the entry `np.nan: 0`, so its length is the number of ids
    (including the missing-value id) used by that field.
  """
  field_category_to_number = {}
  for field in fields:
    # factorize numbers values by first appearance and marks missing ones -1,
    # so shifting by one yields 0 for missing and 1..n for real categories.
    codes, uniques = pd.factorize(df[field])
    category_to_number = { np.nan: 0 }
    for number, category in enumerate(uniques, start=1):
      category_to_number[category] = number
    df[field] = codes + 1
    field_category_to_number[field] = category_to_number
  return field_category_to_number

# Encode the categorical columns of the working copy in place and keep the
# per-field lookup (field -> {original value -> integer id}); the model uses
# it to size one embedding table per field.
field_category_to_number = categorizeStringFields(df_cleaned, string_fields)

These fields are dates. They are ignored for now, because we still need to come up with an encoding scheme for them.

In [None]:
# Date/timestamp columns. No other cell in this notebook reads this list yet;
# the columns stay in the dataframe but are never turned into model inputs.
date_fields = [
  'PWD_UPDT_TS',
  'PH_NUM_UPDT_TS',
  'CUST_SINCE_DT',
  'TRAN_TS',
  'TRAN_DT',
  'ACTVY_DT',
]

Split the uploaded dataset into a training set and a testing set using an 80%/20% split.

In [None]:
# Select a random 80% of the rows for training; the fixed random_state makes
# the split reproducible across runs.
train_partition = df_cleaned.sample(frac = 0.8, random_state = 7892)
# Every row that was not sampled above becomes part of the testing set.
test_partition = df_cleaned.drop(train_partition.index)

# Separate the inputs from the label. The label column is referenced through
# `output_column` so the split cannot drift from the training/evaluation cells.
# Other unused columns (e.g. the date fields) are kept; the model only reads
# the fields it was configured with.
train = train_partition.drop([output_column], axis=1)
test = test_partition.drop([output_column], axis=1)
train_labels = train_partition[[output_column]]
test_labels = test_partition[[output_column]]

print('Training Set Size: %d' % len(train_partition))
print('Testing Set Size: %d' % len(test_partition))

Training Set Size: 11200
Testing Set Size: 2800


# Create the Neural Network

Now we can go ahead and setup a Neural Network and its layers.

In [None]:
class NeuralNetwork(nn.Module):
  """Small feed-forward classifier over numeric and embedded categorical fields.

  Each numeric field contributes one float input. Each categorical field is
  looked up in its own embedding table whose width is int(sqrt(number of
  categories)). The concatenated vector goes through a 16 -> 8 -> 1 stack.

  Args:
    number_fields: list of numeric column names read from each row.
    field_category_to_number: dict field -> {category -> integer id}; only
      the number of entries per field is used, to size the embedding table.
      Field names are used as `nn.ModuleDict` keys, so they must be strings
      accepted by it (no '.', not an existing ModuleDict attribute name).
  """

  def __init__(self, number_fields, field_category_to_number):
    super().__init__()

    # numeric inputs: one value per field
    self.number_fields = number_fields
    self.number_tensors = len(number_fields)

    # categorical inputs: one embedding table per field.
    # A ModuleDict (not a plain dict) is required so the embeddings are
    # registered as sub-modules: only then do they show up in
    # `parameters()` (and get trained by the optimizer) and follow `.to()`.
    self.field_category_to_number = field_category_to_number
    self.string_embeds = nn.ModuleDict() # field->embedding
    self.string_tensors = 0
    for field, categories in field_category_to_number.items():
      num_categories = len(categories)
      dimensions = int(math.sqrt(num_categories))
      self.string_embeds[field] = nn.Embedding(num_categories, dimensions)
      self.string_tensors += dimensions

    # NOTE(review): the ReLU after the last Linear clamps predictions to >= 0
    # and passes no gradient while that pre-activation is negative. Consider
    # a sigmoid output with a matching loss; left unchanged here so the
    # training and evaluation cells keep working as written.
    self.linear_relu_stack = nn.Sequential(
        nn.Linear(self.number_tensors + self.string_tensors, 16),
        nn.ReLU(),
        nn.Linear(16,8),
        nn.ReLU(),
        nn.Linear(8, 1), # 1 output column (aka boolean Fraud/NoFraud)
        nn.ReLU(),
    )

  def _device(self):
    """Return the device the model's parameters currently live on."""
    return next(self.parameters()).device

  def forward(self, x):
    """Run a feature vector built by `generateTensor` through the layer stack."""
    return self.linear_relu_stack(x)

  def generateTensor(self, row):
    """Build the model input for one row: numeric values, then embeddings.

    `row` only needs to support `row[field]` (e.g. a pandas Series or a dict).
    The result is created on the same device as the model.
    """
    string_tensor = self.generateStringTensor(row)
    number_tensor = self.generateNumberTensor(row)
    return torch.cat((number_tensor, string_tensor))

  def generateNumberTensor(self, row):
    """Return the numeric fields of `row` as a 1-D float32 tensor.

    Yields an empty tensor when the model has no numeric fields.
    """
    values = [float(row[field]) for field in self.number_fields]
    return torch.tensor(values, dtype=torch.float32, device=self._device())

  def generateStringTensor(self, row):
    """Return the concatenated embeddings of the categorical fields of `row`.

    Each `row[field]` must already hold the integer category id. Yields an
    empty tensor when the model has no categorical fields.
    """
    device = self._device()
    pieces = []
    for field in self.field_category_to_number:
      lookup_tensor = torch.tensor(int(row[field]), dtype=torch.long, device=device)
      pieces.append(self.string_embeds[field](lookup_tensor))
    if not pieces:
      return torch.empty(0, dtype=torch.float32, device=device)
    return torch.cat(pieces)

# Train the Model with the Training Set

In [None]:
# Build the model on the selected device, with a squared-error loss against
# the 0/1 label and plain SGD over the model's registered parameters.
model = NeuralNetwork(number_fields, field_category_to_number).to(device)
loss_fn = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.0001)

model.train()
for epoch in range(20): # 20 epochs, one optimizer step per training row
  print(epoch)
  for index, row in train.iterrows():
    # 1 for the positive label ('Fraud'), 0 otherwise
    label = 1 if train_labels[output_column].loc[index] == output_labels[1] else 0
    # Label and features must live on the model's device, or the forward
    # pass / loss fails when running on CUDA. `.to(device)` is a no-op when
    # the tensor is already there.
    label_tensor = torch.tensor([label], dtype=torch.float32, device=device)
    tensors = model.generateTensor(row).to(device)

    optimizer.zero_grad()
    output = model(tensors)
    loss = loss_fn(output, label_tensor)
    loss.backward()

    optimizer.step()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


# Test the Model against the Testing Set

In [None]:
def _safeRatio(numerator, denominator):
  """Return numerator / denominator, or 0.0 when the denominator is zero."""
  return numerator / denominator if denominator else 0.0

# confusion-matrix counters, with 'Fraud' as the positive class
tp = 0
fp = 0
tn = 0
fn = 0

model.eval()
with torch.no_grad(): # inference only: no autograd graph needed
  for index, row in test.iterrows():
    is_fraud = test_labels[output_column].loc[index] == output_labels[1]
    # `.to(device)` is a no-op when the features are already on the device
    tensors = model.generateTensor(row).to(device)

    output = model(tensors).item()

    # a prediction of at least 0.5 counts as 'Fraud'
    if output >= .5 and is_fraud:
      tp += 1
    elif output >= .5:
      fp += 1
    elif output < .5 and not is_fraud:
      tn += 1
    elif output < .5:
      fn += 1

# Guard every ratio: a model that never predicts 'Fraud' (tp + fp == 0), a
# test set without 'Fraud' rows, or an empty test set would otherwise raise
# ZeroDivisionError instead of reporting the metrics.
accuracy = _safeRatio(tp + tn, tp + tn + fp + fn)
precision = _safeRatio(tp, tp + fp)
recall = _safeRatio(tp, tp + fn)
f1 = _safeRatio(2 * (recall * precision), recall + precision)
print("TP:", tp, "FP:", fp, "TN:", tn, "FN:", fn)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

TP: 447 FP: 211 TN: 1784 FN: 358
Accuracy: 0.7967857142857143
Precision: 0.6793313069908815
Recall: 0.55527950310559
F1: 0.6110731373889268
