In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.mixture import BayesianGaussianMixture
from sklearn import model_selection

import random
import math
from datetime import datetime

import pickle

In [None]:
#NOTE: both pickle files named here are listed in the .gitignore
#cache file for the statistics gathered about each model
info_save_file = 'model_info.pickle'
#cache file for the trained models themselves
model_save_file = 'stored_models.pickle'

In [None]:
#columns that are removed before modelling: the label itself, identifiers, free text,
#and location/time fields that are not used as features
columns_to_drop = [
    'Unnamed: 0', 'is_fraud', 'first', 'last', 'city', 'street', 'state', 'zip',
    'trans_num', 'unix_time', 'trans_date_trans_time', 'cc_num', 'merchant',
    'lat', 'long', 'city_pop', 'job', 'dob', 'merch_lat', 'merch_long', 'merch_zipcode',
]

df_transactions = pd.read_csv('archive/credit_card_transactions.csv')

#untouched copy that still carries the is_fraud column, used later to score predictions
df_transactions_copy = df_transactions.copy()

#from here on df_transactions holds only the feature columns
df_transactions = df_transactions.drop(columns=columns_to_drop)

#raw feature table as a numpy array, encoded into numbers in a later cell
transactions = df_transactions.to_numpy()

In [4]:
#sanity check: show which feature columns are left after the drop
print(df_transactions.columns)

Index(['category', 'amt', 'gender'], dtype='object')


In [None]:
column_ids = df_transactions.columns

#lookup tables for the non-numeric columns, keyed by column position:
#  unique_indices[col] maps every distinct value seen in that column to its integer code
#  unique_indices_values[col] is the next unused code for that column
#ex, if name was a column, we might end up with 'Bob'=0, 'Cindy'=1, etc
unique_indices = {}
unique_indices_values = {}

#numeric feature matrix with the same shape as the raw table (every cell is written below)
X = np.ndarray(transactions.shape)

#reference instant (ordinal day 1); timestamps are stored as seconds elapsed since it
zero_time = datetime.fromordinal(1)

#strptime layout for each column that holds a timestamp string
timestamp_formats = {
    'trans_date_trans_time': "%Y-%m-%d %H:%M:%S",
    'dob': "%Y-%m-%d",
}

#walk the table row by row, cell by cell, and turn each raw value into a number
n_rows, n_cols = X.shape
for row in range(n_rows):
    for col in range(n_cols):
        raw_value = transactions[row, col]

        if isinstance(raw_value, (int, float)):
            #already numeric, copy it over unchanged
            X[row, col] = raw_value

        elif column_ids[col] in timestamp_formats:
            #parse the string, then measure the distance from the reference instant
            parsed = datetime.strptime(raw_value, timestamp_formats[column_ids[col]])
            X[row, col] = (parsed - zero_time).total_seconds()

        else:
            #any other value is categorical: codes are handed out in order of first appearance
            codes = unique_indices.setdefault(col, {})
            unique_indices_values.setdefault(col, 0)
            if raw_value not in codes:
                codes[raw_value] = unique_indices_values[col]
                unique_indices_values[col] += 1
            X[row, col] = codes[raw_value]

        #nan is not permissible in a BGM, so it is replaced with 0
        #(open question carried over from the original: is 0 the best replacement?)
        if math.isnan(X[row, col]):
            X[row, col] = 0

#hold out the last rows as the final test set; X keeps everything before them
final_test_size = 10000
final_test_indices = list(range(X.shape[0] - final_test_size, X.shape[0]))
X_test_final = X[final_test_indices]
X = X[:-final_test_size]


In [20]:
#deliberately its own cell, so that experiments are not wiped by accident
#run it only to reset the training results
output_info = dict()
trained_models = dict()

In [21]:
#restore previously saved results, if there are any
#a missing save file is expected (nothing has been saved yet) and is skipped silently;
#any other failure is reported, and the current dictionary is left untouched
#NOTE(review): unpickling can execute arbitrary code, only load files written by this project

try:
    #the with-block closes the file even when unpickling fails
    with open(model_save_file, 'rb') as model_input_file:
        #store the saved models back in the trained models dictionary
        trained_models = pickle.load(model_input_file)
except FileNotFoundError:
    pass
except Exception as load_error:
    print(f'could not load {model_save_file}: {load_error!r}')

try:
    with open(info_save_file, 'rb') as info_input_file:
        #store the saved statistics back in the output info dictionary
        output_info = pickle.load(info_input_file)
except FileNotFoundError:
    pass
except Exception as load_error:
    print(f'could not load {info_save_file}: {load_error!r}')

In [22]:
#start from the library's own parameter values, then override the ones that matter here
default = BayesianGaussianMixture().get_params(deep=False)

default.update({
    #a single component (the library default) is useless for this task
    'n_components': 10,
    #NOTE(review): the original note called the library default "0"; the scikit-learn docs
    #list it as None (meaning 1/n_components) - confirm which was meant
    'weight_concentration_prior': 10,
    #switch to True to fill your screen with numbers
    'verbose': False,
    #the library default of 100 often fails to converge
    'max_iter': 1000,
    #fixed seed, to keep experiments consistent
    'random_state': 0,
})

def bgm(parameters):
    '''Returns an unfitted BayesianGaussianMixture built from the notebook defaults plus overrides.

    parameters can be either a parameter dictionary or a key (as produced by key());
    anything dict() accepts works. Entries in parameters replace the matching
    entries of default for this call only. Entries whose names are not constructor
    arguments (for example a bookkeeping entry such as 'fold') are ignored.

    Raises KeyError if default is missing one of the constructor arguments.
    '''
    #work on a copy: writing into default itself would leak this call's overrides
    #into every model built afterwards
    comps = dict(default)
    comps.update(dict(parameters))

    return BayesianGaussianMixture(covariance_prior=comps['covariance_prior'],
        covariance_type=comps['covariance_type'],
        degrees_of_freedom_prior=comps['degrees_of_freedom_prior'],
        init_params=comps['init_params'],
        max_iter=comps['max_iter'],
        mean_precision_prior=comps['mean_precision_prior'],
        mean_prior=comps['mean_prior'],
        n_components=comps['n_components'],
        n_init=comps['n_init'],
        random_state=comps['random_state'],
        reg_covar=comps['reg_covar'],
        tol=comps['tol'],
        verbose=comps['verbose'],
        verbose_interval=comps['verbose_interval'],
        warm_start=comps['warm_start'],
        weight_concentration_prior=comps['weight_concentration_prior'],
        weight_concentration_prior_type=comps['weight_concentration_prior_type'])

def key(dic, additional_params: dict = None):
    '''Turns a parameter dictionary into a hashable key for the result dictionaries.

    dic: a parameter dictionary, or an existing key (anything dict() accepts)
    additional_params: optional extra entries; they override entries of dic with the same name

    Returns a frozenset of (name, value) pairs, so the order of the parameters does not matter.
    Ex. trained_models[key({'tol':1.0, 'fold':2})] will give back the model with those parameters

    Raises TypeError if a value is not hashable.
    '''
    #copy first so the caller's dictionary is never modified
    dictionary = dict(dic)
    if additional_params:
        dictionary.update(additional_params)

    #a frozenset is unordered, so sorting the pairs first gained nothing and
    #raised TypeError whenever two parameter names could not be compared
    return frozenset(dictionary.items())

In [None]:
#voting weight of every trained model, keyed like trained_models
model_weights = {}

#smallest true positive rate a model may have and still get a vote (a fraction, not a percent)
tp_threshold = 0 #0.75

#largest false positive rate a model may have and still get a vote (a fraction, not a percent)
fp_threshold = 1.0 #0.06

for model in trained_models:
    #confusion counts and decision threshold recorded for this model
    fp, tp, fn, tn, epsilon = output_info[model]['scores']

    #share of the fraudulent samples that the model caught
    tp_percent = tp/(tp + fn)

    #share of the remaining samples that the model flagged
    #NOTE(review): the count is incremented by one before dividing; the reason is not stated - confirm it is intended
    fp_percent = (fp + 1)/(output_info[model]['log_probs'].shape[0]-(tp + fn))

    #a model outside either threshold gets no vote
    #TODO: pick a good weight function
    within_thresholds = tp_percent >= tp_threshold and fp_percent <= fp_threshold
    model_weights[model] = tp_percent - fp_percent if within_thresholds else 0

#display the weights
print('Model Weights')
for model, weight in model_weights.items():
    print(model, weight)

Model Weights
frozenset({('n_components', 1)}) 0.46199999999999997
frozenset({('n_components', 2)}) 0.685
frozenset({('n_components', 3)}) 0.695
frozenset({('n_components', 4)}) 0.7
frozenset({('n_components', 5)}) 0.7
frozenset({('n_components', 6)}) 0.695
frozenset({('n_components', 7)}) 0.695
frozenset({('n_components', 8)}) 0.701
frozenset({('n_components', 9)}) 0.701
frozenset({('n_components', 10)}) 0.701


In [None]:
def classify(data):
    '''
    classifies the data by taking the weighted vote of all models in model_weights

    data: 2D array with one row per sample, in the layout the models were trained on

    each model votes "fraud" (1) for a sample whose log likelihood is below the model's
    epsilon value and "legitimate" (0) otherwise; a nan log likelihood counts as legitimate.
    votes are scaled by the model's weight, summed, and divided by the sum of all weights.

    returns a 1D float array with one normalized vote per sample
    (all zeros when the weights sum to zero, since no weighted average exists then)
    '''
    #one row of weighted votes per model; zero-initialised so that no entry is ever left unset
    classifications = np.zeros((len(model_weights), data.shape[0]))

    for row, (model_key, weight) in enumerate(model_weights.items()):
        #the decision threshold is the fifth entry of the model's recorded scores
        epsilon = output_info[model_key]['scores'][4]

        #calculate the log likelihoods
        log_likelyhoods = trained_models[model_key].score_samples(data)

        #below the threshold counts as fraudulent (weighted 1), everything else as legitimate (0)
        classifications[row] = np.where(log_likelyhoods < epsilon, weight, 0.0)

    #dividing by the sum of all weights allows for arbitrary weighting functions
    normalization_factor = sum(model_weights.values())
    if normalization_factor == 0:
        #avoid 0/0: without any weight there is no vote, so nothing is flagged
        return np.zeros(data.shape[0])

    #take the sum of the votes for each data point, normalize
    return np.sum(classifications, axis=0)/normalization_factor

In [None]:
#true fraud labels for the final test set, in the same order as its rows:
#take the is_fraud column of the untouched copy and select the held-out positions in one step
flag_indices = np.array(df_transactions_copy["is_fraud"].iloc[final_test_indices], dtype=int)

In [139]:
#the cutoff point for being marked as fraud (lower values flag more samples, higher values require stricter agreement)
fraud_cutoff = 0.5

#get the classifications for the final test set
clas = classify(X_test_final)

#a sample is flagged when at least one model voted for it and the combined vote reaches the cutoff
flagged = (clas != 0) & (clas >= fraud_cutoff)

#positions within the test set which are classified as fraud
fraud_result_indices = np.flatnonzero(flagged).tolist()

#which test samples really are fraud
actual_fraud = flag_indices == 1

#flagged and really fraud -> true positive, flagged but not fraud -> false positive
tp = int(np.count_nonzero(flagged & actual_fraud))
fp = int(np.count_nonzero(flagged & ~actual_fraud))

#the denominators come from the labels themselves rather than being typed in by hand,
#so the report stays correct when the test set changes
total_fraud = int(np.count_nonzero(actual_fraud))
total_normal = int(flag_indices.shape[0]) - total_fraud

#a rate over an empty group is undefined, report nan instead of crashing
tp_rate = tp/total_fraud * 100 if total_fraud else float('nan')
fp_rate = fp/total_normal * 100 if total_normal else float('nan')

#print out the true positive and false positive data
print(f'tp: {tp}/{total_fraud} ({tp_rate}%), fp: {fp}/{total_normal} ({fp_rate}%)')

tp: 83/132 (62.878787878787875%), fp: 224/9868 (2.2699635184434537%)


In [141]:
#print out the stats, for the benefit of everyone who is not a computer
#NOTE: the sample counts are pulled into variables before formatting, because reusing the
#same quote type inside an f-string replacement field is a SyntaxError before Python 3.12
for model in model_weights:

    #confusion counts and decision threshold recorded for this model
    fp, tp, fn, tn, epsilon = output_info[model]['scores']
    #number of transactions marked as suspicious
    pos = fp + tp
    #number of transactions ignored
    neg = fn + tn
    #number of fraudulent transactions
    flags = tp + fn
    #number of non-fraudulent transactions
    normal = output_info[model]['log_probs'].shape[0] - flags

    #human readable output
    print(dict(model), f'weight: {model_weights[model]}')
    print(f'FP:{fp}, TP:{tp}, FN:{fn}, TN:{tn}, pos:{pos}, neg:{neg}, flags:{flags}')
    print(f'correctly classified fraud:{tp}/{flags} ({tp/flags*100}%)')
    print(f'incorrectly classified normal: {fp}/{normal} ({fp/normal*100}%)')

    #check to make sure the output was generated with the final test set included
    if 'test_scores' in output_info[model]:
        fp, tp, fn, tn, epsilon = output_info[model]['test_scores']
        #number of transactions marked as suspicious
        pos = fp + tp
        #number of transactions ignored
        neg = fn + tn
        #number of fraudulent transactions
        flags = tp + fn
        #number of non-fraudulent transactions
        normal = output_info[model]['test_log_probs'].shape[0] - flags

        #human readable output
        print('*Test Set*')
        print(f'FP:{fp}, TP:{tp}, FN:{fn}, TN:{tn}, pos:{pos}, neg:{neg}, flags:{flags}')
        print(f'correctly classified fraud:{tp}/{flags} ({tp/flags*100}%)')
        print(f'incorrectly classified normal: {fp}/{normal} ({fp/normal*100}%)')
        print()
    else:
        print(f'please delete {info_save_file} and rerun the generator; the output file is incomplete')

{'n_components': 1} weight: 0.46199999999999997
FP:7, TP:235, FN:265, TN:993, pos:242, neg:1258, flags:500
correctly classified fraud:235/500 (47.0%)
incorrectly classified normal: 7/1000 (0.7000000000000001%)
*Test Set*
FP:124, TP:54, FN:78, TN:9744, pos:178, neg:9822, flags:132
correctly classified fraud:54/132 (40.909090909090914%)
incorrectly classified normal: 124/9868 (1.256586947709769%)

{'n_components': 2} weight: 0.685
FP:64, TP:375, FN:125, TN:936, pos:439, neg:1061, flags:500
correctly classified fraud:375/500 (75.0%)
incorrectly classified normal: 64/1000 (6.4%)
*Test Set*
FP:661, TP:91, FN:41, TN:9207, pos:752, neg:9248, flags:132
correctly classified fraud:91/132 (68.93939393939394%)
incorrectly classified normal: 661/9868 (6.698419132549656%)

{'n_components': 3} weight: 0.695
FP:22, TP:359, FN:141, TN:978, pos:381, neg:1119, flags:500
correctly classified fraud:359/500 (71.8%)
incorrectly classified normal: 22/1000 (2.1999999999999997%)
*Test Set*
FP:224, TP:83, FN:49,