In [36]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.mixture import BayesianGaussianMixture
from sklearn import model_selection

import random
import math
from datetime import datetime

import pickle

In [2]:
#files the trained models and their evaluation info are pickled to
#NOTE: both are listed in the gitignore
info_save_file = 'model_info.pickle'
model_save_file = 'stored_models.pickle'

#which dataset to run on: 'credit' or 'ether'
#NOTE: a few of the later functions do not currently work with credit
dataset = 'credit'

In [68]:
#for each supported dataset: the CSV to read and the columns withheld from the model input
#(the fraud label column is among them; it stays available in df_transactions_copy)
dataset_sources = {
    'ether': ('archive/transaction_dataset.csv',
              ['FLAG', 'Index']),
    'credit': ('archive/credit_card_transactions.csv',
               ['is_fraud', 'first', 'last', 'city', 'street', 'state', 'trans_num']),
}

#fail here, rather than with a confusing NameError in a later cell
if dataset not in dataset_sources:
    raise ValueError(f'unknown dataset {dataset!r}; expected one of {sorted(dataset_sources)}')

csv_path, dropped_columns = dataset_sources[dataset]

#the untouched frame, fraud flag included, kept for later reference
df_transactions_copy = pd.read_csv(csv_path)

#drop() returns a new frame, so df_transactions_copy keeps every column
df_transactions = df_transactions_copy.drop(columns=dropped_columns)

transactions = df_transactions.to_numpy()

In [None]:
if dataset == 'credit':
    column_ids = df_transactions.columns

    #per column: value -> integer code, and the next unused code
    unique_indices = {}
    unique_indices_values = {}

    X = np.empty(transactions.shape)

    #timestamps are stored as seconds elapsed since this reference datetime
    zero_time = datetime.fromordinal(1)

    #strptime layout of each timestamp column
    time_formats = {'trans_date_trans_time': "%Y-%m-%d %H:%M:%S", 'dob': "%Y-%m-%d"}

    row_count, column_count = X.shape

    #walk every component of every element of the dataset
    for i in range(row_count):
        for j in range(column_count):
            value = transactions[i, j]
            column_name = column_ids[j]

            #numbers are copied over unchanged
            if isinstance(value, (int, float)):
                X[i, j] = value

            #timestamp strings become seconds since zero_time
            elif column_name in time_formats:
                parsed = datetime.strptime(value, time_formats[column_name])
                X[i, j] = (parsed - zero_time).total_seconds()

            #anything else gets one integer per distinct value, numbered in order of first appearance
            #ex, if name was a column, we might have 'Bob'=0, 'Cindy'=1, etc
            else:
                codes = unique_indices.setdefault(j, {})
                unique_indices_values.setdefault(j, 0)
                if value not in codes:
                    codes[value] = unique_indices_values[j]
                    unique_indices_values[j] += 1
                X[i, j] = codes[value]

    #nan is not permissible in a BGM, so replace it with 0
    #might be wise to replace with something else?
    X[np.isnan(X)] = 0

    #column 13 is left out; the original note says it breaks the solver
    #NOTE(review): that note referred to the ether dataset although this branch handles credit - confirm
    excluded_indices = [13]
    included_indices = [index for index in range(X.shape[1]) if index not in excluded_indices]

    #keep only the included columns
    X = X[:, included_indices]


In [None]:
if dataset == 'ether':

    #per column: value -> integer code, and the next unused code
    unique_indices = {}
    unique_indices_values = {}

    X = np.empty(transactions.shape)

    row_count, column_count = X.shape

    #walk every component of every element of the dataset
    for i in range(row_count):
        for j in range(column_count):
            value = transactions[i, j]

            #numbers are copied over unchanged
            if isinstance(value, (int, float)):
                X[i, j] = value
                continue

            #anything else gets one integer per distinct value, numbered in order of first appearance
            #ex, if name was a column, we might have 'Bob'=0, 'Cindy'=1, etc
            codes = unique_indices.setdefault(j, {})
            unique_indices_values.setdefault(j, 0)
            if value not in codes:
                codes[value] = unique_indices_values[j]
                unique_indices_values[j] += 1
            X[i, j] = codes[value]

    #nan is not permissible in a BGM, so replace it with 0
    #might be wise to replace with something else?
    X[np.isnan(X)] = 0

    #column 22 is left out; per the original note it breaks the solver, for reasons not yet understood
    excluded_indices = [22]
    included_indices = [index for index in range(X.shape[1]) if index not in excluded_indices]

    #keep only the included columns
    X = X[:, included_indices]


In [45]:
def get_measurements(log_probs:np.ndarray, transactions_df:pd.DataFrame, train_size:int=0,
                     dynamic_epsilon:bool=True,
                     score_function=lambda tps,fps,tns,fns: tps**1.3 - fps + tns - fns**1.3,
                     epsilon:float=-200, test_range:int=400, test_step:int=5):
    '''
    Counts False Positives, True Positives, False Negatives and True Negatives
    obtained by marking every sample whose log probability is below epsilon as fraud.

    log_probs: one log probability per sample
    transactions_df: frame holding the true label in a 'FLAG' or an 'is_fraud' column
        (whichever is present; 'FLAG' is checked first), 1 meaning fraud and 0 meaning normal
    train_size: positional offset into transactions_df; sample i is compared with row i + train_size
    dynamic_epsilon: if True, search for the epsilon which maximizes score_function
    score_function: called as score_function(tps, fps, tns, fns); larger is better
    epsilon: threshold used when dynamic_epsilon is False, or when the search range is empty
    test_range, test_step: the search tries int(max(log_probs)), then steps down by test_step,
        stopping before int(max(log_probs)) - test_range (to prevent cases with insane length)

    returns (fp, tp, fn, tn, epsilon), where epsilon is the threshold actually used

    raises KeyError if neither label column exists, and IndexError if the offset rows
    fall outside transactions_df. Empty log_probs gives all-zero counts.
    '''
    log_probs = np.asarray(log_probs)
    sample_count = log_probs.shape[0]

    # nothing to classify, so there is nothing to search either
    if sample_count == 0:
        return 0, 0, 0, 0, epsilon

    # find the label column from the frame itself instead of relying on a notebook global
    label_column = next((name for name in ('FLAG', 'is_fraud') if name in transactions_df.columns), None)
    if label_column is None:
        raise KeyError("transactions_df needs a 'FLAG' or an 'is_fraud' column")

    # true labels of the scored rows; positional, so out of range offsets raise IndexError
    positions = np.arange(sample_count) + train_size
    labels = transactions_df[label_column].to_numpy()[positions]

    fraud_mask = np.asarray(labels == 1, dtype=bool)
    fraud_total = int(np.count_nonzero(fraud_mask))

    # determine which epsilon gives the best results (if dynamic is on)
    if dynamic_epsilon:
        # start below every possible score, so the best threshold wins even if all scores are <= 0
        best_score = float('-inf')
        highest = int(log_probs.max())

        for test_epsilon in range(highest, highest - test_range, -test_step):
            marked = log_probs < test_epsilon

            # during the search, every label other than 1 is treated as not fraud
            tps = int(np.count_nonzero(marked & fraud_mask))
            fps = int(np.count_nonzero(marked)) - tps
            fns = fraud_total - tps
            tns = sample_count - fraud_total - fps

            # strict comparison: on ties the first (highest) epsilon is kept
            fscore = score_function(tps, fps, tns, fns)
            if fscore > best_score:
                epsilon = test_epsilon
                best_score = fscore

    # final counts; rows whose label is neither 0 nor 1 are left out of all four
    marked = log_probs < epsilon
    normal_mask = np.asarray(labels == 0, dtype=bool)

    fp = int(np.count_nonzero(marked & normal_mask))
    tn = int(np.count_nonzero(normal_mask)) - fp
    tp = int(np.count_nonzero(marked & fraud_mask))
    fn = fraud_total - tp

    return fp, tp, fn, tn, epsilon

In [78]:
#kept in a cell of its own so that finished experiments are not wiped by accident
#run it only to reset the training
trained_models, output_info = {}, {}

In [17]:
#restore previously saved work; a missing save file is expected on a first run and is not an error
#NOTE: unpickling can execute arbitrary code, so only load save files created by this notebook

try:
    #if it exists, load the previously trained models back into the trained models dictionary
    with open(model_save_file, 'rb') as model_input_file:
        trained_models = pickle.load(model_input_file)
except FileNotFoundError:
    pass
except Exception as error:
    #keep whatever is in memory, but do not hide the problem
    print(f'could not load {model_save_file}: {error!r}')

try:
    #if it exists, load the previously gathered data back into the output info dictionary
    with open(info_save_file, 'rb') as info_input_file:
        output_info = pickle.load(info_input_file)
except FileNotFoundError:
    pass
except Exception as error:
    #keep whatever is in memory, but do not hide the problem
    print(f'could not load {info_save_file}: {error!r}')

In [79]:
#start from the estimator's own parameter set, then override a few entries
default = BayesianGaussianMixture().get_params(deep=False)

default.update({
    'n_components': 10, #the library default is 1, which is useless
    'weight_concentration_prior': 10, #replaces the library default
    'verbose': False, #switch to True to fill your screen with numbers
    'max_iter': 1000, #the library default of 100 often fails to converge
    'random_state': 0, #to keep experiments consistent
})

def bgm(parameters):
    '''Returns a BayesianGaussianMixture based on the parameters\n
    parameters can be either a key (as built by key()) or a dictionary\n
    Entries override the values in default; entries which are not parameters of
    BayesianGaussianMixture (ex. a fold number stored in the key) are ignored
    '''
    #work on a copy: writing into default itself would leak overrides into every later model
    comps = dict(default)
    comps.update(dict(parameters))

    #only pass on what the estimator actually accepts
    accepted = BayesianGaussianMixture().get_params(deep=False)

    return BayesianGaussianMixture(**{name: comps[name] for name in accepted})

def key(dic, additional_params:dict=None):
    '''uses a parameter dictionary to save keys for easier lookup, from a dictionary\n
    Ex. trained_models[key({'tol':1.0, 'fold':2})] will give back the model with those parameters\n
    dic can be a dictionary or an existing key; additional_params are added on top,
    replacing entries of dic with the same name\n
    returns a frozenset of (name, value) pairs, so all values must be hashable'''
    dictionary = dict(dic)
    if additional_params:
        dictionary.update(additional_params)

    #a frozenset has no order, so no sorting is needed (sorting fails on names of mixed types)
    return frozenset(dictionary.items())

In [86]:
#keys of the models to be trained
test_models = [
    key({'weight_concentration_prior': float(prior), 'demonstration key (does nothing)': 7})
    for prior in range(10, 21, 10)
]

#the first train_size elements are the training set, the next test_size elements the test set
train_size = 5000
test_size = 5000
X_train = X[:train_size]
X_test = X[train_size:train_size + test_size]

for model in test_models:
    #train only the keys which have no stored model yet
    if model not in trained_models:
        #build a model whose parameters match the key and fit it to the training data
        m = bgm(model)
        m.fit(X_train)

        #show the key, as a sign that the code is making progress
        print(model)

        trained_models[model] = m

    #evaluate only the keys which have no stored results yet
    if model in output_info:
        continue

    fitted = trained_models[model]

    #collect the results in a labeled dictionary
    info = {
        'labels': fitted.predict(X_test),
        'probs': fitted.predict_proba(X_test),
        'log_probs': fitted.score_samples(X_test),
    }
    info['scores'] = get_measurements(info['log_probs'], df_transactions_copy, train_size,
                                      score_function=lambda tps, fps, tns, fns: tps**2 - fps)

    #stored last, so that half finished results are never saved
    output_info[model] = info


frozenset({('demonstration key', 7), ('weight_concentration_prior', 10.0)})
frozenset({('demonstration key', 7), ('weight_concentration_prior', 20.0)})


In [11]:
#NOTE: this overwrites the previous save files
# (nothing is lost as long as the earlier cell which loads the save files back into memory
# was run first)

#save the trained models, then the test info
#the with block closes each file even if pickling fails part way through
for save_path, payload in ((model_save_file, trained_models), (info_save_file, output_info)):
    with open(save_path, 'wb') as output_file:
        pickle.dump(payload, output_file)

In [90]:
#print out the stats, for the benefit of everyone who is not a computer
for model in output_info:

    fp, tp, fn, tn, epsilon = output_info[model]['scores']
    #number of transactions marked as suspicious
    pos = fp + tp
    #number of transactions ignored
    neg = fn + tn
    #number of fraudulent transactions
    flags = tp + fn
    #number of remaining (normal) transactions among the scored samples
    normals = output_info[model]['log_probs'].shape[0] - flags

    #a test set without fraud (or without normal transactions) gives nan instead of a ZeroDivisionError
    fraud_percent = tp / flags * 100 if flags else float('nan')
    normal_percent = fp / normals * 100 if normals else float('nan')

    print(dict(model))
    print(f'FP:{fp}, TP:{tp}, FN:{fn}, TN:{tn}, pos:{pos}, neg:{neg}, flags:{flags}')
    print(f'correctly classified fraud:{tp}/{flags} ({fraud_percent}%)')
    print(f'incorrectly classified normal: {fp}/{normals} ({normal_percent}%)')
    print()

{'weight_concentration_prior': 10.0}
FP:30, TP:16, FN:9, TN:4945, pos:46, neg:4954, flags:25
correctly classified fraud:16/25 (64.0%)
incorrectly classified normal: 30/4975 (0.6030150753768844%)

{'weight_concentration_prior': 20.0}
FP:30, TP:16, FN:9, TN:4945, pos:46, neg:4954, flags:25
correctly classified fraud:16/25 (64.0%)
incorrectly classified normal: 30/4975 (0.6030150753768844%)

{'demonstration key': 7, 'weight_concentration_prior': 10.0}
FP:82, TP:21, FN:4, TN:4893, pos:103, neg:4897, flags:25
correctly classified fraud:21/25 (84.0%)
incorrectly classified normal: 82/4975 (1.6482412060301508%)

{'demonstration key': 7, 'weight_concentration_prior': 20.0}
FP:82, TP:21, FN:4, TN:4893, pos:103, neg:4897, flags:25
correctly classified fraud:21/25 (84.0%)
incorrectly classified normal: 82/4975 (1.6482412060301508%)

