In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.mixture import BayesianGaussianMixture
from sklearn import model_selection

import random
import math
import datetime

import pickle

In [2]:
# Selects which dataset the notebook works on: either 'ibm' or 'ether'.
# NOTE: 'ibm' does not currently work with a few of the later functions.
dataset = 'ether'

# Pickle files used to persist the trained models and their evaluation output between runs.
# NOTE: both files are listed in the .gitignore.
model_save_file = 'stored_models.pickle'
info_save_file = 'model_info.pickle'

In [3]:
# Load the raw CSV files for the selected dataset.
# In both branches df_transactions_copy keeps the frame exactly as read (fraud label
# included) for later reference, while df_transactions is a separate frame with the
# label removed so that it can never end up among the features.
if dataset == 'ibm':
    df_transactions_copy = pd.read_csv("archive/User0_credit_card_transactions.csv")
    df_cards = pd.read_csv("archive/sd254_cards.csv")
    df_users = pd.read_csv("archive/sd254_users.csv")

    # working frame without the fraud column
    df_transactions = df_transactions_copy.drop(columns=["Is Fraud?"])

    # numpy versions of the tables, used when converting rows to sample vectors
    transactions = df_transactions.to_numpy()
    cards, users = df_cards.to_numpy(), df_users.to_numpy()

elif dataset == 'ether':
    df_transactions_copy = pd.read_csv('archive/transaction_dataset.csv')

    # these columns must not be used as features: FLAG is the fraud label, and the
    # original note describes the other two as useless / harmful to the outcome
    df_transactions = df_transactions_copy.drop(columns=['Unnamed: 0', 'FLAG', 'Index'])

    transactions = df_transactions.to_numpy()

In [4]:
#NOTE: IGNORE THIS BLOCK, I AM GOING TO REWRITE IT SOON
# Purpose: for the 'ibm' dataset, build a float feature matrix X (one row per transaction)
# by joining each transaction with its user row and card row and replacing every text
# value with a number. X_reference is an object array of the same shape that receives
# the corresponding raw values.
if dataset == "ibm":

    #need to convert the CSV text values into numbers

    # next free integer code for each categorical column
    #transaction data
    num_transaction_types = 0
    num_merchant_cities = 0
    num_merchant_states = 0
    num_errors = 0

    #user data
    num_genders = 0
    num_user_cities = 0
    num_user_states = 0

    #card data
    num_brands = 0
    num_card_types = 0

    # raw value -> integer code lookup tables, filled in order of first appearance
    #transaction data
    transaction_types = {}
    merchant_cities = {}
    merchant_states = {}
    errors = {}

    #user data
    genders = {}
    user_cities = {}
    user_states = {}

    #card data
    card_brands = {}
    card_types = {}

    # reference point used to turn an "HH:MM" string into minutes past midnight
    zero_time = datetime.datetime.strptime("00:00", "%H:%M")

    # one column for every column of the three tables combined
    # NOTE(review): np.ndarray allocates without initialising, so any cell that is not
    # explicitly written below holds arbitrary memory contents
    X = np.ndarray((transactions.shape[0], transactions.shape[1] + df_cards.shape[1] + df_users.shape[1]), dtype=float)
    X_reference = np.ndarray(X.shape, dtype=object)

    for i in range(transactions.shape[0]):
        user = transactions[i, 0]
        card = transactions[i, 1]

        # the row(s) of the card table belonging to this user/card pair
        card_data = df_cards[(df_cards['User'] == user) & (df_cards['CARD INDEX'] == card)]

        #these values are already numbers; no need to change them
        #TODO: normalize some of them?
        X[i, 0:5] = transactions[i, 0:5]
        X[i, 8] = transactions[i, 8]
        X[i, 11:13] = transactions[i, 11:13]

        X_reference[i, :transactions.shape[1]] = transactions[i, :]

        #convert the time of day string into the number of minutes past midnight
        dtime = datetime.datetime.strptime(transactions[i, 5], "%H:%M")
        X[i, 5] = (dtime - zero_time).seconds//60

        #convert dollar amount from "$134.03" to 134.03
        X[i, 6] = float(transactions[i, 6][1:])

        #use ints to represent the different transaction types
        if not transactions[i, 7] in transaction_types:
            transaction_types[transactions[i, 7]] = num_transaction_types
            num_transaction_types += 1
        X[i, 7] = transaction_types[transactions[i, 7]]

        #use ints to represent the different cities
        if not transactions[i, 9] in merchant_cities:
            merchant_cities[transactions[i, 9]] = num_merchant_cities
            num_merchant_cities += 1
        X[i, 9] = merchant_cities[transactions[i, 9]]

        #use ints to represent the different states
        if not transactions[i, 10] in merchant_states:
            merchant_states[transactions[i, 10]] = num_merchant_states
            num_merchant_states += 1
        X[i, 10] = merchant_states[transactions[i, 10]]

        #use ints to represent errors
        # (the lookup table is still filled, but the feature itself is forced to 0)
        if not transactions[i, 13] in errors:
            errors[transactions[i, 13]] = num_errors
            num_errors += 1
        X[i, 13] = 0 #errors[transactions[i, 13]]
        #I disabled this because it caused too many outliers

        # user columns 1-4 are copied as-is
        # presumably the user id doubles as the row position in the users table - TODO confirm
        X[i, 14:18] = users[user, 1:5] 
        X_reference[i, transactions.shape[1]:transactions.shape[1] + users.shape[1] - 1] = users[user, 1:]

        #use ints to represent gender
        #(pretty sure the csv only has male/female, but don't want to cause any issues)
        if not users[user, 5] in genders:
            genders[users[user, 5]] = num_genders
            num_genders += 1
        X[i, 18] = genders[users[user, 5]]

        #every address is unique, I'm just going to leave it blank
        X[i, 19] = 0

        #apartment; probably not relevant
        X[i, 20] = users[user, 7]

        #use ints to represent cities
        if not users[user, 8] in user_cities:
            user_cities[users[user, 8]] = num_user_cities
            num_user_cities += 1
        X[i, 21] = user_cities[users[user, 8]]

        #use ints to represent states
        if not users[user, 9] in user_states:
            user_states[users[user, 9]] = num_user_states
            num_user_states += 1
        X[i, 22] = user_states[users[user, 9]]

        #ZIP, lat, long
        X[i, 23:26] = users[user, 10:13]

        # the next three values have their first character (presumably a "$") stripped
        #per capita - ZIP
        X[i, 26] = float(users[user, 13][1:])

        #yearly income
        X[i, 27] = float(users[user, 14][1:])

        #debt
        X[i, 28] = float(users[user, 15][1:])

        #FICO, Num credit cards
        # NOTE(review): the target slice 29:31 has two columns but the source slice 16:19
        # selects up to three values - looks like a shape mismatch, confirm the intended range
        X[i, 29:31] = users[user, 16:19]

        card_data_np = card_data.to_numpy()
        X_reference[i, transactions.shape[1] + users.shape[1] - 1:transactions.shape[1] + users.shape[1] + card_data_np.shape[1] - 3] = card_data_np[0][2:]
        #turns out I could have been using dataframes, rather than np
        # leftover debug output for one specific row
        if i == 5011: print(i, user, card_data['Card Brand'])
        # NOTE(review): card_data[...][card] is a label-based lookup on the filtered frame;
        # it relies on the row label being equal to the card index - confirm this holds
        # for every user
        if not card_data['Card Brand'][card] in card_brands:
            card_brands[card_data['Card Brand'][card]] = num_brands
            num_brands += 1
        X[i, 31] = card_brands[card_data['Card Brand'][card]]

        #use ints to represent card types
        if not card_data['Card Type'][card] in card_types:
            card_types[card_data['Card Type'][card]] = num_card_types
            num_card_types += 1
        X[i, 32] = card_types[card_data['Card Type'][card]]

        X[i, 33] = card_data['Card Number'][card]

        #read expiry date as months
        # (characters 0-1 plus 12 times characters 3-7; presumably an "MM/YYYY" string)
        X[i, 34] = float(card_data['Expires'][card][0:2]) + float(card_data['Expires'][card][3:8]) * 12

        X[i, 35] = card_data['CVV'][card]

        # anything other than the literal 'No' counts as 1
        X[i, 36] = 0 if card_data['Has Chip'][card] == 'No' else 1

        X[i, 37] = card_data['Cards Issued'][card]

        X[i, 38] = float(card_data["Credit Limit"][card][1:])

        # same month conversion as for the expiry date
        X[i, 39] = float(card_data['Acct Open Date'][card][0:2]) + float(card_data['Acct Open Date'][card][3:8]) * 12

        X[i, 40] = card_data['Year PIN last Changed'][card]

        X[i, 41] = 0 if card_data['Card on Dark Web'][card] == 'No' else 1

        #generate extra information section

        #time since last transaction (0 for first transaction)
        # only computed when the previous row belongs to the same user
        if i > 0 and df_transactions['User'][i] == df_transactions['User'][i-1]:
            transaction_time = datetime.datetime(df_transactions["Year"][i], df_transactions["Month"][i], df_transactions["Day"][i], int(df_transactions["Time"][i][0:2]), int(df_transactions['Time'][i][3:]))
            last_transaction_time = datetime.datetime(df_transactions["Year"][i - 1], df_transactions["Month"][i - 1], df_transactions["Day"][i - 1], int(df_transactions["Time"][i - 1][0:2]), int(df_transactions['Time'][i - 1][3:]))
            relative_time = transaction_time - last_transaction_time
            # print(relative_time)
            X[i, 42] = relative_time.total_seconds()
        # NOTE(review): when the branch above is skipped X[i, 42] is never assigned, so the
        # value copied here is whatever the uninitialised array happened to contain
        X_reference[i, 42] = float(X[i, 42])

        # NaN is replaced by 0 in every column of this row
        for j in range(X.shape[1]):
            if math.isnan(X[i, j]): X[i, j] = 0

        

        # print(transactions[i])    
        # print(X[i], X[i, 11], transactions[i, 11])

    # debug output: prints every feature row together with its reference row
    for i in range(X.shape[0]):
        print(X[i])
        print(X_reference[i])
    # print(X[0])

In [5]:
# Turn the raw 'ether' table into a purely numeric feature matrix X.
if dataset == 'ether':

    # column index -> {raw value: integer code}; only created for columns that
    # contain at least one non-numeric value
    unique_indices = {}
    # column index -> next unused integer code for that column
    unique_indices_values = {}

    X = np.ndarray(transactions.shape)

    # every cell is visited once, row by row
    for row in range(X.shape[0]):
        for col in range(X.shape[1]):
            value = transactions[row, col]

            if isinstance(value, (int, float)):
                # numbers are copied through unchanged
                X[row, col] = value
            else:
                # anything else gets an integer code, handed out per column in order
                # of first appearance (e.g. for a name column 'Bob'=0, 'Cindy'=1, ...)
                codes = unique_indices.setdefault(col, {})
                unique_indices_values.setdefault(col, 0)
                if value not in codes:
                    codes[value] = unique_indices_values[col]
                    unique_indices_values[col] += 1
                X[row, col] = codes[value]

            # nan is not permissible in a BGM, so it becomes 0
            # (might be wise to replace it with something else?)
            if math.isnan(X[row, col]):
                X[row, col] = 0

    # in the ether dataset, column 22 breaks the solver for some reason; leave it out
    excluded_indices = [22]
    included_indices = [col for col in range(X.shape[1]) if col not in excluded_indices]

    # keep only the remaining columns (numpy accepts a list of column indices here)
    X = X[:, included_indices]


In [6]:
def _confusion_counts(flagged, positives, negatives):
    '''Returns (tps, fps, tns, fns) as plain ints for three boolean masks of equal length.'''
    not_flagged = ~flagged
    tps = int(np.count_nonzero(flagged & positives))
    fps = int(np.count_nonzero(flagged & negatives))
    tns = int(np.count_nonzero(not_flagged & negatives))
    fns = int(np.count_nonzero(not_flagged & positives))
    return tps, fps, tns, fns


def get_measurements(log_probs:np.ndarray, transactions_df:pd.DataFrame, train_size:int=0, 
                     dynamic_epsilon:bool=True, 
                     score_function=lambda tps,fps,tns,fns: tps**1.3 - fps + tns - fns**1.3, 
                     epsilon:float=-200, test_range:int=400, test_step:int=5):
    '''
    gets the number of False Positives, True Positives, False Negatives, 
    True Negatives, and the epsilon that was used (the best one found if dynamic_epsilon=True)

    A sample is marked as fraud when its log probability is strictly below epsilon.

    log_probs: 1-D array with one log probability per evaluated sample
    transactions_df: frame with a "FLAG" column (1 = fraud, 0 = normal); the label of
        log_probs[i] is read from positional row i + train_size
    train_size: offset of the first evaluated sample inside transactions_df
    dynamic_epsilon: if True, search for the epsilon with the highest score instead of
        using the epsilon argument
    score_function: callable (tps, fps, tns, fns) -> score; higher is better
    epsilon: threshold used when dynamic_epsilon is False, or when the search has no
        candidate to try
    test_range, test_step: the search tries int(max(log_probs)), then steps down by
        test_step while staying above int(max(log_probs)) - test_range

    returns the tuple (fp, tp, fn, tn, epsilon)
    raises IndexError if a label row lies outside transactions_df
    '''
    log_probs = np.asarray(log_probs)
    n_samples = log_probs.shape[0]

    # nothing to classify: there is no maximum to start a search from either
    if n_samples == 0:
        return 0, 0, 0, 0, epsilon

    # fetch all labels once instead of doing one DataFrame lookup per sample and threshold
    flag_column = transactions_df["FLAG"].to_numpy()
    flags = flag_column[np.arange(n_samples) + train_size]
    is_fraud = np.asarray(flags == 1, dtype=bool)

    # determine which epsilon gives the best results (if dynamic is on)
    if dynamic_epsilon:
        # start below every possible score so that the best candidate is always taken,
        # even when no candidate manages to score above zero
        best_score = float('-inf')
        highest = int(np.max(log_probs))

        # Max epsilon -> Max epsilon - test range (to prevent cases with insane length)
        for test_epsilon in range(highest, highest - test_range, -test_step):
            # during the search every label other than 1 counts as "not fraud"
            tps, fps, tns, fns = _confusion_counts(log_probs < test_epsilon, is_fraud, ~is_fraud)

            # strict comparison: on a tie the earlier (larger) epsilon is kept
            fscore = score_function(tps, fps, tns, fns)
            if fscore > best_score:
                epsilon = test_epsilon
                best_score = fscore

    # final counts: only labels that are exactly 0 or 1 are counted
    is_normal = np.asarray(flags == 0, dtype=bool)
    tp, fp, tn, fn = _confusion_counts(log_probs < epsilon, is_fraud, is_normal)

    return fp, tp, fn, tn, epsilon

In [7]:
# Reset cell: discards every trained model and all gathered output info held in memory.
# It is kept in its own cell so that experiments are not wiped by accident when other
# cells are re-run; run it only to restart the training from scratch.
trained_models = {}
output_info = {}

In [8]:
# Restore previously saved state, if there is any.
# A missing save file is expected on a first run and is skipped silently. Any other
# failure is reported but does not stop the notebook; the in-memory dictionary is then
# simply left as it was.
# NOTE: unpickling can execute arbitrary code - only load save files you created yourself.

try:
    #if it exists, load the previously trained models back into the trained models dictionary
    # (the context manager closes the file even when unpickling fails)
    with open(model_save_file, 'rb') as model_input_file:
        trained_models = pickle.load(model_input_file)
except FileNotFoundError:
    pass
except Exception as load_error:
    print(f'could not load the saved models: {load_error!r}')

try:
    #if it exists, load the previously gathered data back into the output info dictionary
    with open(info_save_file, 'rb') as info_input_file:
        output_info = pickle.load(info_input_file)
except FileNotFoundError:
    pass
except Exception as load_error:
    print(f'could not load the saved model info: {load_error!r}')

In [9]:
# Baseline hyper-parameters: start from the estimator's own defaults, then override a few.
default = BayesianGaussianMixture().get_params(0)

default.update({
    'n_components': 10,                #the library default is 1, which is useless
    'weight_concentration_prior': 10,  #the library default (None, i.e. 1/n_components) works badly here
    'verbose': False,                  #switch to True, to fill your screen with numbers
    'max_iter': 1000,                  #defaults to 100, but that often fails to converge
    'random_state': 0,                 #to keep experiments consistent
})

# constructor arguments that bgm() forwards to BayesianGaussianMixture
_BGM_PARAM_NAMES = (
    'covariance_prior',
    'covariance_type',
    'degrees_of_freedom_prior',
    'init_params',
    'max_iter',
    'mean_precision_prior',
    'mean_prior',
    'n_components',
    'n_init',
    'random_state',
    'reg_covar',
    'tol',
    'verbose',
    'verbose_interval',
    'warm_start',
    'weight_concentration_prior',
    'weight_concentration_prior_type',
)

def bgm(parameters):
    '''Returns a new, unfitted BayesianGaussianMixture based on the parameters

    parameters can be either a key (as produced by key()) or a dictionary; anything
    accepted by dict() works. Values given here override the module-level defaults.
    Entries that are not constructor arguments (e.g. 'fold') are ignored.
    '''
    overrides = dict(parameters)

    # merge into a fresh dictionary: writing into `default` itself would make the
    # overrides of one model leak into every model created afterwards
    settings = {**default, **overrides}

    return BayesianGaussianMixture(**{name: settings[name] for name in _BGM_PARAM_NAMES})

def key(dic, additional_params:dict={}):
    '''Builds a hashable lookup key from a parameter dictionary

    dic may be a dictionary or an existing key; entries of additional_params are added
    on top and win on a name clash. The result is a frozenset of (name, value) pairs,
    so it can be used as a dictionary key.
    Ex. trained_models[key({'tol':1.0, 'fold':2})] will give back the model with those parameters'''
    merged = dict(dic)
    merged.update({name: additional_params[name] for name in additional_params})

    return frozenset(sorted(merged.items()))

In [10]:
#the models to be trained, as a list of keys
test_models = [key({'weight_concentration_prior':float(i)}) for i in range(10, 20, 5)]

#the first train_size rows form the training set, all remaining rows the test set
train_size = 5000
X_train = X[:train_size]
X_test = X[train_size:]

#for each key
for model in test_models:
    #train only the models that are not already in the trained dictionary
    if model not in trained_models:
        #generate a model with parameters that match the key
        m = bgm(model)

        #fit the model to the training data
        m.fit(X_train)

        #print the key so that you don't have to worry about whether or not the code is working
        print(model)

        #save the model to the trained models
        trained_models[model] = m

    #evaluate only the models that are not already in the data dictionary
    if model not in output_info:
        fitted = trained_models[model]

        #collect the results in a local dictionary first and register it only once it is
        #complete; an interrupted evaluation would otherwise leave a partial entry behind
        #that is never recomputed
        info = {}
        info['labels'] = fitted.predict(X_test)
        info['probs'] = fitted.predict_proba(X_test)
        info['log_probs'] = fitted.score_samples(X_test)
        info['scores'] = get_measurements(info['log_probs'], df_transactions_copy, train_size)
        output_info[model] = info


In [11]:
#NOTE: this overwrites the previous save files
# (nothing is lost as long as the earlier cell which loads the save files back into
# memory was run first)

#save the trained models; the context manager closes the file even if pickling fails
with open(model_save_file, 'wb') as model_output_file:
    pickle.dump(trained_models, model_output_file)

#save the test info
with open(info_save_file, 'wb') as info_output_file:
    pickle.dump(output_info, info_output_file)