In [1]:
# Data Handeling
import pandas as pd
import numpy as np
import time
from copy import deepcopy

# Data Exploration
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
import seaborn as sns
import matplotlib.pyplot as plt

#random
import random

# Classification Models
from sklearn.neural_network import MLPClassifier
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.gaussian_process import GaussianProcessClassifier

# Pre processing
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SequentialFeatureSelector

# Metric Calculations
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

# Synthetic data generation library
# import sdv

# Optimization
from sklearn.model_selection import GridSearchCV

# Sampling Libraries
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Concept Drift Detection tools
from sklearn.cluster import KMeans
from scipy import stats
from scipy.spatial import distance

# Custom Synthetic Data Creation Functions using SDV
import synth_data_lib_v2_extra_drift

# supress warning outputs
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Group 2 Bank Fraud Detection
# Stephen Montgomery (smontg12@depaul.edu)
# John Bolgert (jbolgert@depaul.edu)

In [2]:
df = pd.read_csv('raw.csv')

# Pre Processing 

### Obtain median cost of Fraud

In [3]:
# store median fraud to be used later in optimization as the cost of FN
median_fraud = df[(df['fraud_bool']==1)&(df['intended_balcon_amount']>0)]['intended_balcon_amount'].median()
print('The Meidan Fraud Transfer amount is :', median_fraud)

The Meidan Fraud Transfer amount is : 34.70264688784764


In [4]:
# use months 7 for base data and 6 for adding in noise each day of the simulation
df_use = df[(df['month']==7) | (df['month']==6)]

### Null Values

### Addressing null values

In [5]:
### Discretize each feature that has greater than 10% missing values for fraud 
features = list(df_use.columns)
total_rows = len(df_use)
total_fraud = len(df_use[df_use['fraud_bool']==1])

# Numeric features with less than 25 unqiue values in the base data get discretized into bins if less than 5 they are
# directly converted using their assoicated number if larger than they get binned into small large based on median value
for col in features:
    bins = []
    if is_numeric_dtype(df_use[col]):
        if col == 'fraud_bool' or col=='month' :
            continue
        if len(df_use[col].unique())<5 or col=='month':
            if len(df_use[df_use[col]<0])>0:
                print(col, ' convert numeric to categorical feature')
                total_null = len(df_use[df_use[col]<0])
                fraud_null = len(df_use[(df_use[col]<0) & (df_use['fraud_bool']==1)])
                df_use[col]=df_use[col].astype(str)
        elif len(df_use[col].unique())<25:
            print(col, ' discretize feature into 2 bins')
            # Discretize the feature
            med_val = df_use[col].median()
            min_val = df_use[col].min()
            max_val = df_use[col].max()
            bin_labels = ['Small','Large']
            
            bins = [min_val-1, med_val, max_val]
            df_use[col] = pd.cut(df_use[col], bins=bins, labels=bin_labels)
            
        elif len(df_use[col].unique())<100:
            # if less than 100 unqiue vlaues then value gets discretized into 5 bins
            print(col, ' discretize feature into 5 bins')
            # Discretize the feature
            med_val = df_use[col].median()
            min_val = df_use[col].min()
            max_val = df_use[col].max()
            bin_labels = ['bin1','bin2','bin3','bin4','bin5']
            
            for i in range(6):
                if i == 0:
                    bins.append(min_val-1)
                else: 
                    bins.append(i*(max_val - min_val)/5)
            bins = [round(num) for num in bins]    
            print(bins)
            df_use[col] = pd.cut(df_use[col], bins=bins, labels=bin_labels)
            
        else:
            if len(df_use[df_use[col]<0])>0:
                total_null = len(df_use[df_use[col]<0])
                fraud_null = len(df_use[(df_use[col]<0) & (df_use['fraud_bool']==1)])
                # discretize the feature if more than 10% fraud values are negative 
                if fraud_null/total_fraud > .1:
                    print(col, ' discretize feature into 5 bins')
                    # Discretize the feature
                    min_val = df_use[col].min()
                    max_val = df_use[col].max()
                    bins = [min_val-1]
                    bin_labels = ['Missing']
                    for i in range(5):
                        bins.append(i*max_val/4)
                        bin_labels.append('bin_{}'.format(i))
                    bin_labels.pop()
                    df_use[col] = pd.cut(df_use[col], bins=bins, labels=bin_labels)
                else:
                    print(col, ' fill using median')
                    # Fill negative values with median vallue
                    med= df[col].median()
                    df_use.loc[df_use[col]<0,col]=med

income  discretize feature into 2 bins
prev_address_months_count  discretize feature into 5 bins
current_address_months_count  fill using median
customer_age  discretize feature into 2 bins
intended_balcon_amount  discretize feature into 5 bins
velocity_6h  fill using median
date_of_birth_distinct_emails_4w  discretize feature into 5 bins
[-1, 7, 15, 22, 30, 37]
credit_risk_score  fill using median
bank_months_count  discretize feature into 5 bins
[-2, 7, 13, 20, 26, 33]
proposed_credit_limit  discretize feature into 2 bins
session_length_in_minutes  fill using median
device_distinct_emails_8w  convert numeric to categorical feature


In [6]:
# split the base data into data to be simulated and data to be added to simulated data to represent noise

# df add will be used as noise and will not be trained on for the base model
df_add = df_use[df_use['month']==6]
df_add = df_add[df_add['fraud_bool']==0]

# df use will be base data inital model is trained on
df_use = df_use[df_use['month']==7]
df_use.drop('month',axis=1,inplace=True)
df_add.drop('month',axis=1,inplace=True)

In [7]:
# Splitting into dependent vs. independent features
y_use = df_use['fraud_bool']
x_use = df_use.drop('fraud_bool',axis=1)

### Categorical Conversion

In [8]:
# Convert Categorical
df_dummies = pd.get_dummies(x_use)

### Data Normalization

In [9]:
# Normalize all the columns to range between 0 and 1
x = df_dummies.values
cols = df_dummies.columns
min_max_scaler_base = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler_base.fit_transform(x)
x_use_scaled = pd.DataFrame(x_scaled, columns = cols)

### Train Test Split portion of overall data

In [10]:
x_train,  x_test, y_train , y_test = train_test_split(x_use_scaled, y_use, test_size=0.33, random_state=42)

### Feature Importance

In [11]:
# Rank Feature importance based on their mean decrease in impurity using random forest claissifer
clf = RandomForestClassifier(random_state=0)
features = x_train.columns
clf.fit(x_train, y_train)
importances = clf.feature_importances_
len(importances)

73

# Model Metric & Training Custom Functions

### Cost Savings Optimization Function 

In [12]:
### Define Custom Preformance Metrics Function
def optimize_cost(Cost_FN, Cost_FP,probabilities, y_test):
    # function to select optimal probability given the custom loss function and will return optimal prob 
    # for optimizing cost savings or f1 score
    opt_savings = 'NULL'
    opt_prob = 'NULL'
    opt_f1 = ['NULL','NULL']
    # iterate through and store optimal probability for cost savings and F1 Score
    for prob in np.arange(.01,1,.01):
        predictions = probabilities[:,1].copy()
        predictions[predictions>=prob] = 1
        predictions[predictions<prob] = 0
        cm = confusion_matrix(y_test, predictions)
        savings = cm[1,1]*Cost_FN - cm[0,1]*Cost_FP
        f1_score_iter = f1_score(y_test, predictions)
        if opt_savings=='NULL':
            opt_savings = savings
            opt_prob = prob
        elif savings > opt_savings:
            opt_savings = savings
            opt_prob = prob
        if opt_f1[0]=='NULL':
            opt_f1[0] = f1_score_iter
            opt_f1[1] = prob
        elif f1_score_iter > opt_f1[0]:
            opt_f1[0] = f1_score_iter
            opt_f1[1] = prob
    # calculate the F1 score given the cost savings optimal probability 
    predictions = probabilities[:,1]
    predictions[predictions>=opt_prob] = 1
    predictions[predictions<opt_prob] = 0 
    f1 = f1_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    roc_auc = roc_auc_score(y_test, probabilities[:, 1])
    return ([opt_prob, opt_savings,f1 ,precision,recall,roc_auc,opt_f1] )

# Model Building & Optimization on Base Data

### Base Models Selected are
    1. Gradient Boosting Classifier (All Features ROS .25)
    2. Stochastic Gradient Descent Classifier (All Features RUS .5)

In [13]:
# Define Optimization metrics for custom loss function
cost_fp = 10
cost_fn = median_fraud

### Model building: SGDClassifier

In [14]:
# Find Optimal Feature Sampeling & ratio Amount
print('starting SGDClassifier Optimization')
start_time = time.time()

# optimize parameters 
SGD = SGDClassifier(loss='log_loss')
SGD_parameters = {
        'alpha':[.01,.001,.0001],
        'max_iter':[200,1000,2000]
        }
opt_SGD = GridSearchCV(SGD, SGD_parameters)

# Apply Sampeling technique
oversample = RandomOverSampler(sampling_strategy=.25)
x_sampled, y_sampled = oversample.fit_resample(x_train,y_train)

# Fit and optimize the model
opt_SGD.fit(x_sampled, y_sampled)
probabilities_SGD = opt_SGD.predict_proba(x_test)

terms = optimize_cost(cost_fn, cost_fp,probabilities_SGD, y_test)
probabilities_SGD = opt_SGD.predict_proba(x_test)
probabilities_SGD = probabilities_SGD[:,1]


print("Optimal Confidence for savings = {} @ savings = {}".format(terms[0], terms[1]))
SGD_Savings_proba = terms[0]
print("F1 training metric for cost savings optimization = ",terms[2])
SGD_Savings_F1 = terms[2]
print("Optimal Confidence for F1 = {} @ f1 score = {}".format(terms[-1][1], terms[-1][0]))
SGD_F1_proba = terms[-1][1]
SGD_F1_F1 =  terms[-1][0]
print("Seconds Taken to SGDClassifier: ", (time.time() - start_time))

starting SGDClassifier Optimization
Optimal Confidence for savings = 0.81 @ savings = 1306.2117510278113
F1 training metric for cost savings optimization =  0.22130013831258644
Optimal Confidence for F1 = 0.66 @ f1 score = 0.2803445575567737
Seconds Taken to SGDClassifier:  8.040725946426392


### Model building: GradientBoostingClassifier

In [15]:
# Find Optimal Feature Sampeling & ratio Amount
print('starting GradientBoostingClassifier Optimization')
start_time = time.time()

    
# optimize parameters 
GBC = GradientBoostingClassifier()
GBC_parameters = {
        'n_estimators':[50,100,200],
        'max_depth':[2,3,5],
        'min_samples_split':[2,10]
    }
opt_GBC = GridSearchCV(GBC, GBC_parameters)

# Apply Sampeling technique
undersample = RandomUnderSampler(sampling_strategy=.5)
x_sampled, y_sampled = undersample.fit_resample(x_train,y_train)

# Fit and optimize the model
opt_GBC.fit(x_sampled, y_sampled)
probabilities_GBC = opt_GBC.predict_proba(x_test)
terms = optimize_cost(cost_fn, cost_fp,probabilities_GBC, y_test)
probabilities_GBC = opt_GBC.predict_proba(x_test)
probabilities_GBC = probabilities_GBC[:,1]

print("Optimal Confidence for savings = {} @ savings = {}".format(terms[0], terms[1]))
GBC_Savings_proba = terms[0]
print("F1 training metric for cost savings optimization = ",terms[2])
GBC_Savings_F1 = terms[2]
print("Optimal Confidence for F1 = {} @ f1 score = {}".format(terms[-1][1], terms[-1][0]))
GBC_F1_proba = terms[-1][1]
GBC_F1_F1 =  terms[-1][0]
print("Seconds Taken to SGDClassifier: ", (time.time() - start_time))

print("Seconds Taken to GradientBoostingClassifier: ", (time.time() - start_time))

starting GradientBoostingClassifier Optimization
Optimal Confidence for savings = 0.92 @ savings = 1639.7249854670495
F1 training metric for cost savings optimization =  0.23876404494382023
Optimal Confidence for F1 = 0.86 @ f1 score = 0.27755905511811024
Seconds Taken to SGDClassifier:  90.84873008728027
Seconds Taken to GradientBoostingClassifier:  90.84873008728027


# Monte Carlo Simulation 

### Simulation Outline
    1. Simulation has 2 for loops outer loop is number of complete iterations to run inner loop is number of simulated days in one iteration
    2. Inner loop should creates 20 new datasets in each loop with each day having 10% new data injected into the non fraud simulated data and a complete distribution shift to fraud on days 4, 9, & 14. Each day simualtes data from the previous day before noise is added to it. 
    3. new data is statistically similar and sampeled from the base data
    4. each day each of the 9 strategies per model predicts on the data to get results than tries to detect concept drift given its strategy. If it detects concept drift it retrains given its retrain strategy

### Concept Detection & Retraining Stragies Outline (total 9 strategies)

### Defining Concept Drift Detection Strategies

    1. Retrain once testing(New Day) f1 score drops below its f1 training or re-training metric. 
    2. Retrain using statistical signficance tsest Kolmogorov-Smirnov test for goodness of fit on each of the continous numeric features and chi Squared test on categorical features. In cases where goodness of fit does not work we use a distance measurment and compare that new days distance measurment to the previous days means plus 1 standrd deviation distance measurments. if the distance excedes this number we consider the distribution changes. If more than a certain threshold of features show significant shift retrain. 
    3. Retrain using Population Stability Index on models predicted probability and retrain if PSI is greater than .1

### Defining Retraining Strategies

    1. Retrain on current days data using (RUS mintority class @ 50%)
    2. Retrain on past up to past 3 days data using (RUS mintority class @ 50%)
    3. Retrain on current days data using (RUS mintority class @ 50%) + add previous past 2 days data with undersampeling less than 50% giving more emphasis to current days data

### Data Pre-Processing

In [16]:
# function takes dataframe and preforms pre processing to get dummy varibels and ensure all neccesary columns are present
def pre_processing(data_frame,req_cols,model_dict_obj=None,retrain=0):
    y = data_frame['fraud_bool']
    x = data_frame.drop(['fraud_bool','day'],axis=1).copy()

    # Check for days or fraud_bool in req cols
    for col in req_cols:
        if col in ['day','fraud_bool']:
            req_cols.remove(col)
            
    # Convert Categorical
    df_dummies = pd.get_dummies(x)

    
    x_vals = df_dummies.values
    cols = df_dummies.columns
    # if retrain is zero use model objects min max class
    if retrain == 0:
        min_max_scaler = model_dict_obj[11]
        x_scaled = min_max_scaler.fit_transform(x_vals)
        x_use_scaled = pd.DataFrame(x_scaled, columns = cols)
    
    # if we are retraining create a new min man object to store in the model object 
    else:
        min_max_scaler = preprocessing.MinMaxScaler()
        x_scaled = min_max_scaler.fit_transform(x_vals)
        x_use_scaled = pd.DataFrame(x_scaled, columns = cols)
    
    # find columns that are not in new data that are in base data and add them to the base data
    add_cols = list(set(req_cols) - set(cols))
    subtract_cols = list(set(cols) - set(req_cols))
    if len(add_cols) > 0:
        for new_col in add_cols:
            if new_col == 'day':
                continue
            else:
                x_use_scaled[new_col]=0
    # Remove columns that are not in base data that are in new data
    if len(subtract_cols) > 0:
        for new_col in subtract_cols:
            if new_col == 'day':
                continue
            else:
                x_use_scaled = x_use_scaled.drop(new_col,axis=1)
    # Return the object used for min max scaling as well as the pre-processed x and y values 
    x_use_scaled = x_use_scaled[list(req_cols)]
    return(x_use_scaled, y, min_max_scaler)

### Retraining Functions

In [17]:
# function used to retrain on simulated data given the strategies retrain method
def re_training(data,sampeling_ratio,strategy,base_model,req_cols):
    max_day = data['day'].max()

    # retrain on 1 days worth of data
    if strategy==1:  
        
        re_train = data[data['day']==max_day]
        
        # Pre processing for new data
        x_train,y_train,min_max_scaler = pre_processing(re_train,req_cols,retrain=1)
        
        # sampeling 
        undersample = RandomUnderSampler(sampling_strategy=sampeling_ratio)
        x_sampled, y_sampled = undersample.fit_resample(x_train,y_train)
        base_model.fit(x_sampled, y_sampled)
        return(base_model,x_train,y_train,min_max_scaler)
    
    # retrain on 3 days worth of data
    elif strategy==2: 
        
        
        re_train = data[data['day']>max_day-3]
        # Pre processing for new data
        x_train,y_train,min_max_scaler = pre_processing(re_train,req_cols,retrain=1)
        
        # Sampeling 
        undersample = RandomUnderSampler(sampling_strategy=sampeling_ratio)
        x_sampled, y_sampled = undersample.fit_resample(x_train,y_train)
        base_model.fit(x_sampled, y_sampled)
        return(base_model,x_train,y_train,min_max_scaler)
    # retrain on 3 days worth of data but give more weight to most recent day
    else:
        
        
        re_train = data[data['day']>(max_day-3)]
        days = re_train['day']
        # Pre processing for new data
        
        x_train,y_train,min_max_scaler = pre_processing(re_train,req_cols,retrain=1)
        
        # sampeling for new days data
        x_train['fraud_bool'] = list(y_train)
        post_proc_data = x_train.copy()
        x_train.drop('fraud_bool',axis=1,inplace=True)
        
        
        post_proc_data['day']= list(days)
        
        
        re_train = post_proc_data[post_proc_data['day']==max_day]
        
        x_train=re_train.drop(['fraud_bool','day'],axis=1)
        y_train=re_train['fraud_bool']
        
        
        oversample = RandomUnderSampler(sampling_strategy=sampeling_ratio)
        x_sampled_curr, y_sampled_curr = oversample.fit_resample(x_train,y_train)
        
        # samepling for other days that is not new days data
        add_data = post_proc_data[post_proc_data['day']!=max_day]
        x_train=add_data.drop(['fraud_bool','day'],axis=1)
        y_train=add_data['fraud_bool']
        
        fraud_curr_ratio = y_train.sum()/len(y_train)
        
        # use lower sampeling ratio for these days to give more wieght to newer data
        if (fraud_curr_ratio/(1-fraud_curr_ratio))>(sampeling_ratio/3):
            x_sampled_past, y_sampled_past = x_train, y_train
        else:
            oversample = RandomUnderSampler(sampling_strategy=sampeling_ratio/3)
            x_sampled_past, y_sampled_past = oversample.fit_resample(x_train,y_train)
        
        
        x_sampled =pd.concat([x_sampled_curr,x_sampled_past],ignore_index=True)
        y_sampled = pd.concat([y_sampled_curr,y_sampled_past],ignore_index=True)
        
        
        
        base_model.fit(x_sampled, y_sampled)
        return(base_model,x_train,y_train,min_max_scaler)
        

### Create Concept Drift Detection Function

In [18]:
# functions below used to detect concept drift then retrain using strategies detection and retrain methods

# function compares current days metric to the training metric and retrains if it is lower by a certain threshold
def cd_detection_metric_comparison(current_metric, training_metric, threshold,model_dictionary,data,day,sim,concept_drift,Cost_FN, Cost_FP,sampeling_ratio=.5):
    CD_tracked = ['Day','Simulation','CD_Detected','CD_Occured']
    
    # for each day check if the new days metric is below the training training metric minus the threshold
    if ((training_metric - current_metric) > threshold) and (day > 2):
        print('        Drift Detected using: Metric Comparision')
        print('        Current_Metric: ',current_metric)
        print('        Train Metric: ',training_metric)
        
        metric_df = pd.DataFrame([[day,sim,1,concept_drift]], columns=CD_tracked)
        new_df = pd.concat([model_dictionary[9], metric_df],ignore_index=True)
        
        # if concept drift detected retrain the model and update the model object with neccessary values
        new_model, x, y,min_max_scaler = re_training(data,sampeling_ratio,model_dictionary[1],model_dictionary[0],model_dictionary[10])
        
        
        # Compute Training metric with new model
        preds = new_model.predict(x)
        pred_prob = new_model.predict_proba(x)
        res = optimize_cost(Cost_FN, Cost_FP, pred_prob, y)
        #new_train_metric = f1_score(y, preds)
        new_train_metric = res[2]
        new_opt_prob = res[0]
        
        # udpate the neccessary model values 
        model_dictionary[9] = new_df
        model_dictionary[0] = new_model
        model_dictionary[3] = new_train_metric
        model_dictionary[7] = new_opt_prob
        model_dictionary[11] = min_max_scaler
        model_dictionary[5] = pred_prob[:,1]
        return (model_dictionary)
    else:
        # if no concept drift detected modify concept drift data tracking metrics and return unchanged model object
        metric_df = pd.DataFrame([[day,sim,0,concept_drift]], columns=CD_tracked)
        new_df = pd.concat([model_dictionary[9], metric_df],ignore_index=True)
        model_dictionary[9] = new_df
        return(model_dictionary)

# function compares the fraud distribution of new day to previous days to detect concept drift   
def cd_detection_Feature_distribution(all_data, threshold_criteria,model_dictionary,day,sim,concept_drift,Cost_FN, Cost_FP,sampeling_ratio=.5):
    CD_tracked = ['Day','Simulation','CD_Detected','CD_Occured']
    features_used = all_data.drop(['day','fraud_bool'],axis=1,errors='ignore').columns
    
    curr_day = all_data['day'].max()
    # divide data into current days fraud distribution Vs. previous days fraud distribution
    base_data = all_data[(all_data['day']<curr_day) & (all_data['day']>=model_dictionary[12]) & (all_data['fraud_bool']==1)]
    new_data  = all_data[(all_data['day']==curr_day) & (all_data['fraud_bool']==1)]
    Features_detected = 0
    # first 2 days after first day are used to store probabity distribution distances for later comparision
    if day < 3 and day!=0:
        for col in features_used:
            # if col is numeric but less than 15 unique values treat it as a categorical feature
            if is_numeric_dtype(all_data[col]) and len(all_data[col].unique())<15:
                
                # concert feature into frequency values for each value of the feature then combine them and fill na with 0
                s1 = base_data.value_counts(col)/len(base_data)
                s2 = new_data.value_counts(col)/len(new_data)
                
                freq_df = pd.concat([s1, s2], axis=1,ignore_index=True)
                freq_df = freq_df.fillna(0)
                expected = freq_df.iloc[:,0]
                observed = freq_df.iloc[:,1]
                
                # calculate the distance between the current days probability distribution and past days then store in 
                # model object dataframe 
                prob_dist = distance.jensenshannon(expected, observed, axis=0)
                model_dictionary[13].loc[day,col] = prob_dist
            elif is_numeric_dtype(all_data[col])==False:
                
                s1 = base_data.value_counts(col)/len(base_data)
                s2 = new_data.value_counts(col)/len(new_data)
                freq_df = pd.concat([s1, s2], axis=1,ignore_index=True)
                freq_df = freq_df.fillna(0)
                expected = freq_df.iloc[:,0]
                observed = freq_df.iloc[:,1]

                prob_dist = distance.jensenshannon(expected, observed, axis=0)
                model_dictionary[13].loc[day,col] = prob_dist
    # third day is when we allow retrain to begin        
    if day > 2:
        feat_with_drift = []
        for col in features_used:
            
            if is_numeric_dtype(all_data[col]):
                
                # For Categorical need to check that all values are present in base data are in new_data
                if len(all_data[col].unique())<15:
                    s1 = base_data.value_counts(col)/len(base_data)
                    s2 = new_data.value_counts(col)/len(new_data)
                    
                    # convert categorical feature into frequency distributions
                    freq_df = pd.concat([s1, s2], axis=1,ignore_index=True)
                    freq_df = freq_df.fillna(0)
                    expected = freq_df.iloc[:,0]
                    observed = freq_df.iloc[:,1]

                    prob_dist = distance.jensenshannon(expected, observed, axis=0)
                    model_dictionary[13].loc[day,col] = prob_dist
                    
                    pct_dif = abs(freq_df.iloc[:,0].sum() - freq_df.iloc[:,1].sum())
                    
                    # if observed or expected contains 0 or the sums of observed and expected are diffrent 
                    # use distance measurment to detect drift 
                    if freq_df.isin([0]).any().any() or pct_dif > .00005:
                        
                        avg = model_dictionary[13][model_dictionary[13]['CD_Detected']==0][col].mean()
                        std = model_dictionary[13][model_dictionary[13]['CD_Detected']==0][col].std()
                        
                        # if new days distance metric is greater than the average * 1.5 stds we consider it drifted 
                        if prob_dist > (avg + std*1.5):
                            feat_with_drift.append(col)
                            Features_detected+=1
                    # else we use goodness of fit test to see if feature has dirfted 
                    else:
                        p_val = stats.chisquare(f_obs=observed, f_exp=expected).pvalue
                        
                        if p_val< .05:
                            feat_with_drift.append(col)
                            Features_detected+=1
                else:
                    
                    p_val = stats.ks_2samp(base_data[col], new_data[col]).pvalue
                    
                    if p_val< .05:
                        feat_with_drift.append(col)
                        Features_detected+=1
            # if feautre is not numeric use distance or chi squared test goodness of fit for feature drift detection
            else:
                s1 = base_data.value_counts(col)/len(base_data)
                s2 = new_data.value_counts(col)/len(new_data)
                freq_df = pd.concat([s1, s2], axis=1,ignore_index=True)
                freq_df = freq_df.fillna(0)
                expected = freq_df.iloc[:,0]
                observed = freq_df.iloc[:,1]
                
                pct_dif = abs(freq_df.iloc[:,0].sum() - freq_df.iloc[:,1].sum())
                
                prob_dist = distance.jensenshannon(expected, observed, axis=0)
                model_dictionary[13].loc[day,col] = prob_dist
                
                if freq_df.isin([0]).any().any() or pct_dif > .00005:
                    avg = model_dictionary[13][model_dictionary[13]['CD_Detected']==0][col].mean()
                    std = model_dictionary[13][model_dictionary[13]['CD_Detected']==0][col].std()
                    
                    if prob_dist > (avg + std*1.5):
                        feat_with_drift.append(col)
                        Features_detected+=1       
                else:
                    p_val = stats.chisquare(f_obs=observed, f_exp=expected).pvalue
                    if p_val< .05:
                        feat_with_drift.append(col)
                        Features_detected+=1
    
    # if there are more than the theshold of features detected to have drift preform retraining
    if (Features_detected> threshold_criteria) and (day > 2):
        print('        Drift Detected using: Feature Comparision')
        #print('        Features that drifted', feat_with_drift)
        metric_df = pd.DataFrame([[day,sim,1,concept_drift]], columns=CD_tracked)
        new_df = pd.concat([model_dictionary[9], metric_df],ignore_index=True)
        
        # retrain and return model and sclaer object used to retrain
        new_model,x,y,min_max_scaler = re_training(all_data,sampeling_ratio,model_dictionary[1],model_dictionary[0],model_dictionary[10])
        
        # obtain new model object values and assign them to the model object
        preds = new_model.predict(x)
        pred_prob = new_model.predict_proba(x)
        res = optimize_cost(Cost_FN, Cost_FP, pred_prob, y)
        #new_train_metric = f1_score(y, preds)
        new_train_metric = res[2]
        new_opt_prob = res[0]
        
        model_dictionary[9] = new_df
        model_dictionary[0] = new_model
        model_dictionary[3] = new_train_metric
        model_dictionary[5] = pred_prob[:,1]
        model_dictionary[7] = new_opt_prob
        model_dictionary[11] = min_max_scaler
        model_dictionary[12] = day
        model_dictionary[13].loc[day,'CD_Detected'] = 1
        return(model_dictionary)
    else:
        metric_df = pd.DataFrame([[day,sim,0,concept_drift]], columns=CD_tracked)
        new_df = pd.concat([model_dictionary[9], metric_df],ignore_index=True)
        model_dictionary[9] = new_df
        model_dictionary[13].loc[day,'CD_Detected'] = 0
        return(model_dictionary)

# function calculates the Population Stability index and retrains if it is higher than a certain theshold
def cd_PSI(training_prob_dist, new_prob_dist, PSI_Threshold,model_dictionary,data,day,sim,concept_drift,Cost_FN, Cost_FP,sampeling_ratio=.5):
    CD_tracked = ['Day','Simulation','CD_Detected','CD_Occured']
    
    prob_df = pd.DataFrame()
    prob_df_new = pd.DataFrame()
    # split the training fraud propbability and new days fraud probability distributions 
    prob_df['Probabilites_old'] = training_prob_dist
    prob_df_new['Probabilites_new'] = new_prob_dist
    
    
    
    
    # convert the data into 5 bins based on the training data and get relative frequencies for each of those bins for
    # training and new days data
    bins = pd.qcut(prob_df['Probabilites_old'], 5,retbins=True, duplicates='drop')[1]
    s1 = pd.cut(prob_df['Probabilites_old'], bins).value_counts() / len(prob_df['Probabilites_old'])
    s2 = pd.cut(prob_df_new['Probabilites_new'], bins).value_counts() / len(prob_df_new['Probabilites_new'])
    PSI_df = pd.concat([s1, s2], axis=1)
    #PSI_df=PSI_df.replace(0,.0001)
    if len(prob_df_new['Probabilites_new'].unique())<6:
        print('-----------Model Issue Unique Probabilites fall into less than 3 buckets-------------')
        print('new probas')
        print(prob_df_new['Probabilites_new'].unique())
        print('new old')
        print(prob_df_new['Probabilites_old'].unique())
    
    # Caluclate the PSI metric
    PSI_calc = ((PSI_df['Probabilites_new']- PSI_df['Probabilites_old']) * np.log(PSI_df['Probabilites_new']/PSI_df['Probabilites_old'])).sum()
    
    # retrain if the calculated metric is large than the threshold 
    if (PSI_calc> PSI_Threshold) and (day > 2):
        print('        Drift Detected using: Probability Comparision')
        print('        ',PSI_calc)
        new_model,x,y,min_max_scaler = re_training(data,sampeling_ratio,model_dictionary[1],model_dictionary[0],model_dictionary[10])
        
        metric_df = pd.DataFrame([[day,sim,1,concept_drift]], columns=CD_tracked)
        new_df = pd.concat([model_dictionary[9], metric_df],ignore_index=True)
        

        preds = new_model.predict(x)
        pred_prob = new_model.predict_proba(x)
        res = optimize_cost(Cost_FN, Cost_FP, pred_prob, y)
        #new_train_metric = f1_score(y, preds)
        new_train_metric = res[2]
        new_opt_prob = res[0]
        
        model_dictionary[9] = new_df
        model_dictionary[0] = new_model
        model_dictionary[3] = new_train_metric
        model_dictionary[5] = pred_prob[:,1]
        model_dictionary[7] = new_opt_prob
        model_dictionary[11] = min_max_scaler
        return(model_dictionary)
    else:
        metric_df = pd.DataFrame([[day,sim,0,concept_drift]], columns=CD_tracked)
        new_df = pd.concat([model_dictionary[9], metric_df],ignore_index=True)
        model_dictionary[9] = new_df
        return(model_dictionary)

In [19]:
# Function that predicts fraud and finds optimal probability to predict fraud given the assoicated FN and FP costs
def model_metric_predict(model_object, x_test, y_test, day, simulation,Cost_FN,Cost_FP):
    metrics = ['Day','Simulation','Recall','F1 Score','Cost-Savings','Accuracy','Precision','Num_FPs','Num_TPs']
    probabilities = model_object[0].predict_proba(x_test)
    predictions =  probabilities[:,1].copy()
    
    
    prob = model_object[7]

    predictions[predictions>=prob] = 1
    predictions[predictions<prob] = 0
    cm = confusion_matrix(y_test, predictions)

    
    Recall = recall_score(y_test, predictions)
    F1_Score = f1_score(y_test, predictions)
    Cost_Savings = cm[1,1]*Cost_FN - cm[0,1]*Cost_FP
    Accuracy = accuracy_score(y_test, predictions)
    Precision = precision_score(y_test, predictions)
    Num_FPs = cm[0,1]
    Num_TPs = cm[1,1]
    metric_df = pd.DataFrame([[day,simulation, Recall,F1_Score,Cost_Savings,Accuracy,Precision,Num_FPs,Num_TPs]], columns=Metrics_tracked)
    new_df = pd.concat([model_object[8], metric_df],ignore_index=True)
    probabilities = model_object[0].predict_proba(x_test)
    predictions =  probabilities[:,1].copy()
    return(new_df,F1_Score,predictions)
    

In [20]:
# Function resets each model object for each strategy at the begining of each simulation
def reset_model_dictionary(model_dictionary,Base_model_dict):
    for key, model_dict in model_dictionary.items():
        model_dictionary[key][0] = deepcopy(Base_model_dict[key][0])
        model_dictionary[key][3] = deepcopy(Base_model_dict[key][3])
        model_dictionary[key][4] = deepcopy(Base_model_dict[key][4])
        model_dictionary[key][5] = deepcopy(Base_model_dict[key][5])
        model_dictionary[key][7] = deepcopy(Base_model_dict[key][7])
        model_dictionary[key][11] = deepcopy(Base_model_dict[key][11])
        model_dictionary[key][12] = 0
        model_dictionary[key][13] = deepcopy(Base_model_dict[key][13])
        
    return(model_dictionary)

# Monte Claro Simulation

In [41]:
# Main simulation function that runs the simulation for X simulations and X days
def Simulate_Fraud (req_cols, base_data, model_dictionary, Iterations, Days,drift_days,time_between_drift, 
                    drift_magnitude,Cost_FN, Cost_FP,Base_model_dict,df_min_max,df_add, Fraud_PCT=.01):
    # iterate through each simulation
    for sim in range(Iterations):
        print('Starting New Simulation {} of {}'.format(sim,Iterations))
        time_sense_drift = time_between_drift
        concept_drift = 0
        df_add['use'] = 0
        # reset the model objects after each simulation
        if sim > 0:
            model_dictionary = reset_model_dictionary(model_dictionary,Base_model_dict)
        
        # iterate over each days simulating new data and collecting each strategies metrics
        for day in range(Days):
            print('Starting New Day {} of {}'.format(day,Days))
            ### Creating New Data
            if day == 0:
                simulated_data = synth_data_lib_v2_extra_drift.create_new_dataset (base_data,20000,Fraud_PCT,'fraud_bool',df_min_max,df_add)
                simulated_data['day']=day
                new_sim_data = simulated_data.copy()
            else:
                new_base_data = simulated_data[simulated_data['day']==day-1]
                new_base_data.drop('day',axis=1,inplace=True)
            if ( day < 3 and day > 0):
                new_sim_data = synth_data_lib_v2_extra_drift.create_new_dataset (new_base_data,20000,Fraud_PCT,'fraud_bool',df_min_max,df_add)
                new_sim_data['day']=day
            if day >= 3 :
                if (day in drift_days and time_sense_drift<1):
                    print('Inserting drift')
                    concept_drift = 1
                    time_sense_drift = time_between_drift
                    new_sim_data = synth_data_lib_v2_extra_drift.create_new_dataset (new_base_data,20000,Fraud_PCT,'fraud_bool',df_min_max,df_add,True,drift_magnitude)
                    new_sim_data['day']=day
                else:
                    new_sim_data = synth_data_lib_v2_extra_drift.create_new_dataset (new_base_data,20000,Fraud_PCT,'fraud_bool',df_min_max,df_add)
                    new_sim_data['day']=day
            
            ### Store new Data in Simulation total data dataframe
            if day != 0:
                simulated_data = pd.concat([simulated_data,new_sim_data],ignore_index=True)

            print('tot Records: ', len(new_sim_data))
            print('frd records: ', len(new_sim_data[new_sim_data['fraud_bool']==1]))
            # iterate through each model strategy gather metrics, detect concept drift and retrain if neccessary 
            for key, model_dict in model_dictionary.items():
                
                
                
                ### Pre Process New data
                x_test, y_test, min_max_throwaawy = pre_processing(new_sim_data, req_cols, model_dict_obj=model_dict)
                
                ## predict on new data per model
                
                model_results,test_metric,new_prob_dist = model_metric_predict(model_dict, x_test, y_test, day, sim,Cost_FN,Cost_FP)
                
                ## Store results in res dataframe
                model_dictionary[key][8]= model_results
                
                ## detect on new data for every detection strat & Retrain if CD Detected
                if model_dict[2]=='Preformance':
                    new_mod_dict = cd_detection_metric_comparison(test_metric, model_dict[3], .15 ,model_dict,simulated_data,day,sim,concept_drift,Cost_FN, Cost_FP)
                    model_dictionary[key] = new_mod_dict
                
                if model_dict[2]=='Feature_dist':
                    new_mod_dict = cd_detection_Feature_distribution(simulated_data, 12,model_dict,day,sim,concept_drift,Cost_FN, Cost_FP)
                    model_dictionary[key] = new_mod_dict
                    
                if model_dict[2]=='PSI':
                    new_mod_dict = cd_PSI(model_dict[5], new_prob_dist, .1,model_dict,simulated_data,day,sim,concept_drift,Cost_FN, Cost_FP)
                    model_dictionary[key] = new_mod_dict
                    

            concept_drift = 0
            time_sense_drift -= 1

In [42]:
# Intalize dataframe to hold a list of results per iteration for each strategy 
Metrics_tracked = ['Day','Simulation','Recall','F1 Score','Cost-Savings','Accuracy','Precision','Num_FPs','Num_TPs']
metric_results = pd.DataFrame(columns = Metrics_tracked)

# Intalize dataframe to hold a list of CD metrics to track 
CD_tracked = ['Day','Simulation','CD_Detected','CD_Occured']
CD_results = pd.DataFrame(columns = CD_tracked)

# define simulation metrics
Number_of_simulations = 20
Number_of_days = 20
drift_days = [4,9,14]
time_between_drift= 3
drift_magnitude = 100
# Define Optimization metrics for custom loss function
cost_fp = 10
cost_fn = median_fraud
req_feats = list(x_train.columns)

# Create Probability Distance dataframe object
inds = list(range(Number_of_days))
cols = req_feats.copy()
cols.append('CD_Detected')
prob_dist_df = pd.DataFrame(columns=req_feats,index = inds)

# Prepare Base data to be simulated 
base_data = x_train.copy()
req_cols_base = list(x_train.columns)
base_data['fraud_bool']=y_train
# Intalize dictionary that will hold list of neccessary vars for model instances for each strategy that is updated 
# within the simulation 


Detection_Strategies = ['Preformance','Feature_dist','PSI']
Retraining_strategies = [1,2,3]

# Model object data dictionary
# model_dictionary['ModelName_DetectionStrategy_RetrainStategy'] = [#0 model_instace,
                                                                    #1 re_train strat
                                                                    #2 detection Srat
                                                                    #3 training_metric,
                                                                    #4 base_data,
                                                                    #5 training_prob_dist
                                                                    #6 model optimization parm dict
                                                                    #7 fraud predict proba
                                                                    #8 Model Results datframe
                                                                    #9 CD Detection Result
                                                                    #10 required_columns
                                                                    #11 Normalization object used in training
                                                                    #12 last_day_cd_detected
                                                                    #13 Probability Distance]

# Intalize the 9 strategy combinations per base model
# SGD Strategies (creates the 9 strategy combinations for the SGD Model)
model_dictionary = {}
for d_strat in Detection_Strategies:
    for r_strat in Retraining_strategies:
        key = 'SGD_{}_{}'.format(d_strat,r_strat)
        SGD_Preformance = metric_results.copy()
        SGD_CD_pref = CD_results.copy()
        SGD_prob_dist = prob_dist_df.copy()
        new_mod = deepcopy(opt_SGD)
        new_prob_opt_SGD = deepcopy(SGD_Savings_proba)
        copy_min_max_scaler_base = deepcopy(min_max_scaler_base)
        model_dictionary[key] = [new_mod, #0 model_instace,
                                r_strat, #1 re_train strat
                                d_strat, #2 detection strat
                                SGD_Savings_F1,  #3 training_metric,
                                None,  #4 base_data,
                                probabilities_SGD,  #5 training_prob_dist
                                SGD_parameters,  #6 model optimization parm dict
                                new_prob_opt_SGD,  #7 fraud predict proba
                                SGD_Preformance, #8 Model Results datframe
                                SGD_CD_pref, #9 CD Detection Result
                                req_cols_base, #10 required_columns
                                copy_min_max_scaler_base,#11 Normalization object  
                                0,#12 last_day_cd_detected
                                SGD_prob_dist] #13 Probability Distance

# GBC Strategies (creates the 9 strategy combinations for the SGD Model)
for d_strat in Detection_Strategies:
    for r_strat in Retraining_strategies:
        key = 'GBC_{}_{}'.format(d_strat,r_strat)
        GBC_Preformance = metric_results.copy()
        GBC_CD_pref = CD_results.copy()
        GBC_prob_dist = prob_dist_df.copy()
        new_mod = deepcopy(opt_GBC)
        new_prob_opt_GBC = deepcopy(GBC_Savings_proba)
        copy_min_max_scaler_base = deepcopy(min_max_scaler_base)
        model_dictionary[key] = [new_mod,
                                r_strat,
                                d_strat,
                                GBC_Savings_F1,
                                None,
                                probabilities_GBC,
                                GBC_parameters,
                                new_prob_opt_GBC,
                                GBC_Preformance,
                                GBC_CD_pref,
                                req_cols_base,
                                copy_min_max_scaler_base,
                                0,
                                GBC_prob_dist]

# Add in base models that will never re-train

new_mod_SGD = deepcopy(opt_SGD)
SGDcopy_min_max_scaler_base = deepcopy(min_max_scaler_base)
model_dictionary['SGD_BASE_BASE'] = [new_mod_SGD, #0 model_instace,
                                    None, #1 detection Srat, 
                                    None, #2 re_train strat
                                    None,  #3 training_metric,
                                    None,  #4 base_data,
                                    None,  #5 training_prob_dist
                                    None,  #6 model optimization parm dict
                                    SGD_Savings_proba,  #7 fraud predict proba
                                    SGD_Preformance, #8 Model Results datframe
                                    None, #9 CD Detection Result
                                    req_cols_base,#10 required_columns
                                    SGDcopy_min_max_scaler_base,#11 Normalization object 
                                    None,
                                    None] #12 last day cd detected

copy_min_max_scaler_base = deepcopy(min_max_scaler_base)
new_mod_GBC = deepcopy(opt_GBC)
model_dictionary['GBC_BASE_BASE'] = [new_mod_GBC,
                                    None,
                                    None,
                                    None,
                                    None,
                                    None,
                                    None,
                                    GBC_Savings_proba,
                                    GBC_Preformance,
                                    None,
                                    req_cols_base,
                                    copy_min_max_scaler_base,
                                    None,
                                    None]

# Store the Numeric Min Max Values of original Data set
org_cols = list(df_use.columns)
df_min_max = pd.DataFrame(columns = org_cols, index= ['Min','Max'])

for col in org_cols:
    if df_use[col].dtype in ['int64', 'float64']:
        df_min_max.loc['Min',col]=df_use[col].min()
        df_min_max.loc['Max',col]=df_use[col].max()





In [43]:
# get copy of orginal model object
base_model_dic = deepcopy(model_dictionary)

In [44]:
# run simulation
Simulate_Fraud (req_feats, df_use, model_dictionary, Number_of_simulations, Number_of_days,drift_days, 
                    time_between_drift, drift_magnitude,cost_fn,cost_fp,base_model_dic,df_min_max,df_add,Fraud_PCT = .01)

Starting New Simulation 0 of 20
Starting New Day 0 of 20
tot Records:  20000
frd records:  200
Starting New Day 1 of 20
tot Records:  20000
frd records:  200
Starting New Day 2 of 20
tot Records:  20000
frd records:  200
Starting New Day 3 of 20
tot Records:  20000
frd records:  200
Starting New Day 4 of 20
Inserting drift
tot Records:  20000
frd records:  200
        Drift Detected using: Metric Comparision
        Current_Metric:  0.0
        Train Metric:  0.22130013831258644
        Drift Detected using: Metric Comparision
        Current_Metric:  0.0
        Train Metric:  0.22130013831258644
        Drift Detected using: Metric Comparision
        Current_Metric:  0.0
        Train Metric:  0.22130013831258644
        Drift Detected using: Feature Comparision
        Drift Detected using: Feature Comparision
        Drift Detected using: Feature Comparision
        Drift Detected using: Metric Comparision
        Current_Metric:  0.009302325581395349
        Train Metric:  0.2387

KeyboardInterrupt: 

In [None]:
# Save results for metrics and Concept Drift Metrics to csv fiels
for key, model_dict in model_dictionary.items():
    res_name = key+'_metrics.csv'
    cd_name =  key+'_Concept_Drift.csv'
    model_dict[8].to_csv(res_name)
    try:
        model_dict[9].to_csv(cd_name)
    except:
        print(key)

In [26]:
Detection_Strategies = ['Preformance','Feature_dist','PSI']
Retraining_strategies = ['1','2','3']
Model_Types = ['SGD','GBC']

start = 0

# for each csv file combine the metrics and concept drift of each strategy into one dataframe with all values
for d_strat in Detection_Strategies:
    for r_strat in Retraining_strategies:
        for mod_name in Model_Types:
            f_name_metrics = mod_name+'_'+d_strat+'_'+r_strat+'_metrics.csv'
            f_name_CD = mod_name+'_'+d_strat+'_'+r_strat+'_Concept_Drift.csv'
            Strategy = mod_name+'_'+d_strat+'_'+r_strat
            
            if start == 0:       
                ALL_df_metric = pd.read_csv(f_name_metrics)
                ALL_df_metric['Strategy']=Strategy
                ALL_df_metric['Model'] = mod_name
                ALL_df_metric['Retrain_strategy'] = r_strat
                ALL_df_metric['Detection_Strategy'] = d_strat
                
                ALL_df_cd = pd.read_csv(f_name_CD)
                ALL_df_cd['Strategy']=Strategy
                ALL_df_cd['Model'] = mod_name
                ALL_df_cd['Retrain_strategy'] = r_strat
                ALL_df_cd['Detection_Strategy'] = d_strat
                start = 1
            else:
                df_metric = pd.read_csv(f_name_metrics)
                df_metric['Strategy']=Strategy
                df_metric['Model'] = mod_name
                df_metric['Retrain_strategy'] = r_strat
                df_metric['Detection_Strategy'] = d_strat
                ALL_df_metric = pd.concat([ALL_df_metric, df_metric])
                
                df_cd = pd.read_csv(f_name_CD)
                df_cd['Strategy']=Strategy
                df_cd['Model'] = mod_name
                df_cd['Retrain_strategy'] = r_strat
                df_cd['Detection_Strategy'] = d_strat
                ALL_df_cd = pd.concat([ALL_df_cd, df_cd])

# store combined dataframe into one dataframe                
All_Values_df = ALL_df_metric.merge(ALL_df_cd,on= ['Day','Simulation','Strategy','Model','Retrain_strategy','Detection_Strategy'])

In [27]:
# save dataframe to file
All_Values_df.to_csv('alll_vals-10-31-2023-extra-drift.csv')

In [28]:
All_Values_df

Unnamed: 0,Unnamed: 0_x,Day,Simulation,Recall,F1 Score,Cost-Savings,Accuracy,Precision,Num_FPs,Num_TPs,Strategy,Model,Retrain_strategy,Detection_Strategy,Unnamed: 0_y,CD_Detected,CD_Occured
0,0,0,0,0.125,0.178571,317.566172,0.98850,0.312500,55,25,SGD_Preformance_1,SGD,1,Preformance,0,0,0
1,1,1,0,0.145,0.222222,686.376760,0.98985,0.475410,32,29,SGD_Preformance_1,SGD,1,Preformance,1,0,0
2,2,2,0,0.120,0.173913,312.863525,0.98860,0.315789,52,24,SGD_Preformance_1,SGD,1,Preformance,2,0,0
3,3,3,0,0.110,0.169884,393.458232,0.98925,0.372881,37,22,SGD_Preformance_1,SGD,1,Preformance,3,0,0
4,4,4,0,0.000,0.000000,-500.000000,0.98750,0.000000,50,0,SGD_Preformance_1,SGD,1,Preformance,4,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7195,395,15,19,0.005,0.007843,-505.297353,0.98735,0.018182,54,1,GBC_PSI_3,GBC,3,PSI,395,0,0
7196,396,16,19,0.005,0.007968,-465.297353,0.98755,0.019608,50,1,GBC_PSI_3,GBC,3,PSI,396,0,0
7197,397,17,19,0.000,0.000000,-380.000000,0.98810,0.000000,38,0,GBC_PSI_3,GBC,3,PSI,397,0,0
7198,398,18,19,0.000,0.000000,-410.000000,0.98795,0.000000,41,0,GBC_PSI_3,GBC,3,PSI,398,0,0
