In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/banksim1/bs140513_032310.csv
/kaggle/input/banksim1/bsNET140513_032310.csv


In [2]:
# Necessary imports

## Data loading, processing and for more
import pandas as pd
import numpy as np

## Visualization
import matplotlib.pyplot as plt

## Metrics
from sklearn.model_selection import train_test_split


In [3]:
data = pd.read_csv("../input/banksim1/bs140513_032310.csv")
data.head(5)

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
0,0,'C1093826151','4','M','28007','M348934600','28007','es_transportation',4.55,0
1,0,'C352968107','2','M','28007','M348934600','28007','es_transportation',39.68,0
2,0,'C2054744914','4','F','28007','M1823072687','28007','es_transportation',26.89,0
3,0,'C1760612790','3','M','28007','M348934600','28007','es_transportation',17.25,0
4,0,'C757503768','5','M','28007','M348934600','28007','es_transportation',35.72,0


In [4]:
print("Unique zipCodeOri values: ",data.zipcodeOri.nunique())
print("Unique zipMerchant values: ",data.zipMerchant.nunique())
# dropping zipcodeori and zipMerchant since they have only one unique value
data_reduced = data.drop(['zipcodeOri','zipMerchant'],axis=1)

Unique zipCodeOri values:  1
Unique zipMerchant values:  1


In [5]:
data_reduced.columns

Index(['step', 'customer', 'age', 'gender', 'merchant', 'category', 'amount',
       'fraud'],
      dtype='object')

In [6]:
# turning object columns type to categorical for easing the transformation process
col_categorical = data_reduced.select_dtypes(include= ['object']).columns
for col in col_categorical:
    data_reduced[col] = data_reduced[col].astype('category')
# categorical values ==> numeric values
data_reduced[col_categorical] = data_reduced[col_categorical].apply(lambda x: x.cat.codes)
data_reduced.head(5)

Unnamed: 0,step,customer,age,gender,merchant,category,amount,fraud
0,0,210,4,2,30,12,4.55,0
1,0,2753,2,2,30,12,39.68,0
2,0,2285,4,1,18,12,26.89,0
3,0,1650,3,2,30,12,17.25,0
4,0,3585,5,2,30,12,35.72,0


In [7]:
X = data_reduced.drop(['fraud'],axis=1)
y = data['fraud']
print(X.head(),"\n")
print(y.head())

   step  customer  age  gender  merchant  category  amount
0     0       210    4       2        30        12    4.55
1     0      2753    2       2        30        12   39.68
2     0      2285    4       1        18        12   26.89
3     0      1650    3       2        30        12   17.25
4     0      3585    5       2        30        12   35.72 

0    0
1    0
2    0
3    0
4    0
Name: fraud, dtype: int64


In [8]:
#Number of Fraud cases 
y[y==1].count()

7200

In [9]:
#Number of non fraud cases
y[y==0].count()

587443

This shows there are 7200 postive fraudulent transactions

In [10]:
data_fraud_0 = data_reduced[data_reduced['fraud'] == 0]
data_fraud_1 = data_reduced[data_reduced['fraud'] == 1]

# Check the size of each subset
print("Size of data_fraud_0:", data_fraud_0.shape)
print("Size of data_fraud_1:", data_fraud_1.shape)
# Display first few rows of each subset to verify
print("First few rows of data_fraud_0:\n", data_fraud_0.head())
print("First few rows of data_fraud_1:\n", data_fraud_1.head())

Size of data_fraud_0: (587443, 8)
Size of data_fraud_1: (7200, 8)
First few rows of data_fraud_0:
    step  customer  age  gender  merchant  category  amount  fraud
0     0       210    4       2        30        12    4.55      0
1     0      2753    2       2        30        12   39.68      0
2     0      2285    4       1        18        12   26.89      0
3     0      1650    3       2        30        12   17.25      0
4     0      3585    5       2        30        12   35.72      0
First few rows of data_fraud_1:
      step  customer  age  gender  merchant  category  amount  fraud
88      0      3219    3       2        34         4   44.26      1
89      0       723    3       2        34         4  324.50      1
434     0       362    3       2        44         6  176.32      1
435     0      4040    3       2        44         6  337.41      1
553     0       969    4       1         2        14  220.11      1


In [11]:
# Assuming your dataset is already loaded into a DataFrame called data_reduced
# data_reduced = pd.read_csv('path_to_your_csv_file.csv')

# Separate the data based on the fraud column
data_fraud_0 = data_reduced[data_reduced['fraud'] == 0]
data_fraud_1 = data_reduced[data_reduced['fraud'] == 1]

# Split the fraud=0 data into train (60%), temp (40%)
train_fraud_0, temp_fraud_0 = train_test_split(data_fraud_0, test_size=0.4, random_state=42)

# Split the temp data into cross-validation (50% of temp, i.e., 20% of original fraud=0) and test (50% of temp, i.e., 20% of original fraud=0)
cv_fraud_0, test_fraud_0 = train_test_split(temp_fraud_0, test_size=0.5, random_state=42)

# Split the fraud=1 data into cross-validation (50%) and test (50%)
cv_fraud_1, test_fraud_1 = train_test_split(data_fraud_1, test_size=0.5, random_state=42)

# Combine splits to create final train, cross-validation, and test sets
train_set = train_fraud_0
cv_set = pd.concat([cv_fraud_0, cv_fraud_1])
test_set = pd.concat([test_fraud_0, test_fraud_1])

# Check the results
print("Train set size:", train_set.shape)
print("Cross-validation set size:", cv_set.shape)
print("Test set size:", test_set.shape)
print("Train set distribution:\n", train_set['fraud'].value_counts())
print("Cross-validation set distribution:\n", cv_set['fraud'].value_counts())
print("Test set distribution:\n", test_set['fraud'].value_counts())

Train set size: (352465, 8)
Cross-validation set size: (121089, 8)
Test set size: (121089, 8)
Train set distribution:
 fraud
0    352465
Name: count, dtype: int64
Cross-validation set distribution:
 fraud
0    117489
1      3600
Name: count, dtype: int64
Test set distribution:
 fraud
0    117489
1      3600
Name: count, dtype: int64


In [12]:
#removing the not very useful features in my opinion, tweaking with gender and age it's a little hard to tell if these are relevant
X_train = train_set.drop(['fraud', 'step', 'customer', 'merchant', 'gender', 'age'],axis=1)
X_cv = cv_set.drop(['fraud', 'step', 'customer', 'merchant', 'gender', 'age'],axis=1)
Y_cv = cv_set['fraud']
X_test = test_set.drop(['fraud', 'step', 'customer', 'merchant', 'gender', 'age'],axis=1)
Y_test = test_set['fraud']
y = data['fraud']
print(X_train.head(),"\n")
print(X_cv.head())

        category  amount
404873        12   10.86
140213        12   21.10
431098        12   55.22
264017         9   41.64
296327        12   22.28 

        category  amount
542826        12   10.89
198605         0   63.06
169541         2   56.32
476894        12   46.69
482791         4  300.57


In [13]:
#transform the data to make is more normally distributed 
X_train['amount'] = np.log1p(X_train['amount'])

X_cv['amount'] = np.log1p(X_cv['amount'])

X_test['amount'] = np.log1p(X_test['amount'])

print(X_train.head(),"\n")
print(X_cv.head())




        category    amount
404873        12  2.473171
140213        12  3.095578
431098        12  4.029273
264017         9  3.752793
296327        12  3.147595 

        category    amount
542826        12  2.475698
198605         0  4.159820
169541         2  4.048650
476894        12  3.864722
482791         4  5.709002


In [14]:
#Lets put this feature on hold
##X_train['log_age_amount'] = np.log1p(X_train['age'] * X_train['amount'])
##X_cv['log_age_amount'] = np.log1p(X_cv['age'] * X_cv['amount'])
##X_test['log_age_amount'] = np.log1p(X_test['age'] * X_test['amount'])

In [15]:
#One hot encoding the age and category and gender features
#X_train = pd.get_dummies(X_train, columns=['age'], prefix='age')
#X_cv = pd.get_dummies(X_cv, columns=['age'], prefix='age')
#X_test = pd.get_dummies(X_test, columns=['age'], prefix='age')

#X_train = pd.get_dummies(X_train, columns=['gender'], prefix='gender')
#X_cv = pd.get_dummies(X_cv, columns=['gender'], prefix='gender')
#X_test = pd.get_dummies(X_test, columns=['gender'], prefix='gender')

#X_train = pd.get_dummies(X_train, columns=['category'], prefix='category')
#X_cv = pd.get_dummies(X_cv, columns=['category'], prefix='category')
#X_test = pd.get_dummies(X_test, columns=['category'], prefix='category')


In [16]:
## can't seem to do this, not enough memory 
#import pandas as pd
#from itertools import combinations

#one_hot_columns = [col for col in X_train.columns if col.startswith('age_') or col.startswith('gender_') or col.startswith('category_')]

# Dictionary to hold new columns
#new_columns = {}

# Create interaction features
#for combo in combinations(one_hot_columns, 2):
#    feature_name = f"{combo[0]}_x_{combo[1]}"
#    new_columns[feature_name] = X_train[combo[0]] * X_train[combo[1]]

# Convert dictionary to DataFrame
#new_columns_df = pd.DataFrame(new_columns)

# Concatenate new columns to original DataFrame
#X_train = pd.concat([X_train, new_columns_df], axis=1)

# Continue with further feature engineering steps
# Binning log(amount) and log(age * amount)
#X_train['log_amount_binned'] = pd.cut(X_train['amount'], bins=10)
#X_train['log_age_amount_binned'] = pd.cut(X_train['log_age_amount'], bins=10)
#X_train = pd.get_dummies(X_train, columns=['log_amount_binned', 'log_age_amount_binned'])

In [17]:
# Binning log(amount) for the training set
X_train['binned_log_amount'] = pd.cut(X_train['amount'], bins=10)


print(X_train)

        category    amount binned_log_amount
404873        12  2.473171    (2.279, 3.039]
140213        12  3.095578    (3.039, 3.799]
431098        12  4.029273    (3.799, 4.559]
264017         9  3.752793    (3.039, 3.799]
296327        12  3.147595    (3.039, 3.799]
...          ...       ...               ...
111898         7  3.313822    (3.039, 3.799]
262686        12  2.805177    (2.279, 3.039]
370594        12  3.413784    (3.039, 3.799]
133844        12  3.138100    (3.039, 3.799]
123736        12  4.146779    (3.799, 4.559]

[352465 rows x 3 columns]


In [18]:
# Binning log(amount) for the cross validation set 
X_cv['binned_log_amount'] = pd.cut(X_cv['amount'], bins=10)


In [19]:
# Binning log(amount) for the test set 
X_test['binned_log_amount'] = pd.cut(X_test['amount'], bins=10)


In [20]:
# Plot Histogram to see the distribution of data
#plt.hist(X_train['log_age_amount'], bins = 50)

In [21]:
#seeing some patterns in the cv dataset where transactions are a fraud 
#fraud_data = cv_set[cv_set['fraud'] == 1]

#plt.hist(fraud_data['category'], bins=50, alpha=0.7, color='blue')
#plt.title('Histogram of log_age_amount for fraud = 1')
#plt.xlabel('category')
#plt.ylabel('Frequency')
#plt.show()

In [22]:
from sklearn.preprocessing import StandardScaler

# Convert binned intervals to categorical codes
X_train['binned_log_amount_code'] = X_train['binned_log_amount'].cat.codes

# Standardize the numeric codes
scaler = StandardScaler()
X_train[['binned_log_amount_code']] = scaler.fit_transform(X_train[['binned_log_amount_code']])
#X_train = scaler.fit_transform(X_train[['category', 'amount', 'binned_log_amount_code']])
X_train = X_train.drop(columns = 'binned_log_amount')

In [23]:
from sklearn.preprocessing import StandardScaler

# Convert binned intervals to categorical codes
X_cv['binned_log_amount_code'] = X_cv['binned_log_amount'].cat.codes

# Standardize the numeric codes
scaler = StandardScaler()
X_cv[['binned_log_amount_code']] = scaler.fit_transform(X_cv[['binned_log_amount_code']])
#X_cv = scaler.fit_transform(X_cv[['category', 'amount', 'binned_log_amount_code']])

X_cv = X_cv.drop(columns = 'binned_log_amount')
print(X_cv)



        category    amount  binned_log_amount_code
542826        12  2.475698               -0.942246
198605         0  4.159820                0.763041
169541         2  4.048650                0.763041
476894        12  3.864722                0.763041
482791         4  5.709002                2.468327
...          ...       ...                     ...
285236        13  6.180141                2.468327
409345        10  5.707841                2.468327
412734         7  4.500587                1.615684
427654         4  5.497004                2.468327
55957         13  5.931290                2.468327

[121089 rows x 3 columns]


In [24]:

from sklearn.preprocessing import StandardScaler
# Convert binned intervals to categorical codes
X_test['binned_log_amount_code'] = X_test['binned_log_amount'].cat.codes

# Standardize the numeric codes
scaler = StandardScaler()
X_test[['binned_log_amount_code']] = scaler.fit_transform(X_test[['binned_log_amount_code']])
#X_test = scaler.fit_transform(X_test[['category', 'amount', 'binned_log_amount_code']])

X_test = X_test.drop(columns = 'binned_log_amount')

print(X_test)



        category    amount  binned_log_amount_code
509545        12  3.496811               -0.064878
267149        12  3.383373               -0.064878
574347        12  3.752558                0.798234
127296        12  3.733614                0.798234
530626        12  2.149434               -0.927990
...          ...       ...                     ...
465368         4  6.384587                3.387569
584518         6  4.856318                1.661346
570587        10  6.499125                3.387569
288705        14  4.090002                0.798234
205904        10  6.489630                3.387569

[121089 rows x 3 columns]


In [25]:
#def estimate_gaussian(X): 
#    """
#    Calculates mean and variance of all features 
#    in the dataset
    
#    Args:
#        X (ndarray): (m, n) Data matrix
    
#    Returns:
#        mu (ndarray): (n,) Mean of all features
#        var (ndarray): (n,) Variance of all features
#    """

#    m, n = X.shape
    
    ### START CODE HERE ### 
#    mu = np.sum(X, axis = 0)
#    mu = mu / m
#    var = np.sum((X - mu)**2, axis = 0)
#    var = var / m
    
    ### END CODE HERE ### 
        
#    return mu, var

In [26]:
import numpy as np

def select_threshold(y_val, p_val):
    """
    Finds the best threshold to use for selecting outliers 
    based on the results from a validation set (p_val) 
    and the ground truth (y_val).
    
    Args:
        y_val (ndarray): Ground truth on validation set
        p_val (ndarray): Results on validation set (probabilities)
        
    Returns:
        epsilon (float): Threshold chosen 
        best_F1 (float): Best F1 score by choosing epsilon as threshold
    """ 

    best_epsilon = 0
    best_F1 = 0
    
    # Calculate the step size safely
    min_p = min(p_val)
    max_p = max(p_val)
    step_size = (max_p - min_p) / 1000
    
    if step_size == 0:
        # If step size is zero, it means all p_val are identical
        # Use a very small fixed step size to explore potential thresholds
        step_size = 1e-10

    for epsilon in np.arange(min_p, max_p, step_size):
    
        # Predictions based on the current threshold
        prediction = (p_val < epsilon)
        
        # True positives, false positives, false negatives
        tp = np.sum((prediction == 1) & (y_val == 1))
        fp = np.sum((prediction == 1) & (y_val == 0))
        fn = np.sum((prediction == 0) & (y_val == 1))
        
        # Calculate precision and recall safely
        prec = tp / (tp + fp) if (tp + fp) > 0 else 0
        rec = tp / (tp + fn) if (tp + fn) > 0 else 0
        
        # Calculate F1 score safely
        if prec > 0 or rec > 0:  # Only calculate F1 if both prec and rec are non-zero
            F1 = (2 * prec * rec) / (prec + rec)
        else:
            F1 = 0
        
        # Update best F1 and epsilon if the current F1 is better
        if F1 > best_F1:
            best_F1 = F1
            best_epsilon = epsilon
    
    return best_epsilon, best_F1



In [27]:

def multivariate_gaussian_pdf(X, mean_vector, cov_matrix):
    """
    Calculate the multivariate Gaussian probability density function for a dataset X.

    Parameters:
    - X: A 2D array where each row represents a data point.
    - mean_vector: Mean vector of the Gaussian distribution.
    - cov_matrix: Covariance matrix of the Gaussian distribution.

    Returns:
    - probs: A 1D array of probabilities for each data point.
    """
    n = X.shape[1]  # Number of features
    
    # Ensure mean_vector is a 1-dimensional array
    mean_vector = np.asarray(mean_vector).reshape(-1)
    
    # Ensure cov_matrix is a 2-dimensional array
    cov_matrix = np.atleast_2d(cov_matrix)
    
    # Check dimensions
    assert mean_vector.shape[0] == n, f"Mean vector dimensions do not match number of features: {mean_vector.shape[0]} != {n}"
    assert cov_matrix.shape == (n, n), f"Covariance matrix dimensions are incorrect: {cov_matrix.shape} != ({n}, {n})"
    
    # Calculate the exponent
    diff = X - mean_vector
    exponent = -0.5 * np.sum(diff @ np.linalg.inv(cov_matrix) * diff, axis=1)
    
    # Calculate the denominator
    denom = np.sqrt((2 * np.pi) ** n * np.linalg.det(cov_matrix))
    
    # Calculate the probabilities
    probs = np.exp(exponent) / denom
    return probs

In [28]:
#Calculate mean and covariance matrix 
mu_high = np.mean(X_train, axis=0)
cov_matrix = np.cov(X_train, rowvar=False)

In [29]:

# Evaluate the probabilites for the training set
p_high = multivariate_gaussian_pdf(X_train, mu_high, cov_matrix)
print(p_high)


404873    0.10822
140213    0.10822
431098    0.10822
264017    0.10822
296327    0.10822
           ...   
111898    0.10822
262686    0.10822
370594    0.10822
133844    0.10822
123736    0.10822
Length: 352465, dtype: float64


In [30]:
# Evaluate the probabilites for the cross validation set
p_val_high = multivariate_gaussian_pdf(X_cv, mu_high, cov_matrix)
print(p_val_high)



542826    0.10822
198605    0.10822
169541    0.10822
476894    0.10822
482791    0.10822
           ...   
285236    0.10822
409345    0.10822
412734    0.10822
427654    0.10822
55957     0.10822
Length: 121089, dtype: float64


In [31]:
# Find the best threshold
epsilon_high, F1_high = select_threshold(Y_cv, p_val_high)

print('Best epsilon found using cross-validation: %e'% epsilon_high)
print('Best F1 on Cross Validation Set:  %f'% F1_high)
print('# Anomalies found: %d'% sum(p_high < epsilon_high))

Best epsilon found using cross-validation: 0.000000e+00
Best F1 on Cross Validation Set:  0.000000
# Anomalies found: 0


In [32]:
from sklearn.decomposition import PCA

# Apply PCA
pca = PCA(n_components=0.95)  # Keep 95% of variance
X_train_reduced = pca.fit_transform(X_train)

# Recalculate mean and covariance
mu_reduced = np.mean(X_train_reduced, axis=0)
cov_matrix_reduced = np.cov(X_train_reduced, rowvar=False)

In [33]:
X_cv_reduced = pca.transform(X_cv)

In [34]:
print(X_cv_reduced)

[[-1.01590249 -1.04566099]
 [11.17164591  0.05041011]
 [ 9.17501186  0.19023553]
 ...
 [ 4.30348873  1.65523397]
 [ 7.42580238  2.62763375]
 [-1.49068481  3.8814889 ]]


In [35]:
X_test_reduced = pca.transform(X_test)

In [36]:

# Evaluate the probabilites for the training set
p_high = multivariate_gaussian_pdf(X_train_reduced, mu_reduced, cov_matrix_reduced)
print(p_high)

# Evaluate the probabilites for the cross validation set
p_val_high = multivariate_gaussian_pdf(X_cv_reduced, mu_reduced, cov_matrix_reduced)
print(p_val_high)

# Find the best threshold
epsilon_high, F1_high = select_threshold(Y_cv, p_val_high)

print('Best epsilon found using cross-validation: %e'% epsilon_high)
print('Best F1 on Cross Validation Set:  %f'% F1_high)
print('# Anomalies found: %d'% sum(p_high < epsilon_high))

[0.03468463 0.04020796 0.02189076 ... 0.03847331 0.0400322  0.02043988]
[2.94828868e-02 1.50319473e-05 1.98731624e-04 ... 6.01404959e-03
 1.75264861e-04 4.88649372e-04]
Best epsilon found using cross-validation: 1.724611e-04
Best F1 on Cross Validation Set:  0.365362
# Anomalies found: 9819


In [37]:
# Evaluate the probabilites for the test set
p_val_high = multivariate_gaussian_pdf(X_test_reduced, mu_reduced, cov_matrix_reduced)
print(p_val_high)

# Find the best threshold
epsilon_high, F1_high = select_threshold(Y_test, p_val_high)

print('Best epsilon found using cross-validation: %e'% epsilon_high)
print('Best F1 on Cross Validation Set:  %f'% F1_high)
print('# Anomalies found: %d'% sum(p_high < epsilon_high))

[4.01635981e-02 4.05574905e-02 2.94773543e-02 ... 7.90638140e-05
 1.35875336e-02 8.04198090e-05]
Best epsilon found using cross-validation: 1.724608e-04
Best F1 on Cross Validation Set:  0.338778
# Anomalies found: 9819
