In [4]:
import pandas as pd #To hand with data 
import numpy as np #To math 
import math
import random
import seaborn as sns #to visualization
import matplotlib.pyplot as plt # to plot the graphs
import matplotlib.gridspec as gridspec # to do the grid of plots

In [54]:
# External node of the tree
class ExNode:
    def __init__(self, size):
        self.size = size

# Internal node of the tree
class InNode:
    def __init__(self, left, right, splitAtt, splitVal):
        self.left = left
        self.right = right
        self.splitAtt = splitAtt
        self.splitVal = splitVal

# Function to create the isolation forest
def iForest(X, noOfTrees, sampleSize):
    forest = []
    hlim = math.ceil(math.log(sampleSize, 2)) # Height limit
    for i in range(noOfTrees): # For each tree
        X_train = X.sample(sampleSize)
        forest.append(iTree(X_train, 0, hlim))
    return forest

# Function to create the isolation tree
def iTree(X, currHeight, hlim):
    if currHeight >= hlim or len(X) <= 1: # If the height limit is reached or the size of the dataset is 1
        return ExNode(len(X)) # Return an external node
    else:
        Q = X.columns # List of features
        q = random.choice(Q) # Randomly select a feature
        p = random.choice(X[q].unique()) # Randomly select a feature and a value from the feature
        X_l = X[X[q] < p] # Left branch of the tree
        X_r = X[X[q] >= p] # Right branch of the tree
        return InNode(iTree(X_l, currHeight + 1, hlim), iTree(X_r, currHeight + 1, hlim), q, p)

# Function to calculate the path length of an instance x
def pathLength(x, Tree, currHeight):
    if isinstance(Tree, ExNode):
        return currHeight
    a = Tree.splitAtt # Attribute at internal node
    if x[a] < Tree.splitVal: # Follow left branch
        return pathLength(x, Tree.left, currHeight + 1) 
    else:
        return pathLength(x, Tree.right, currHeight + 1)

# Function to calculate the anomaly score of the instances
def isolation_forest(df, no_of_trees, sample_size):
    ifor = iForest(df, no_of_trees, sample_size)  # Forest of trees
    anomaly_scores = []
    for _, row in df.iterrows(): # For each instance in the dataset
        path_lengths = []
        for tree in ifor: # For each tree in the forest
            path_lengths.append(pathLength(row, tree, 0))
        anomaly_score = np.mean(path_lengths) # Average of path lengths
        anomaly_scores.append(anomaly_score) # Save the anomaly score of the instance
    return anomaly_scores

In [5]:
#loading the data
train = pd.read_csv("dataset/fraudTrain.csv.zip")
test = pd.read_csv("dataset/fraudTest.csv.zip")

In [6]:
train['trans_date_trans_time']=pd.to_datetime(train['trans_date_trans_time'])
train['trans_date']=train['trans_date_trans_time'].dt.strftime('%Y-%m-%d')
train['trans_date']=pd.to_datetime(train['trans_date'])
train['dob']=pd.to_datetime(train['dob'])

test['trans_date_trans_time']=pd.to_datetime(test['trans_date_trans_time'])
test['trans_date']=test['trans_date_trans_time'].dt.strftime('%Y-%m-%d')
test['trans_date']=pd.to_datetime(test['trans_date'])
test['dob']=pd.to_datetime(test['dob'])
test.trans_date.head(),test.dob.head(),train.trans_date.head(),train.dob.head()

train.drop("Unnamed: 0",axis=1,inplace=True)
test.drop("Unnamed: 0",axis=1,inplace=True)
train.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,trans_date
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0,2019-01-01
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0,2019-01-01
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0,2019-01-01
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0,2019-01-01
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0,2019-01-01


In [7]:
total = pd.concat([test,train])
total.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1852394 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   trans_date_trans_time  datetime64[ns]
 1   cc_num                 int64         
 2   merchant               object        
 3   category               object        
 4   amt                    float64       
 5   first                  object        
 6   last                   object        
 7   gender                 object        
 8   street                 object        
 9   city                   object        
 10  state                  object        
 11  zip                    int64         
 12  lat                    float64       
 13  long                   float64       
 14  city_pop               int64         
 15  job                    object        
 16  dob                    datetime64[ns]
 17  trans_num              object        
 18  unix_time             

In [9]:
total['trans_month'] = pd.DatetimeIndex(total['trans_date']).month
total['trans_year'] = pd.DatetimeIndex(total['trans_date']).year

In [10]:
total

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,trans_date,trans_month,trans_year
0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,Columbia,...,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0,2020-06-21,6,2020
1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,Altonah,...,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0,2020-06-21,6,2020
2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,Bellmore,...,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.495810,-74.196111,0,2020-06-21,6,2020
3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,Titusville,...,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0,2020-06-21,6,2020
4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,Falmouth,...,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0,2020-06-21,6,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,2020-06-21 12:12:08,30263540414123,fraud_Reichel Inc,entertainment,15.56,Erik,Patterson,M,162 Jessica Row Apt. 072,Hatch,...,Geoscientist,1961-11-24,440b587732da4dc1a6395aba5fb41669,1371816728,36.841266,-111.690765,0,2020-06-21,6,2020
1296671,2020-06-21 12:12:19,6011149206456997,fraud_Abernathy and Sons,food_dining,51.70,Jeffrey,White,M,8617 Holmes Terrace Suite 651,Tuscarora,...,"Production assistant, television",1979-12-11,278000d2e0d2277d1de2f890067dcc0a,1371816739,38.906881,-78.246528,0,2020-06-21,6,2020
1296672,2020-06-21 12:12:32,3514865930894695,fraud_Stiedemann Ltd,food_dining,105.93,Christopher,Castaneda,M,1632 Cohen Drive Suite 639,High Rolls Mountain Park,...,Naval architect,1967-08-30,483f52fe67fabef353d552c1e662974c,1371816752,33.619513,-105.130529,0,2020-06-21,6,2020
1296673,2020-06-21 12:13:36,2720012583106919,"fraud_Reinger, Weissnat and Strosin",food_dining,74.90,Joseph,Murray,M,42933 Ryan Underpass,Manderson,...,Volunteer coordinator,1980-08-18,d667cdcbadaaed3da3f4020e83591c83,1371816816,42.788940,-103.241160,0,2020-06-21,6,2020


In [11]:
total_copy = total.copy()

In [12]:
from sklearn.preprocessing import LabelEncoder

columns_to_convert = total.columns[total.dtypes == 'object']
label_encoders = {}
for column in columns_to_convert:
    label_encoders[column] = LabelEncoder()
    total_copy[column] = label_encoders[column].fit_transform(total[column])

In [14]:
columns_to_convert = total.columns[total.dtypes == 'datetime64[ns]']
label_encoders = {}
for column in columns_to_convert:
    label_encoders[column] = LabelEncoder()
    total_copy[column] = label_encoders[column].fit_transform(total[column])

In [15]:
total_copy

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,trans_date,trans_month,trans_year
0,1274791,2291163933867244,319,10,2.86,160,118,1,364,168,...,288,431,329864,1371816865,33.986391,-81.200714,0,536,6,2020
1,1274792,3573030041201292,591,10,29.84,172,472,0,378,16,...,406,820,363509,1371816873,39.450498,-109.960431,0,536,6,2020
2,1274793,3598215285024754,611,5,41.28,27,254,0,935,64,...,272,481,1447341,1371816893,40.495810,-74.196111,0,536,6,2020
3,1274794,3591919803438423,222,9,60.05,46,472,1,339,814,...,422,778,240955,1371816915,28.812398,-80.883061,0,536,6,2020
4,1274795,3526826139003047,292,13,3.19,258,266,1,591,262,...,205,206,636661,1371816917,44.959148,-85.884734,0,536,6,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,1274786,30263540414123,499,0,15.56,122,336,1,158,333,...,216,305,492241,1371816728,36.841266,-111.690765,0,536,6,2020
1296671,1274787,6011149206456997,2,1,51.70,162,468,1,870,824,...,362,642,285107,1371816739,38.906881,-78.246528,0,536,6,2020
1296672,1274788,3514865930894695,599,1,105.93,75,69,1,162,349,...,309,420,522540,1371816752,33.619513,-105.130529,0,536,6,2020
1296673,1274789,2720012583106919,509,1,74.90,181,308,1,437,477,...,488,651,1551366,1371816816,42.788940,-103.241160,0,536,6,2020


In [17]:
total_copy

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,trans_date,trans_month,trans_year
0,1274791,2291163933867244,319,10,2.86,160,118,1,364,168,...,288,431,329864,1371816865,33.986391,-81.200714,0,536,6,2020
1,1274792,3573030041201292,591,10,29.84,172,472,0,378,16,...,406,820,363509,1371816873,39.450498,-109.960431,0,536,6,2020
2,1274793,3598215285024754,611,5,41.28,27,254,0,935,64,...,272,481,1447341,1371816893,40.495810,-74.196111,0,536,6,2020
3,1274794,3591919803438423,222,9,60.05,46,472,1,339,814,...,422,778,240955,1371816915,28.812398,-80.883061,0,536,6,2020
4,1274795,3526826139003047,292,13,3.19,258,266,1,591,262,...,205,206,636661,1371816917,44.959148,-85.884734,0,536,6,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,1274786,30263540414123,499,0,15.56,122,336,1,158,333,...,216,305,492241,1371816728,36.841266,-111.690765,0,536,6,2020
1296671,1274787,6011149206456997,2,1,51.70,162,468,1,870,824,...,362,642,285107,1371816739,38.906881,-78.246528,0,536,6,2020
1296672,1274788,3514865930894695,599,1,105.93,75,69,1,162,349,...,309,420,522540,1371816752,33.619513,-105.130529,0,536,6,2020
1296673,1274789,2720012583106919,509,1,74.90,181,308,1,437,477,...,488,651,1551366,1371816816,42.788940,-103.241160,0,536,6,2020


In [18]:
total_1 = total.copy()
total = total_copy.copy()

In [19]:
total['is_fraud'].value_counts()

0    1842743
1       9651
Name: is_fraud, dtype: int64

In [20]:
import numpy as np
import pandas as pd
from sklearn.metrics import average_precision_score
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, auc
from sklearn.metrics import f1_score

df1 = total.sample(5000)

# Create copies of df1 for different experiments
df2 = df1.copy()
df3 = df1.copy()
df4 = df1.drop(columns=['is_fraud']).copy()

# Function to classify based on threshold
def classify_anomaly_score(score, threshold):
    if score <= threshold:
        return 1  # Fraud
    else:
        return 0  # Normal

anomaly_scores = isolation_forest(df2, no_of_trees=100, sample_size=256)
threshold = np.quantile(anomaly_scores, 0.005)
        
# Classify ano{maly scores
df2['AnomalyScore'] = anomaly_scores
df2['predicted_class'] = df2['AnomalyScore'].apply(lambda score: classify_anomaly_score(score, threshold))

ground_truth_labels = df3['is_fraud']
# Assuming 'predicted_class_labels' contains the predicted class labels (0 for normal, 1 for fraud)
predicted_class_labels = df2['predicted_class']

precision, recall, _ = precision_recall_curve(ground_truth_labels, predicted_class_labels)

# Calculate AUPR
aupr_score = auc(recall, precision)

f1 = f1_score(df2['predicted_class'], df3['is_fraud'])

print(f1)
print(aupr_score)

[<__main__.InNode object at 0x000001B61E317DF0>, <__main__.InNode object at 0x000001B61C78A670>, <__main__.InNode object at 0x000001B61DD3ECD0>, <__main__.InNode object at 0x000001B618A6F8B0>, <__main__.InNode object at 0x000001B618A78490>, <__main__.InNode object at 0x000001B6163195E0>, <__main__.InNode object at 0x000001B618A786D0>, <__main__.InNode object at 0x000001B616311520>, <__main__.InNode object at 0x000001B612604B20>, <__main__.InNode object at 0x000001B61C78B880>, <__main__.InNode object at 0x000001B61DD3D550>, <__main__.InNode object at 0x000001B61A02CD30>, <__main__.InNode object at 0x000001B610A6CBE0>, <__main__.InNode object at 0x000001B610485BE0>, <__main__.InNode object at 0x000001B6226077F0>, <__main__.InNode object at 0x000001B625D45700>, <__main__.InNode object at 0x000001B622606760>, <__main__.InNode object at 0x000001B61048B9D0>, <__main__.InNode object at 0x000001B6131C7B50>, <__main__.InNode object at 0x000001B625D45B20>, <__main__.InNode object at 0x000001B61A

In [None]:
df2['predicted_class'].value_counts()

0    9979
1      21
Name: predicted_class, dtype: int64

In [51]:
total

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,trans_date,trans_month,trans_year
0,1274791,2291163933867244,319,10,2.86,160,118,1,364,168,...,288,431,329864,1371816865,33.986391,-81.200714,0,536,6,2020
1,1274792,3573030041201292,591,10,29.84,172,472,0,378,16,...,406,820,363509,1371816873,39.450498,-109.960431,0,536,6,2020
2,1274793,3598215285024754,611,5,41.28,27,254,0,935,64,...,272,481,1447341,1371816893,40.495810,-74.196111,0,536,6,2020
3,1274794,3591919803438423,222,9,60.05,46,472,1,339,814,...,422,778,240955,1371816915,28.812398,-80.883061,0,536,6,2020
4,1274795,3526826139003047,292,13,3.19,258,266,1,591,262,...,205,206,636661,1371816917,44.959148,-85.884734,0,536,6,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,1274786,30263540414123,499,0,15.56,122,336,1,158,333,...,216,305,492241,1371816728,36.841266,-111.690765,0,536,6,2020
1296671,1274787,6011149206456997,2,1,51.70,162,468,1,870,824,...,362,642,285107,1371816739,38.906881,-78.246528,0,536,6,2020
1296672,1274788,3514865930894695,599,1,105.93,75,69,1,162,349,...,309,420,522540,1371816752,33.619513,-105.130529,0,536,6,2020
1296673,1274789,2720012583106919,509,1,74.90,181,308,1,437,477,...,488,651,1551366,1371816816,42.788940,-103.241160,0,536,6,2020


In [52]:
total.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1852394 entries, 0 to 1296674
Data columns (total 25 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   trans_date_trans_time  int64  
 1   cc_num                 int64  
 2   merchant               int32  
 3   category               int32  
 4   amt                    float64
 5   first                  int32  
 6   last                   int32  
 7   gender                 int32  
 8   street                 int32  
 9   city                   int32  
 10  state                  int32  
 11  zip                    int64  
 12  lat                    float64
 13  long                   float64
 14  city_pop               int64  
 15  job                    int32  
 16  dob                    int64  
 17  trans_num              int32  
 18  unix_time              int64  
 19  merch_lat              float64
 20  merch_long             float64
 21  is_fraud               int64  
 22  trans_date        

In [69]:
total.drop('category',axis=1,inplace=True)

In [83]:
def calculate_confusion_matrix(actual_labels, predicted_labels):
    true_positive = sum((a == 1) and (p == 1) for a, p in zip(actual_labels, predicted_labels))
    false_positive = sum((a == 0) and (p == 1) for a, p in zip(actual_labels, predicted_labels))
    false_negative = sum((a == 1) and (p == 0) for a, p in zip(actual_labels, predicted_labels))
    
    return true_positive, false_positive, false_negative

In [84]:
def calculate_f1_score(true_positive, false_positive, false_negative):
    precision = true_positive / (true_positive + false_positive)
    recall = true_positive / (true_positive + false_negative)
    
    if precision + recall == 0:
        return 0  # To avoid division by zero
    else:
        f1_score = 2 * (precision * recall) / (precision + recall)
        return f1_score


In [90]:
def calculate_accuracy(true_labels, predicted_labels):
    correct_predictions = 0
    total_predictions = len(true_labels)
    
    for true_label, predicted_label in zip(true_labels, predicted_labels):
        if true_label == predicted_label:
            correct_predictions += 1
            
    accuracy = correct_predictions / total_predictions
    return accuracy

In [86]:
import numpy as np
import pandas as pd
from sklearn.metrics import average_precision_score
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, auc
from sklearn.metrics import f1_score

df1 = total.sample(50000)

# Create copies of df1 for different experiments
df2 = df1.copy()
df3 = df1.copy()
df4 = df1.drop(columns=['is_fraud']).copy()

# Function to classify based on threshold
def classify_anomaly_score(score, threshold):
    if score <= threshold:
        return 1  # Fraud
    else:
        return 0  # Normal

anomaly_scores = isolation_forest(df2, no_of_trees=100, sample_size=256)
threshold = np.quantile(anomaly_scores, 0.005)
        
# Classify ano{maly scores
df2['AnomalyScore'] = anomaly_scores
df2['predicted_class'] = df2['AnomalyScore'].apply(lambda score: classify_anomaly_score(score, threshold))

ground_truth_labels = df3['is_fraud']
predicted_class_labels = df2['predicted_class']

precision, recall, _ = precision_recall_curve(ground_truth_labels, predicted_class_labels)

# Calculate AUPR
aupr_score = auc(recall, precision)

true_positive, false_positive, false_negative = calculate_confusion_matrix(ground_truth_labels, predicted_class_labels)
print("True Positives:", true_positive)
print("False Positives:", false_positive)
print("False Negatives:", false_negative)

f1_score = calculate_f1_score(true_positive, false_positive, false_negative)

print("F1 Score :", f1_score)
print("AUPR Score : ", aupr_score)

True Positives: 144
False Positives: 110
False Negatives: 120
F1 Score : 0.5559845559845561
AUPR Score :  0.5573918396564066


In [91]:
accuracy = calculate_accuracy(ground_truth_labels, predicted_class_labels)
print("Accuracy:", accuracy)

Accuracy: 0.9406


In [75]:
df1 = total.sample(70000)

# Create copies of df1 for different experiments
df2 = df1.copy()
df3 = df1.copy()
df4 = df1.drop(columns=['is_fraud']).copy()

In [1]:
# Function to classify based on threshold
def classify_anomaly_score(score, threshold):
    if score <= threshold:
        return 1  # Fraud
    else:
        return 0  # Normal

values = [0.002, 0.005, 0.01, 0.02, 0.05]

df1 = total.sample(50000)

# Create copies of df1 for different experiments
df3 = df1.copy()

# Lists to store results
aupr_scores = []
f1_scores = []

df2 = df1.copy()
df2.drop('is_fraud', axis=1, inplace=True)

anomaly_scores = isolation_forest(df2, no_of_trees=100, sample_size=256)
df2['AnomalyScore'] = anomaly_scores

# Loop over different threshold values
for val in values:
    
    threshold = np.quantile(anomaly_scores, val)
    # Classify anomaly scores
    df2['predicted_class'] = df2['AnomalyScore'].apply(lambda score: classify_anomaly_score(score, threshold))
    
    ground_truth_labels = df3['is_fraud']
    predicted_class_labels = df2['predicted_class']
    
    true_positive, false_positive, false_negative = calculate_confusion_matrix(ground_truth_labels, predicted_class_labels)
    print("Threshold:", threshold)
    print("True Positives:", true_positive)
    print("False Positives:", false_positive)
    print("False Negatives:", false_negative)

    f1_score = calculate_f1_score(true_positive, false_positive, false_negative)
    f1_scores.append(f1_score)

    print("F1 Score :", f1_score)
    
    precision, recall, _ = precision_recall_curve(ground_truth_labels, predicted_class_labels)

    # Calculate AUPR
    aupr_score = auc(recall, precision)
    aupr_scores.append(aupr_score)
    print("AUPR Score : ", aupr_score)
    print()

# Plot AUPR and F1 scores
plt.plot(values, aupr_scores, label='AUPR Score')
plt.plot(values, f1_scores, label='F1 Score')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.title('AUPR and F1 Scores vs. Threshold')
plt.legend()
plt.grid(True)
plt.show()

NameError: name 'total' is not defined