In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, classification_report, RocCurveDisplay
from  sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


#### Importing Data

In [None]:
creditCard_data = pd.read_csv('creditcard_data.csv', delimiter=",")
df = creditCard_data.copy()
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284806 entries, 0 to 284805
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284806 non-null  float64
 1   V1      284806 non-null  float64
 2   V2      284806 non-null  float64
 3   V3      284806 non-null  float64
 4   V4      284806 non-null  float64
 5   V5      284806 non-null  float64
 6   V6      284806 non-null  float64
 7   V7      284806 non-null  float64
 8   V8      284806 non-null  float64
 9   V9      284806 non-null  float64
 10  V10     284806 non-null  float64
 11  V11     284806 non-null  float64
 12  V12     284806 non-null  float64
 13  V13     284806 non-null  float64
 14  V14     284806 non-null  float64
 15  V15     284806 non-null  float64
 16  V16     284806 non-null  float64
 17  V17     284806 non-null  float64
 18  V18     284806 non-null  float64
 19  V19     284806 non-null  float64
 20  V20     284806 non-null  float64
 21  V21     28

In [None]:
df.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [None]:
df['Class'].value_counts()

0    284314
1       492
Name: Class, dtype: int64

In [None]:
legit = df[df['Class']==0]
fraud = df[df['Class']==1]

In [None]:
print(legit.shape)
print(fraud.shape)

(284314, 31)
(492, 31)


In [None]:
legit.Amount.describe()

count    284314.000000
mean         88.290570
std         250.105416
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

In [None]:
df.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94837.928076,0.00826,-0.00627,0.012168,-0.007858,0.005453,0.002421,0.009631,-0.000986,0.004465,...,-0.000646,-0.001236,-2.7e-05,6.8e-05,0.000182,-7e-05,-8.6e-05,-0.000295,-0.000131,88.29057
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


In [None]:
legit_sample = legit.sample(n=492)

In [28]:
new_df = pd.concat([legit_sample, fraud], axis =0)

In [29]:
new_df.shape

(984, 31)

In [30]:
new_df.head

<bound method NDFrame.head of             Time        V1        V2        V3        V4        V5        V6  \
110165   71705.0  1.296244 -0.738406  0.264765 -0.928984 -0.707570  0.158986   
249431  154405.0 -0.173421 -0.513267  0.802355 -1.599968  0.089336  2.060343   
280564  169609.0  2.047028 -0.380621 -1.799626  0.017444  0.543362  0.132551   
128746   78823.0 -0.735559  0.459686  2.093094  1.015258  0.159731  0.371070   
90015    62835.0 -1.039130  1.439161  0.907863  0.350304 -0.457468 -0.926220   
...          ...       ...       ...       ...       ...       ...       ...   
279863  169142.0 -1.927883  1.125653 -4.518331  1.749293 -1.566487 -2.010494   
280143  169347.0  1.378559  1.289381 -5.004247  1.411850  0.442581 -1.326536   
280149  169351.0 -0.676143  1.126366 -2.213700  0.468308 -1.120541 -0.003346   
281144  169966.0 -3.113832  0.585864 -5.399730  1.817092 -0.840618 -2.943548   
281674  170348.0  1.991976  0.158476 -2.583441  0.408670  1.151147 -0.096695   

         

#### Splitting data into features and target

In [32]:
X = new_df.drop(columns = 'Class', axis=1)
Y = new_df['Class']

In [33]:
X.shape

(984, 30)

In [34]:
Y.shape

(984,)

In [64]:
## Splitting Data into train and test dataset
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

#### Using Logistic Regression

In [36]:
model_lr = LogisticRegression(max_iter=120,random_state=0, n_jobs=20, solver='liblinear')

In [37]:
model_lr.fit(X_train, y_train)



In [38]:
trn_lr_pred = model_lr.predict(X_train)
trn_lr_acc = accuracy_score(trn_lr_pred, y_train)
print(round(trn_lr_acc*100, 2))

94.39


In [39]:
tst_lr_pred = model_lr.predict(X_test)
tst_lr_acc = accuracy_score(tst_lr_pred, y_test)
print(round(tst_lr_acc*100, 2))

93.85


##### Here accuracy on the train and the test set is almost same which means that model is neither overfitting nor underfitting.

In [40]:
print(classification_report(y_test,tst_lr_pred))

              precision    recall  f1-score   support

           0       0.91      0.97      0.94       164
           1       0.97      0.91      0.94       161

    accuracy                           0.94       325
   macro avg       0.94      0.94      0.94       325
weighted avg       0.94      0.94      0.94       325



#### Using Random Forest Classifier

In [42]:
from sklearn.ensemble import RandomForestClassifier

In [43]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)

In [44]:
rf.fit(X_train, y_train)

In [45]:
y_pred = rf.predict(X_test)

In [47]:
trn_lr_pred = rf.predict(X_train)
trn_lr_acc = accuracy_score(trn_lr_pred, y_train)
print(round(trn_lr_acc*100, 2))

100.0


In [48]:
tst_lr_pred = rf.predict(X_test)
tst_lr_acc = accuracy_score(tst_lr_pred, y_test)
print(round(tst_lr_acc*100, 2))

93.85


In [46]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.97      0.94       164
           1       0.97      0.91      0.94       161

    accuracy                           0.94       325
   macro avg       0.94      0.94      0.94       325
weighted avg       0.94      0.94      0.94       325



#### Using OneClassSVM

In [49]:
from sklearn.svm import OneClassSVM

In [51]:
svm = OneClassSVM(kernel='rbf', nu=0.1)
svm.fit(X_train)

In [52]:
y_pred = svm.predict(X_test)
y_pred[y_pred == 1] = 0
y_pred[y_pred == -1] = 1


In [54]:
trn_lr_pred = svm.predict(X_train)
trn_lr_acc = accuracy_score(trn_lr_pred, y_train)
print(round(trn_lr_acc*100, 2))

44.61


In [55]:
tst_lr_pred = svm.predict(X_test)
tst_lr_acc = accuracy_score(tst_lr_pred, y_test)
print(round(tst_lr_acc*100, 2))

41.85


In [53]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.53      0.93      0.67       164
           1       0.68      0.16      0.25       161

    accuracy                           0.54       325
   macro avg       0.60      0.54      0.46       325
weighted avg       0.60      0.54      0.46       325



#### Isolation Forest

In [56]:
from sklearn.ensemble import IsolationForest

In [58]:
## Training
iforest = IsolationForest(n_estimators=100, contamination=0.1)
iforest.fit(X_train)



In [59]:
# Predict the classes of the test set
y_pred = iforest.predict(X_test)
y_pred[y_pred == 1] = 0
y_pred[y_pred == -1] = 1


In [61]:
trn_lr_pred = iforest.predict(X_train)
trn_lr_acc = accuracy_score(trn_lr_pred, y_train)
print(round(trn_lr_acc*100, 2))

40.21


In [62]:
tst_lr_pred = iforest.predict(X_test)
tst_lr_acc = accuracy_score(tst_lr_pred, y_test)
print(round(tst_lr_acc*100, 2))

36.92


In [60]:
# Print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.58      0.99      0.73       164
           1       0.98      0.25      0.40       161

    accuracy                           0.63       325
   macro avg       0.78      0.62      0.57       325
weighted avg       0.77      0.63      0.57       325



#### Using DBSCAN 

In [63]:
from sklearn.cluster import DBSCAN

In [65]:
# Fit the DBSCAN model
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan.fit(X_train)

In [66]:
# Extract the labels
labels = dbscan.labels_

In [67]:
# Assign labels to clusters
clusters = {}
for i, label in enumerate(labels):
    if label != -1:
        if label not in clusters:
            clusters[label] = []
        clusters[label].append(i)

In [68]:
# Remove noise points
y_pred[y_pred == -1] = max(y_pred) + 1

In [70]:
# Print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.58      0.99      0.73       164
           1       0.98      0.25      0.40       161

    accuracy                           0.63       325
   macro avg       0.78      0.62      0.57       325
weighted avg       0.77      0.63      0.57       325



#### Evaluation for DBSCAN 

In [72]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score


In [73]:
# Compute the adjusted Rand index
ari = adjusted_rand_score(y_test, y_pred)


In [75]:
# Compute the normalized mutual information
nmi = normalized_mutual_info_score(y_test, y_pred)


In [76]:
print(f'Adjusted Rand index: {ari:.3f}')
print(f'Normalized mutual information: {nmi:.3f}')

Adjusted Rand index: 0.064
Normalized mutual information: 0.158


#### Using the Local Outlier Factor (LOF)

In [77]:
from sklearn.neighbors import LocalOutlierFactor

In [78]:
lof = LocalOutlierFactor(n_neighbors=5, contamination='auto')
lof.fit(X_train)

In [79]:
# Compute the LOF scores
lof_scores = -lof.negative_outlier_factor_


In [80]:
# Assign labels based on LOF scores
threshold = 1.5 * lof_scores.mean()
y_pred = [1 if score > threshold else 0 for score in lof_scores]


#### K neighbor classifier

In [84]:
from sklearn.neighbors import KNeighborsClassifier

In [85]:
# Fit the KNN model
n_neighbors = 5  # adjust this value depending on the characteristics of the data
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)

In [86]:
# Predict the test set labels
y_pred = knn.predict(X_test)


In [88]:
trn_lr_pred = knn.predict(X_train)
trn_lr_acc = accuracy_score(trn_lr_pred, y_train)
print(round(trn_lr_acc*100, 2))

76.33


In [89]:
tst_lr_pred = knn.predict(X_test)
tst_lr_acc = accuracy_score(tst_lr_pred, y_test)
print(round(tst_lr_acc*100, 2))

61.85


In [87]:
# Print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.63      0.59      0.61       164
           1       0.61      0.65      0.63       161

    accuracy                           0.62       325
   macro avg       0.62      0.62      0.62       325
weighted avg       0.62      0.62      0.62       325



In [90]:
from sklearn.naive_bayes import GaussianNB

In [91]:
nb = GaussianNB()
nb.fit(X_train, y_train)

In [92]:
# Predict the test set labels
y_pred = nb.predict(X_test)

In [94]:
trn_lr_pred = nb.predict(X_train)
trn_lr_acc = accuracy_score(trn_lr_pred, y_train)
print(round(trn_lr_acc*100, 2))

85.89


In [95]:
tst_lr_pred = nb.predict(X_test)
tst_lr_acc = accuracy_score(tst_lr_pred, y_test)
print(round(tst_lr_acc*100, 2))

87.08


In [93]:
# Print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.99      0.89       164
           1       0.98      0.75      0.85       161

    accuracy                           0.87       325
   macro avg       0.89      0.87      0.87       325
weighted avg       0.89      0.87      0.87       325

