In [35]:
import pandas as pd

credit_data = pd.read_csv("creditcard.csv")
# credit_data = credit_data[1000:]
credit_data.columns

## The dataset is highly unbalanced, the positive class (frauds) account for 0.172% of all transactions ##

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')

In [36]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state = 42)

features = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10','V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20','V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28']

salient_features = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10']

y = credit_data.Class
X = credit_data[features]
# X = credit_data[salient_features]
print(y.shape)
print(X.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# With smote resampling
print(X_train.shape)
print(y_train.shape)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
print(X_train_res.shape)


(284807,)
(284807, 10)
(213605, 10)
(213605,)
(426448, 10)


In [37]:
## Logistic Regression

from sklearn.linear_model import LogisticRegression

model = LogisticRegression(class_weight='balanced', solver='liblinear', C=2.5)
model.fit(X_train_res, y_train_res)

In [38]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
y_pred = model.predict_proba(X_test)[:,1]
print(y_pred)

# Move decision boundary from threshold = 0.5 (default choice)
threshold = 0.99
y_pred = (y_pred >= threshold).astype(int)

accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")
precision = precision_score(y_test, y_pred)
print(f"Test Precision: {precision:.4f}")
recall = recall_score(y_test, y_pred)
print(f"Test Recall: {recall:.4f}")

[6.99689632e-02 1.18911901e-01 1.08481575e-01 ... 4.25977054e-01
 7.34466116e-02 1.66795044e-04]
Test Accuracy: 0.9983
Test Precision: 0.4639
Test Recall: 0.6937


In [8]:
# Visualize high-dimensional data in lower dimensional space
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import tqdm

# Apply t-SNE
# tsne = TSNE(n_components=2, random_state=42, perplexity=30, verbose=1)
# X_embedded = tsne.fit_transform(X)
# plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y)
# plt.title("Visualization of t-SNE embeddings of dimension two")
# plt.xlabel("dim 0")
# plt.ylabel("dim 1")
# plt.show()

In [None]:
# define the support vector classifier --> hyperplane will divide the space between 'fraud' and 'non-fraud' datapoints.
from sklearn.svm import SVC

# trying with radial basis function kernel, instead of linear.
# svc = SVC(kernel='rbf', random_state=1)
svc = SVC(kernel='linear', random_state=1)

# fit classifier to training data
svc.fit(X_train_res, y_train_res)

In [None]:
# evaluation 
y_pred = svc.predict(X_test)

In [None]:
print(y_pred[35])

In [None]:
# determine accuracy on val data
from sklearn.metrics import accuracy_score, precision_score, recall_score

accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

In [None]:
# % of times a positive fraud prodiction was correct
precision = precision_score(y_test, y_pred)
print(f"Test Precision: {precision:.4f}")

In [None]:
# % of actual positives (fraud occured) correctly classified. We can see it's pretty low given the dataset is unbalanced
recall = recall_score(y_test, y_pred)
print(f"Test Recall: {recall:.4f}")

In [None]:
## Now train and evaluate using SMOTE resampling.

# define the support vector classifier --> hyperplane will divide the space between 'fraud' and 'non-fraud' datapoints.
from sklearn.svm import SVC
from tqdm import tqdm

# trying with radial basis function kernel, instead of linear.
svc = SVC(kernel='linear', random_state=1)


svc.fit(X_train_res, y_train_res)
y_pred = scv.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")
precision = precision_score(y_test, y_pred)
print(f"Test Precision: {precision:.4f}")
recall = recall_score(y_test, y_pred)
print(f"Test Recall: {recall:.4f}")

In [None]:
# Now we seek to account for the fact that the dataset is unbalanced, with slack variables and associated penalty, "C".
svc = SVC(C=5.0, kernel='linear', random_state=1)
# fit classifier to training data
svc.fit(X_train, y_train)

# % of actual positives (fraud occured) correctly classified. We can see it's pretty low given the dataset is unbalanced
recall = recall_score(y_test, predictions)
print(f"Test Recall: {recall:.4f}")

In [39]:
## Try XGBoost
import xgboost as xgb

model = xgb.XGBClassifier(eval_metric='mlogloss')
model.fit(X_train_res, y_train_res)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")
precision = precision_score(y_test, y_pred)
print(f"Test Precision: {precision:.4f}")
recall = recall_score(y_test, y_pred)
print(f"Test Recall: {recall:.4f}")

Test Accuracy: 0.9976
Test Precision: 0.3660
Test Recall: 0.7748


In [None]:
## SMOTE (synthetic minority over-sampling technique) oversampling: technique for oversampling a dataset in a classification problem. 
    # Take a sample from dataset, and consider k nearest neighbors.
    # Create synthetic datapoint by taking vector between current point and one of k neighbors.
    # Multiply vector by a random number in range of [0, 1]
    # Features should be continuous (which they are for the case of this fraud classification problem).




In [None]:
Test Accuracy: 0.9974
Test Precision: 0.3542
Test Recall: 0.8889