<a href="https://colab.research.google.com/github/laaksonenl/machine-learning/blob/master/credit_card_svm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Credit card fraud detection with SVM

Classify credit card transactions with highly imbalanced dataset (only minor portion of transactions are frauds).

The data is anonymized and run through PCA. All V-prefix features are principal components obtained with PCA.

In [19]:
! pip install -q kaggle
from google.colab import files

files.upload()

! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/

! chmod 600 ~/.kaggle/kaggle.json

! kaggle datasets download -d 'mlg-ulb/creditcardfraud'

Saving kaggle.json to kaggle (2).json
mkdir: cannot create directory ‘/root/.kaggle’: File exists
creditcardfraud.zip: Skipping, found more recently modified local copy (use --force to force download)


In [20]:
! unzip /content/creditcardfraud.zip

Archive:  /content/creditcardfraud.zip
replace creditcard.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: creditcard.csv          


In [21]:
import pandas as pd

file_path = '/content/creditcard.csv'
data = pd.read_csv(file_path)

In [22]:
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [23]:
def calculate_positive_portion(data):
  fraud_pct =  data.value_counts()[1] / data.shape[0]
  print('Fraud portion', fraud_pct, '%')

calculate_positive_portion(data['Class'])

Fraud portion 0.001727485630620034 %


## Stratified data split

In [24]:
from sklearn.model_selection import train_test_split

X = data.drop('Class', axis=1)
y = data['Class']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [25]:
calculate_positive_portion(y_train)
calculate_positive_portion(y_test)

Fraud portion 0.001729245759178389 %
Fraud portion 0.0017204452090867595 %


### Data preprocessing and model training

With SVM models, it's important to scale the features, otherwise SVM will neglect the smaller feature scales.

In [26]:
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler

pipeline = make_pipeline(StandardScaler(), LinearSVC())

In [27]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=10)


In [28]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(pipeline, X, y, cv=skf, scoring='recall')



In [30]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

display_scores(scores)

Scores: [0.89795918 0.40816327 0.91836735 0.63265306 0.69387755 0.58
 0.92       0.57142857 0.65306122 0.55102041]
Mean: 0.6826530612244899
Standard deviation: 0.16653333831302158


### Model evaluation


In [None]:
from sklearn.metrics import plot_confusion_matrix, plot_roc_curve, classification_report, precision_recall_curve
import matplotlib.pyplot as plt
import numpy as np

def plot_precision_recall_curve(y_true, y_pred):
  precision, recall, _ = precision_recall_curve(y_true, y_pred)
  plt.step(recall, precision, color='b', alpha=0.2, where='post')
  plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')

  plt.xlabel('Recall')
  plt.ylabel('Precision')
  plt.ylim([0.0, 1.1])
  plt.xlim([0.0, 1.0])

def report_results(clf, X, y):
  print('Number of normal transactions:', len(y[y == 0]))
  print('Number of frauds:', len(y[y == 1]))
  
  clf_report = classification_report(y_true, y_pred)
  print(clf_report)

  plot_precision_recall_curve(y_true, y_pred)
  plot_confusion_matrix(clf, X, y)
  plot_roc_curve(clf, X, y)