## Data preprocessing

### **Credit card fraud detection using scikit-learn**

---



In this project, I will use several classification algorithms to detect credit card fraud.

In [1]:
# Import relevant libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
# Apply seaborn's default theme to the plots drawn in the cells below
sns.set()

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize,StandardScaler
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import roc_auc_score
import time
import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# so convergence or deprecation warnings raised by later cells are not shown.
warnings.filterwarnings('ignore')

In [3]:
# # Import data
# # install the opendatasets package
# !pip install opendatasets
# import opendatasets as od
# od.download("https://www.kaggle.com/mlg-ulb/creditcardfraud")

## **Exploration and visualization**

In [1]:
# Load the transactions CSV into a DataFrame and preview the first rows.
# Requires the import cells above to have been run first (uses `pd`).
# NOTE(review): the path looks like a Colab Google Drive mount - confirm the drive
# is mounted (or adjust DATA_PATH) before running.
DATA_PATH='/content/drive/MyDrive/Github/Anamoly-detection/credit_card_detection/creditcard.csv'
# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrapping is needed
df=pd.read_csv(DATA_PATH)
df.head()

NameError: ignored

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Column names, dtypes and non-null counts
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) per column
df.describe()

In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
# Number of rows per class label
df['Class'].value_counts()

In [None]:
# Relative frequency of each class; the absolute counts are shown in the cell above,
# so repeating them here would add nothing.
df.Class.value_counts(normalize=True)

Let's look at `Class`, which takes two values, 0 and 1. Class 0 means the transaction is valid and Class 1 means the transaction is fraudulent. As expected, the distribution is highly imbalanced: most of the transactions are valid and only a tiny fraction of them are fraudulent.

In [None]:
# get the count of each class (value_counts orders the result by frequency)
sizes = df.Class.value_counts()

# take the labels from the index of the counts so that every wedge is paired with
# the class it really represents; unique() returns values in order of first
# appearance, which is not guaranteed to match the order of value_counts()
labels = sizes.index.to_numpy()

# plot the class value counts
plt.pie(x=sizes,labels=labels,autopct='%1.1f%%')
plt.title('Share of transactions per class')

plt.show()

In [None]:
## plotting 

In [None]:
# One box plot per column among the first 30, with the values split by Class
for feature in df.columns[:30]:
  plt.figure(figsize=(10,6))
  sns.boxplot(data=df,x='Class',y=feature)
  plt.title(feature,fontsize=20)
  plt.show()

In [None]:
# Rows per class label
df['Class'].value_counts()

In [None]:
# Share of rows with Class == 1, as a percentage rounded to two decimals
print(
    round((df['Class']==1).sum()/len(df)*100,2),
    'percent of total transactions are fraudulent.',
)

In [None]:
# Rows whose Class is 0
df_legit=df.loc[df['Class']==0]
df_legit.head()

In [None]:
# Rows whose Class is 1
df_fraud=df.loc[df['Class']==1]
df_fraud.head()

In [None]:
# Summary statistics of the Class == 0 subset; reused by the bar chart below
df1_legit=df_legit.describe()
df1_legit

In [None]:
# Summary statistics of the Class == 1 subset; reused by the bar chart below
df1_fraud=df_fraud.describe()
df1_fraud

In [None]:
# Compare the per-column means of the two classes.
# Row 1 of describe() is the mean; the columns at positions 1..28 are compared.
class_means=pd.DataFrame({
    'legit':df1_legit.iloc[1,1:29],
    'fraud':df1_fraud.iloc[1,1:29],
})
# Draw both classes as grouped bars on one axes. Plotting the two series one after
# the other puts their bars on the same positions, so one class hides the other.
fig,ax=plt.subplots(figsize=(10,8))
class_means.plot(kind='bar',color=['r','g'],ax=ax)
ax.set_title('Mean value per feature by class')
ax.legend()
plt.show()

In [None]:
# One scatter figure per column among the first 30, faceted into one panel per Class
for feature in df.columns[:30]:
  sns.relplot(data=df,x='Class',y=feature,col='Class')
  plt.show()

### Dataset Preprocessing

In [None]:
# Scaling/normalization is typically useful for linear models because it speeds up training convergence.
# Standardize the columns at positions 1..29: the scaler is fitted and applied in one step
# and the result is stored in `x`.
x=StandardScaler().fit_transform(df.iloc[:,1:30])


In [None]:
#feature
# Build the feature matrix from the standardized columns (positions 1..29).
# Slicing the raw frame here would throw the standardization away, because this
# assignment replaces whatever `x` held before.
x=StandardScaler().fit_transform(df.iloc[:,1:30])

In [None]:
# label vector: the column at position 30
y=df.iloc[:,30]

In [None]:
#data normalization: rescale every sample (row) so that its absolute values sum to 1 (L1 norm)
x=normalize(x,norm='l1')

In [None]:
# print the shape of the feature matrix and of the label vector as a sanity check
print('x.shape=',x.shape,'y.shape=',y.shape)

## Data Train/test Split

In [None]:
# Feature table used for the train/test split.
# Use the same columns (positions 1..29) and the same preprocessing as the
# "Dataset Preprocessing" section. Slicing the raw frame with 0:29 would hand the
# models unscaled values, add the column at position 0 and drop the one at position 29.
feature_cols=df.columns[1:30]
x=pd.DataFrame(
    normalize(StandardScaler().fit_transform(df[feature_cols]),norm='l1'),
    columns=feature_cols,
    index=df.index,
)
x.head()

In [None]:
# Labels kept as a one-column DataFrame (column name 'Class')
y=df.loc[:,['Class']]
y.head()

In [None]:
# 80/20 split; stratifying on y keeps the class ratio the same in both parts,
# and the fixed random_state makes the split reproducible
x_train,x_test,y_train,y_test=train_test_split(
    x,
    y,
    train_size=0.8,
    stratify=y,
    random_state=1,
)
print('x_train_shape=',x_train.shape,'y_train_shape=',y_train.shape)
print('x_test.shape=',x_test.shape,'y_test.shape=',y_test.shape)

In [None]:
## Fraction of fraud cases in train and test data
# np.ravel flattens the one-column label frame, so the comparison gives a plain
# boolean array and .mean() is a single float (the share of labels equal to 1)
# instead of a pandas Series repr inside the message.
train_fraud_frac=(np.ravel(y_train)==1).mean()
test_fraud_frac=(np.ravel(y_test)==1).mean()
print(f'fraction of fraud cases in train data: {train_fraud_frac} and fraction of fraud cases in test data: {test_fraud_frac}')

In [None]:
## train-validate-split
# Split the training part again (80/20, stratified) to get a validation set.
train_x,val_x,train_y,val_y=train_test_split(x_train,y_train,train_size=0.8, stratify=y_train,random_state=1)
# Label each shape with the variable it belongs to, so the validation set is not reported as test data
print('train_x.shape=',train_x.shape,'train_y.shape=',train_y.shape)
print('val_x.shape=',val_x.shape,'val_y.shape=',val_y.shape)

In [None]:
# Share of labels equal to 1 in each split, printed as a single float.
# np.ravel flattens the one-column label frame before comparing.
print('fraction of fraud cases in train data:', (np.ravel(train_y)==1).mean())
print('fraction of fraud cases in validation data:', (np.ravel(val_y)==1).mean())

## Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score,confusion_matrix,ConfusionMatrixDisplay
# plot_confusion_matrix and plot_precision_recall_curve were removed in scikit-learn 1.2.
# Import them only where they still exist so the metrics above stay importable on newer versions.
try:
    from sklearn.metrics import plot_confusion_matrix,plot_precision_recall_curve
except ImportError:
    pass

In [None]:
# Logistic regression fitted on the training split; predictions are made on the validation split.
# max_iter is raised above the default of 100 to give the solver room to converge.
model_LR=LogisticRegression(max_iter=1000)
# fit expects a 1-D label vector; np.ravel flattens the one-column label frame
model_LR.fit(train_x,np.ravel(train_y))
y_pred=model_LR.predict(val_x)

In [None]:
# Share of validation rows whose predicted label equals the true label
score=accuracy_score(y_true=val_y,y_pred=y_pred)
print(score)

In [None]:
# Store the result under its own name: assigning it to `f1_score` would replace the
# imported function with a float, so re-running this cell (or any later call to
# f1_score(...)) would fail.
f1_lr=f1_score(val_y,y_pred)
f1_lr

In [None]:
# Precision on the validation split
precision_score(y_true=val_y,y_pred=y_pred)

In [None]:
# Recall on the validation split
recall_score(y_true=val_y,y_pred=y_pred)

In [None]:
# Raw confusion-matrix counts on the validation split (rows: true labels, columns: predictions)
confusion_matrix(y_true=val_y,y_pred=y_pred)

In [None]:
# Distinct label values present in the validation split
val_y['Class'].unique()

In [None]:
# Use the sorted class values for both the matrix layout and the tick labels.
# unique() returns values in order of first appearance, which can differ from the
# sorted order confusion_matrix uses by default and would then mislabel the axes.
class_labels=np.unique(val_y)
ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(val_y,y_pred,labels=class_labels),
    display_labels=class_labels,
).plot()

## Build a Decision Tree Classifier model with Scikit-Learn

In [None]:
# Per-sample weights for the training rows, passed to fit() further down so that
# training takes the class imbalance of this dataset into account
w_train=compute_sample_weight(class_weight='balanced',y=y_train)
w_train

In [None]:
# import the Decision Tree classifier from scikit-learn
from sklearn.tree import DecisionTreeClassifier
# depth is capped at 4; random_state is fixed so repeated runs give the same tree
DecisionTree=DecisionTreeClassifier(max_depth=4, random_state=1)
# train on the full training part (x_train/y_train), weighting each row with w_train,
# and measure the wall-clock time of the fit
t0=time.time() 
DecisionTree.fit(x_train,y_train,sample_weight=w_train)
sklearn_time=time.time()-t0
# report the elapsed training time in seconds with five decimals
print('scikit learn training time(s):{0:.5f}'.format(sklearn_time))

In [None]:
from sklearn import tree
# Large, high-resolution canvas for the tree diagram; nodes are colour-filled
plt.figure(figsize=(20,18),dpi=300)
tree.plot_tree(decision_tree=DecisionTree,filled=True)
plt.show()

In [None]:
from sklearn import metrics

In [None]:
# Decision-tree predictions on the test set and the confusion matrix normalised
# over the true labels (each row sums to 1)
y_pred=DecisionTree.predict(x_test)
cf=metrics.confusion_matrix(
    y_true=y_test,
    y_pred=y_pred,
    labels=DecisionTree.classes_,
    normalize='true',
)
cf

In [None]:
# Draw the confusion matrix stored in `cf` and tighten the figure layout
metrics.ConfusionMatrixDisplay(confusion_matrix=cf).plot()
plt.tight_layout()

In [None]:
# Per-class precision, recall, F1 and support for the predictions on the test set
print(metrics.classification_report(y_test,y_pred))

In [None]:
from sklearn.metrics import f1_score
# F1 on the test set, averaged over the classes and weighted by each class's support
f1_score(y_test, y_pred, average='weighted') 

In [None]:
from sklearn.metrics import jaccard_score
# Jaccard score on the test set, computed with class 0 as the positive label.
# NOTE(review): the notebook text describes class 1 as the fraud class - confirm that
# scoring class 0 here is intended.
jaccard_score(y_test,y_pred, pos_label=0)

# Support Vector Machine

In [None]:
from sklearn.svm import LinearSVC