<a href="https://colab.research.google.com/github/kraipisit/kraipisit/blob/main/fraud_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fraud Detection, EDA, Modeling

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix,precision_score,recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score,cross_val_predict
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, accuracy_score,f1_score
from warnings import simplefilter
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import plot_roc_curve
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV


# Fraud detection in online transactions


In [None]:
import os

# List every file stored under the dataset folder on the mounted Drive.
# os.walk() only descends into directories; given the path of the CSV file
# itself it yields nothing, so the containing folder is walked instead.
for dirname, _, filenames in os.walk('/content/drive/MyDrive/fraud'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
df = pd.read_csv('/content/drive/MyDrive/fraud/Fraud.csv')

# Data Structure


In [None]:
df.shape

In [None]:
df.head()

# Data Cleaning

Checking for null values

Checking for duplicates

Checking the data types




In [None]:
df = df.dropna()

In [None]:
df

In [None]:
# Keep only the first row of each group that shares the same amount and
# balance values, then renumber the remaining rows from 0.
df = df.drop_duplicates(
    subset=['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
).reset_index(drop=True)
df.shape[0]


In [None]:
df.isnull().sum()

In [None]:
df.duplicated().sum()

In [None]:
df.dtypes

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
# Number of rows that share their amount and balance values with at least
# one other row (keep=False marks every member of a duplicate group).
balance_dupes = df.duplicated(
    subset=['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest'],
    keep=False,
)
int(balance_dupes.sum())

# Initial Analysis and visualization
Studying Correlation:

In [None]:
# Pairwise Pearson correlation of the numeric columns only. The text columns
# are filtered out explicitly because pandas >= 2.0 no longer skips them
# silently in DataFrame.corr() and raises instead.
cor = df.select_dtypes(include=['number', 'bool']).corr()
cor

In [None]:
# Annotated heatmap of the Pearson correlations between numeric columns.
# Non-numeric columns are dropped first so DataFrame.corr() does not fail on
# text data under pandas >= 2.0.
plt.figure(figsize=(16, 8))
numeric_df = df.select_dtypes(include=['number', 'bool'])
ax = sns.heatmap(numeric_df.corr(method='pearson'), annot=True, vmin=-1, vmax=1)

# Time Stamp


In [None]:
df['day_of_month'] =np.ceil(df['step']/24)

# Types of transactions


In [None]:
sns.countplot(x='type',data=df,order=df['type'].value_counts().index,color = 'blue')

# Types of transactions where fraud occurs

ประเภทของธุรกรรมที่เกิดการฉ้อโกง




In [None]:
df[df['isFraud']==1].groupby('type')['isFraud'].count()

# Fraudulent transactions only occur in cash-out and transfer transactions

 ธุรกรรมฉ้อโกงเกิดขึ้นเฉพาะในการทำธุรกรรมเงินสดออกและโอน


In [None]:
ct = df[df['type'].isin(['CASH_OUT','TRANSFER'])]

In [None]:
fraud_in_cash = ct[ct['type'] == 'CASH_OUT']
fraud_in_cash[fraud_in_cash['isFraud'] == 1]

# In fraudulent transactions of type cash out, the origin balance drops to zero after the cash out


ในการทำธุรกรรมที่เป็นการฉ้อโกงประเภทการถอนเงินสด จะสังเกตเห็นว่ายอดดุลต้นทางกลับเป็นศูนย์หลังจากถอนเงินสดออก

In [None]:
fraud_in_transfer = ct[ct['type'] == 'TRANSFER']
fraud_in_transfer[fraud_in_transfer['isFraud'] == 1]

In fraudulent transactions of type transfer, the whole amount in the origin account is emptied, with no access to the destination balance.

ในธุรกรรมฉ้อโกงประเภทการโอน สังเกตว่า ยอดเงินทั้งหมดในบัญชีต้นทางว่างเปล่า เข้าถึงยอดคงเหลือปลายทางไม่ได้






In [None]:
sns.scatterplot(data= ct , x = 'day_of_month', y='amount',hue = 'type')

# Visualizing data distribution


In [None]:
bin_values = np.arange(start=0, stop=1000000, step=1000)
g = sns.histplot(df['amount'],bins=bin_values)

In [None]:
# One histogram per balance column for the non-fraud rows, sharing the y axis.
# Bins are 200,000 wide, starting at 0 and stopping before 4,000,000.
bin_values = np.arange(start=0, stop=4000000, step=200000)
normal_tx = df[df['isFraud'] == 0]
balance_columns = ['oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
fig, axes = plt.subplots(1, 4, figsize=(15, 5), sharey=True)
fig.suptitle('Studying data distribution in normal transactions')
for axis, column in zip(axes, balance_columns):
    sns.histplot(ax=axis, x=column, bins=bin_values, data=normal_tx, color='green')

In [None]:
# One histogram per balance column for the fraud rows, sharing the y axis.
# The bins are defined under the name the plotting calls actually use, so this
# cell does not depend on a variable left over from an earlier cell.
bin_values = np.arange(start=0, stop=4000000, step=200000)
fraud_tx = df[df['isFraud'] == 1]
fig, axes = plt.subplots(1, 4, figsize=(15, 5), sharey=True)
fig.suptitle('Studying data distribution in fraudelant transactions')
sns.histplot(ax=axes[0], x='oldbalanceOrg', bins=bin_values, data=fraud_tx, color='red')
sns.histplot(ax=axes[1], x='newbalanceOrig', bins=bin_values, data=fraud_tx, color='red')
sns.histplot(ax=axes[2], x='oldbalanceDest', bins=bin_values, data=fraud_tx, color='red')
sns.histplot(ax=axes[3], x='newbalanceDest', bins=bin_values, data=fraud_tx, color='red')

# Data distribution is right-skewed, with a peak at zero

การกระจายข้อมูลถูกต้อง เบ้ ดูที่ศูนย์

In [None]:
a = sns.countplot(x= df['isFraud'])
a.set(yscale="log")

Due to the nature of the problem, fraudulent transactions are a very small fraction of all transactions


เนื่องจากปัญหาของธรรมชาติ ธุรกรรมที่เป็นการฉ้อโกงจึงมีการกำหนดจำนวนน้อยมากในการทำธุรกรรมทั้งหมด


# Categorical variables study

#การศึกษาตัวแปรเชิงหมวดหมู่


In [None]:
df[['type', 'nameOrig', 'nameDest','isFlaggedFraud']]

# Extracting 2 new columns from the first character of the name columns

แยกคอลัมน์ใหม่ 2 คอลัมน์ด้วยตัวเลขในตอนต้นของคอลัมน์ชื่อ





In [None]:
df['nameOrig_code'] = df['nameOrig'].str[:1]
df['nameDest_code'] = df['nameDest'].str[:1]

In [None]:
df[['nameDest_code','nameOrig_code']].value_counts()

We can drop nameOrig_code: it is constant, so it would not be useful in our analysis



เราสามารถวาง nameOrig_code ได้เนื่องจากเป็นแบบคงที่จะไม่มีประโยชน์ในหรือการวิเคราะห์

Numerical variables study:

การศึกษาตัวแปรเชิงตัวเลข






Studying Outliers:

การศึกษาค่าผิดปกติ

In [None]:
# For each numeric column, print how many values fall outside the
# 1.5 * IQR fences around the first and third quartiles.
num_cols = ['step', 'oldbalanceOrg', 'newbalanceOrig',
            'oldbalanceDest', 'newbalanceDest', 'amount']
for col in num_cols:
    values = df[col]
    lower_q, upper_q = np.percentile(values, [25, 75])
    spread = upper_q - lower_q
    upper_fence = upper_q + 1.5 * spread
    lower_fence = lower_q - 1.5 * spread
    is_outlier = (values > upper_fence) | (values < lower_fence)
    print(f'{col}:{is_outlier.sum()}')

In [None]:
df.shape

# from the previous Analysis we conclude the need to :


under-sample our data, due to the imbalance


Scale the data due to the outliers




# Under Sampling



จากการวิเคราะห์ครั้งก่อน เราสรุปความจำเป็นในการ :

ข้อมูลของเราต่ำกว่าตัวอย่างเนื่องจากความไม่สมดุล

ปรับขนาดข้อมูลเนื่องจากค่าผิดปกติ

ภายใต้การสุ่มตัวอย่าง



In [None]:
def underSampling(df, target, random_state=None):
    """Balance the classes of ``df[target]`` by random under-sampling.

    Every class is cut down to the size of the rarest class: rows of the
    rarest class(es) are all kept, and the same number of rows is drawn
    without replacement from each larger class. The rarest class is found
    from the data, so it does not matter which label is the minority.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to balance.
    target : str
        Name of the label column.
    random_state : int or None, default None
        Seed for the draw. With None the global ``np.random`` state is used,
        so ``np.random.seed(...)`` still controls the result.

    Returns
    -------
    pandas.DataFrame
        Rows of the rarest class(es) in their original order, followed by the
        sampled rows of the larger classes. Index labels are preserved. An
        empty frame is returned when the target column has no usable labels.
    """
    labels = df[target]
    counts = labels.value_counts()
    # Categorical targets report unused categories with a count of 0.
    counts = counts[counts > 0]
    if counts.empty:
        return df.iloc[0:0]

    down = counts.min()
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Work with row positions rather than index labels so a non-unique index
    # cannot duplicate rows when the sample is looked up.
    label_values = labels.values
    kept, sampled = [], []
    for label, size in counts.items():
        positions = np.flatnonzero(label_values == label)
        if size == down:
            kept.append(positions)
        else:
            sampled.append(rng.choice(positions, down, replace=False))

    return df.iloc[np.concatenate(kept + sampled)]

In [None]:
# Seed NumPy's global generator before drawing the under-sample so the
# balanced dataset, and every score computed from it, is the same on each run.
np.random.seed(42)
df = underSampling(df, 'isFraud')

# Data preprocessing for modeling:

Scaling numerical variables

Encoding categorical columns

การประมวลผลข้อมูลล่วงหน้าสำหรับการสร้างแบบจำลอง

การปรับขนาดตัวแปรตัวเลข

การเข้ารหัสคอลัมน์หมวดหมู่

In [None]:
# Column names grouped by kind for preprocessing: numeric columns first,
# categorical columns second.
# NOTE(review): newbalanceOrig, newbalanceDest and day_of_month appear in
# neither list - confirm that leaving them out of the model is intended.
num_cols = ['step','oldbalanceOrg','oldbalanceDest','amount']
cat_cols = ['type', 'nameDest_code','isFlaggedFraud']

In [None]:
def train_test(df, target, test_size=0.2, random_state=42):
    """Split a frame into train/test features and labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset, including the label column.
    target : str
        Name of the label column; it is removed from the features.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X = df.drop(target, axis=1)
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

In [None]:
X_train, X_test, y_train, y_test = train_test(df,'isFraud')

In [None]:
# Preprocessing applied to the feature frame before modelling.
full_pipeline = ColumnTransformer([
    # Standardise the numeric columns.
    ("num", StandardScaler(), num_cols),
    # One-hot encode the categorical columns. handle_unknown='ignore' encodes
    # a category that was absent from the fitted data as all zeros instead of
    # raising when the test split is transformed.
    ("cat", OneHotEncoder(handle_unknown='ignore'), cat_cols),
])

In [None]:
X_train_cleaned= full_pipeline.fit_transform(X_train)
X_test_cleaned =full_pipeline.transform(X_test)

# Modeling
# Performance measures:

การสร้างแบบจำลอง

มาตรการประสิทธิภาพ

In [None]:
def printScores(y_test, y_pred):
    """Print the confusion matrix and four summary scores for predictions.

    y_test holds the true labels and y_pred the predicted labels. Accuracy,
    recall, precision and F1 are printed in that order, one per line.
    Nothing is returned.
    """
    print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))
    scorers = (
        ('accuracy_score', accuracy_score),
        ('Recall_score', recall_score),
        ('precision_score', precision_score),
        ('f1_score', f1_score),
    )
    for label, scorer in scorers:
        print(label, scorer(y_test, y_pred))

In [None]:
knn_clf = KNeighborsClassifier()
y_train_pred = cross_val_predict(knn_clf,X_train_cleaned,y_train, cv = 10)

In [None]:
printScores(y_train,y_train_pred)

# Grid Search on KNN

Due to the problem nature, we cannot accept high false negative rate



ค้นหากริดบน KNN

เนื่องจากลักษณะของปัญหาเราไม่สามารถยอมรับอัตราการติดลบเท็จที่สูงได้


In [None]:
# Candidate hyper-parameters for the KNN grid search.
# NOTE(review): 'algorithm' and 'leaf_size' presumably only change how the
# neighbours are searched, not which neighbours are found - confirm, since
# they multiply the number of fits by 12.
param_grid = dict(
    n_neighbors=range(1, 30),
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree'],
    leaf_size=[1, 2, 3, 5],
    metric=['minkowski'],
    p=[1, 2],
)

In [None]:
grid = GridSearchCV(knn_clf, param_grid= param_grid, cv = 10, scoring = ['precision', 'recall'],refit='recall')
grid.fit(X_train_cleaned, y_train)

In [None]:
GridSearchCV(cv=10, estimator=KNeighborsClassifier(),
             param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree'],
                         'leaf_size': [1, 2, 3, 5], 'metric': ['minkowski'],
                         'n_neighbors': range(1, 30), 'p': [1, 2],
                         'weights': ['uniform', 'distance']},
             refit='recall', scoring=['precision', 'recall'])

In [None]:
grid.best_estimator_

KNeighborsClassifier(leaf_size=1, n_neighbors=1, p=1)



In [None]:
knn_pred = grid.predict(X_test_cleaned)

In [None]:
printScores(y_test,knn_pred)

# Apply decision trees for better recall:


ใช้แผนผังการตัดสินใจเพื่อการจำที่ดีขึ้น
# Decision trees:

In [None]:
from sklearn.tree import DecisionTreeClassifier
clf_en = DecisionTreeClassifier(criterion='gini', max_depth=9, random_state=0)
tree_pred = cross_val_predict(clf_en,X_train_cleaned,y_train,cv = 10)

In [None]:
printScores(y_train,tree_pred)

# **Grid search on decision trees:**

การค้นหาตารางบนแผนผังการตัดสินใจ

In [None]:
# Candidate hyper-parameters for the decision-tree grid search.
# NOTE(review): every min_samples_leaf candidate is at least 10, so the
# min_samples_split candidates (all <= 10) may never be the binding
# constraint - confirm before keeping both in the grid.
params = dict(
    max_depth=[5, 10, 20],
    min_samples_split=[2, 3, 5, 10],
    min_samples_leaf=range(10, 20),
    min_impurity_decrease=[0.0001, 0.0005, 0.001, 0.005, 0.01],
)

In [None]:
# Recall-driven grid search over the decision-tree hyper-parameters.
# The tree gets a fixed seed so repeated runs of the search select the same
# estimator; an unseeded tree can break ties between equally good splits
# differently from run to run.
c = DecisionTreeClassifier(random_state=0)
clf = GridSearchCV(c, param_grid=params, scoring='recall')

In [None]:
clf.fit(X_train_cleaned, y_train)

In [None]:
clf.best_estimator_

In [None]:
tree_pred = clf.predict(X_test_cleaned)
printScores(y_test, tree_pred)

# Decision trees gave a better false negative ratio

 ต้นไม้แห่งการตัดสินใจให้อัตราส่วนลบเท็จที่ดีกว่า