# Detect financial fraud with stochastic vector machine (SVM) in python
10-detect-financial-fraud-w-svm-in-python

In [74]:
# Setup libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [55]:
# Read data into dataframe
df = pd.read_csv('../00-Datasets/finance1.csv')

In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [57]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [58]:
df.tail()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.0,C776919290,0.0,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.0,C1881841831,0.0,0.0,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.0,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.0,C2080388513,0.0,0.0,1,0
6362619,743,CASH_OUT,850002.52,C1280323807,850002.52,0.0,C873221189,6510099.11,7360101.63,1,0


In [59]:
df.sample(6)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
6211480,588,PAYMENT,37616.01,C389403521,131055.52,93439.51,M1563722516,0.0,0.0,0,0
1485056,141,PAYMENT,846.06,C1615784271,0.0,0.0,M1746077381,0.0,0.0,0,0
6001317,426,PAYMENT,12678.59,C1516082905,139429.83,126751.24,M241111218,0.0,0.0,0,0
6080721,520,CASH_IN,113818.97,C1882913163,259309.37,373128.34,C754407098,266396.36,152577.39,0,0
3081123,235,CASH_OUT,359784.92,C550338076,0.0,0.0,C1113798570,544431.5,904216.42,0,0
3870179,283,PAYMENT,1397.5,C717449881,24604.0,23206.5,M7101009,0.0,0.0,0,0


In [60]:
# Explore categorical features
df['isFraud'].value_counts()

0    6354407
1       8213
Name: isFraud, dtype: int64

In [61]:
# Check for missing values
df.isna().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [62]:
# Check for duplicate values
df.duplicated().sum()

0

In [63]:
# Prepare the data
df = df.sample(10000, random_state=0)
le = LabelEncoder()
df.type = le.fit_transform(df.type)

In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 4644207 to 3422619
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   step            10000 non-null  int64  
 1   type            10000 non-null  int32  
 2   amount          10000 non-null  float64
 3   nameOrig        10000 non-null  object 
 4   oldbalanceOrg   10000 non-null  float64
 5   newbalanceOrig  10000 non-null  float64
 6   nameDest        10000 non-null  object 
 7   oldbalanceDest  10000 non-null  float64
 8   newbalanceDest  10000 non-null  float64
 9   isFraud         10000 non-null  int64  
 10  isFlaggedFraud  10000 non-null  int64  
dtypes: float64(5), int32(1), int64(3), object(2)
memory usage: 898.4+ KB


In [65]:
df.drop(columns=['nameOrig','nameDest'], axis=1, inplace=True)

In [66]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 4644207 to 3422619
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   step            10000 non-null  int64  
 1   type            10000 non-null  int32  
 2   amount          10000 non-null  float64
 3   oldbalanceOrg   10000 non-null  float64
 4   newbalanceOrig  10000 non-null  float64
 5   oldbalanceDest  10000 non-null  float64
 6   newbalanceDest  10000 non-null  float64
 7   isFraud         10000 non-null  int64  
 8   isFlaggedFraud  10000 non-null  int64  
dtypes: float64(5), int32(1), int64(3)
memory usage: 742.2 KB


In [67]:
# Split dataset into 80% train and 20% test
X = df.drop('isFraud', axis=1)
y = df['isFraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [68]:
# Build model
model = SVC()
model.fit(X_train, y_train)

In [70]:
# Feed test data into model
y_predict = model.predict(X_test)

actual_vs_predict = pd.DataFrame({'Actual': y_test,
                                 'Predict': y_predict})

actual_vs_predict.sample(12)

Unnamed: 0,Actual,Predict
5402607,0,0
2220927,0,0
1187085,0,0
5526543,0,0
5634550,0,0
3907901,0,0
1615086,0,0
2042538,0,0
5680292,0,0
1503425,0,0


In [75]:
# Evaluate model
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1997
           1       1.00      0.33      0.50         3

    accuracy                           1.00      2000
   macro avg       1.00      0.67      0.75      2000
weighted avg       1.00      1.00      1.00      2000



## Notes
* We wanted to detect fraudulent transactions using SVM.
* Our dataset had 6362620 records and 11 features, 5 of which are of data type float64, 3 int, and 3 objects.
* A quick look showed that there were no missing or duplicate values.
* We removed some features and transformed the rest. We also picked a sample size of 10000 to use in our model. 
* The dataset was split into 80% train and 20% test sets. No validation set was created or used. No hyperparameter tuning occurred.
* We built, trained, and tested an SVM. 
* We evaluated our model using ground truth and several metrics including precision, recall, and f1-score.