#### Importing dataset and checking all columns

In [19]:
import pandas as pd
import numpy as np
# Location of the transaction log CSV, relative to the notebook's working directory.
DATA_PATH = 'PS_20174392719_1491204439457_log.csv'

# Read the full log into memory and preview the first rows to check the columns.
fraudData = pd.read_csv(DATA_PATH)
preview = fraudData.head()
print(preview)

   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0        0               0  
1  M2044282225             0.0             0.0        0               0  
2   C553264065             0.0             0.0        1               0  
3    C38997010         21182.0             0.0        1               0  
4  M1230701703             0.0             0.0        0               0  


#### Check for null values

In [20]:
# Count missing entries per column (isna is the same check as isnull).
nullCounts = fraudData.isna().sum()
print(nullCounts)

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64


#### Analyzing the transaction types given in the `type` column

In [21]:
# Frequency of each transaction type, most common first.
typeCounts = fraudData["type"].value_counts()
print(typeCounts)

CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: type, dtype: int64


#### Plot the Transaction Details

In [22]:
# Per-type transaction counts, split into labels and totals for plotting.
transactions = fraudData["type"].value_counts()
transactionTypes, transactionTotals = transactions.index, transactions.values

for summary in (transactionTypes, transactionTotals):
    print(summary)

Index(['CASH_OUT', 'PAYMENT', 'CASH_IN', 'TRANSFER', 'DEBIT'], dtype='object')
[2237500 2151495 1399284  532909   41432]


In [23]:
import plotly.express as ply

# The slice sizes and labels are already aggregated into transactionTotals and
# transactionTypes, so they are passed directly. The full fraudData frame is
# deliberately NOT handed to plotly: none of its columns are referenced by the
# chart, and passing it only suggests a dependency that does not exist.
transactionPie = ply.pie(
    values=transactionTotals,
    names=transactionTypes,
    title='Distribution of different Transaction types',
)
# Leaving the figure as the cell's last expression renders it inline.
transactionPie

#### Finding the correlation between the column "isFraud" and other columns

In [24]:
# Correlation is only defined for numeric columns. The frame still holds string
# columns at this point (type, nameOrig, nameDest), and pandas >= 2.0 raises on
# them instead of silently dropping them, so restrict to numeric columns first.
# This gives the same result as the old implicit behaviour on pandas < 2.0.
numericColumns = fraudData.select_dtypes(include="number")
correlation = numericColumns.corr()

# Rank every numeric column by its correlation with the isFraud flag.
print(correlation.isFraud.sort_values(ascending=False))

isFraud           1.000000
amount            0.076688
isFlaggedFraud    0.044109
step              0.031578
oldbalanceOrg     0.010154
newbalanceDest    0.000535
oldbalanceDest   -0.005885
newbalanceOrig   -0.008148
Name: isFraud, dtype: float64


#### Encoding transaction types as integers and labelling isFraud (0 → "No Fraud", 1 → "Fraud")

In [25]:
# Lookup tables for the two encodings applied in this cell.
TYPE_CODES = {"CASH_OUT": 1, "PAYMENT": 2,
              "CASH_IN": 3, "TRANSFER": 4,
              "DEBIT": 5}
FRAUD_LABELS = {0: "No Fraud", 1: "Fraud"}

# Series.map turns every value that is missing from the dict into NaN, so
# re-running this cell on already-encoded columns would wipe both of them out.
# The dtype guards make the cell safe to execute more than once:
#   - `type` is only encoded while it still holds the original strings;
#   - `isFraud` is only labelled while it still holds the numeric 0/1 flag.
if not pd.api.types.is_numeric_dtype(fraudData["type"]):
    fraudData["type"] = fraudData["type"].map(TYPE_CODES)
if pd.api.types.is_numeric_dtype(fraudData["isFraud"]):
    fraudData["isFraud"] = fraudData["isFraud"].map(FRAUD_LABELS)

print(fraudData.head())

   step  type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1     2   9839.64  C1231006815       170136.0       160296.36   
1     1     2   1864.28  C1666544295        21249.0        19384.72   
2     1     4    181.00  C1305486145          181.0            0.00   
3     1     1    181.00   C840083671          181.0            0.00   
4     1     2  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest   isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0  No Fraud               0  
1  M2044282225             0.0             0.0  No Fraud               0  
2   C553264065             0.0             0.0     Fraud               0  
3    C38997010         21182.0             0.0     Fraud               0  
4  M1230701703             0.0             0.0  No Fraud               0  


#### x - independent features (excluding the target variable), y - dependent variable (target variable)

In [29]:
from sklearn.model_selection import train_test_split
# Columns fed to the model, and the single column it should predict.
featureColumns = ["type", "amount", "oldbalanceOrg", "newbalanceOrig"]
targetColumns = ["isFraud"]

# x is the feature matrix; y keeps a 2-D (rows, 1) shape because the target is
# selected with a one-element column list.
x = np.array(fraudData[featureColumns])
y = np.array(fraudData[targetColumns])

#### Decision Tree Classifier

In [30]:
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split stable.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.10, random_state=42)

# Seed the tree as well: without random_state the fitted tree (and therefore the
# reported score) can differ between runs of the same notebook.
model = DecisionTreeClassifier(random_state=42)
model.fit(xtrain, ytrain)

# Overall accuracy on the held-out rows.
print(model.score(xtest, ytest))

# NOTE(review): fraud is presumably a small minority of the rows (confirm with
# fraudData.isFraud.value_counts()). If so, accuracy alone is dominated by the
# "No Fraud" class, so also report per-class precision and recall.
# np.ravel flattens the (rows, 1) target into the 1-D shape the report expects.
print(classification_report(np.ravel(ytest), model.predict(xtest)))

0.9997406728674666


#### Predicting whether Fraud or not

In [31]:
# One hand-built transaction, in the same column order the model was trained on:
# [type, amount, oldbalanceOrg, newbalanceOrig]
typeCode, amount, oldBalance, newBalance = 4, 9000.60, 9000.60, 0.0
features = np.array([[typeCode, amount, oldBalance, newBalance]])

# predict returns one label per input row.
prediction = model.predict(features)
print(prediction)

['Fraud']
