In [1]:
#importing the necessary packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import IsolationForest
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import itertools
from collections import Counter
import sklearn.metrics as metrics
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

Pre-processing

In [2]:
#Reading the dataset and peeking into the structure of the data
fraud=pd.read_csv("D:\download\Fraud.csv")
fraud.shape
fraud.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [None]:
#Getting head of the dataset
fraud.head(200)

In [None]:
#checking if any null values are present 
#null values can be replaced with mean for the row
fraud.isnull().values.any()

In [None]:

Leg=len(fraud[fraud.isFraud==0])
frud=len(fraud[fraud.isFraud==1])
Leg_per=(Leg/(Leg+frud))*100
Fra_per=(frud/(Leg+frud))*100
print("Number of Legit transactions: ", Leg)
print("Number of Fraud transactions: ", frud)
print("Percentage of Legit transactions: {:.4f} %".format(Leg_per))
print("Percentage of Fraud transactions: {:.4f} %".format(Fra_per))

These results demonstrate that the dataset is higly unbalanced with only 0.129% of the transactions being fraudulent therefore we can use decision tree or a metamodel of decision trees to average out the bias towards any class.

In [None]:
cr=fraud.drop(['type','nameOrig','nameDest'],axis=1)
corr=cr.corr()

plt.figure(figsize=(8,6))
sns.heatmap(corr,annot=True)

Based on the co-relation Heatmap we drop can drop the columns that bear low co-relation to the attribute that we have to predict

In [None]:
plt.figure(figsize=(5,10))
labels = ["Legit", "Fraud"]
count_classes = fraud.value_counts(fraud['isFraud'], sort= True)
count_classes.plot(kind = "bar", rot = 0)
plt.title("Visualization of Labels")
plt.ylabel("Count")
plt.xticks(range(2), labels)
plt.show()

Data manupilation

In [None]:
#create a copy to keep the original dataset safe
df= fraud.copy()
df.head()


In [None]:
#attributes that are objects need to be label encoded therefore we check for them
objList = df.select_dtypes(include = "object").columns
print (objList)

In [None]:

Lab= LabelEncoder()

for fea in objList:
    df[fea]=Lab.fit_transform(df[fea].astype(str))
    
print (df.info())


In [None]:
df.describe()

In [None]:
#checking the VIF(variation inflation factor) score to check for collinearity and we will omit the highly correlated attributes
def calc_vif(df):

    
    vif = pd.DataFrame()
    vif["variables"] = df.columns
    vif["VIF"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]

    return(vif)

calc_vif(df)

In [None]:


df['Actual_amount_orig'] = df.apply(lambda x: x['oldbalanceOrg'] - x['newbalanceOrig'],axis=1)
df['Actual_amount_dest'] = df.apply(lambda x: x['oldbalanceDest'] - x['newbalanceDest'],axis=1)
df['TransactionPath'] = df.apply(lambda x: x['nameOrig'] + x['nameDest'],axis=1)




In [None]:
df = df.drop(['oldbalanceOrg','newbalanceOrig','oldbalanceDest','newbalanceDest','step','nameOrig','nameDest'],axis=1)

calc_vif(df)

In [None]:
corr=df.corr()

plt.figure(figsize=(10,6))
sns.heatmap(corr,annot=True)

                                                              1.1
MODEL generation

In [None]:
#scaling 
scaler = StandardScaler()
df["NormalizedAmount"] = scaler.fit_transform(df["amount"].values.reshape(-1, 1))
df.drop(["amount"], inplace= True, axis= 1)

Y = df["isFraud"]
X = df.drop(["isFraud"], axis= 1)

scaling the entire dataset could decrease the accuracy however the transaction amount has a very high range therefore we should scale this down

In [None]:
#split
(X_train, X_test, Y_train, Y_test) = train_test_split(X, Y, test_size= 0.3, random_state= 42)

print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)

In [None]:
#Training decision tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)

Y_pred_dt = decision_tree.predict(X_test)
decision_tree_score = decision_tree.score(X_test, Y_test) * 100

In [None]:
#Training random forest
random_forest = RandomForestClassifier(n_estimators= 100)
random_forest.fit(X_train, Y_train)

Y_pred_rf = random_forest.predict(X_test)
random_forest_score = random_forest.score(X_test, Y_test) * 100

Evaluation

In [None]:
#scores
print("Decision Tree Score: ", decision_tree_score)
print("Random Forest Score: ", random_forest_score)

Both the models show a good score however random forest shows a better score compared to decision tree

In [None]:
#Confusion matrix terms-Decision Trees


print("TP,FP,TN,FN - Decision Tree")
tn, fp, fn, tp = confusion_matrix(Y_test, Y_pred_dt).ravel()
print(f'True Positives: {tp}')
print(f'False Positives: {fp}')
print(f'True Negatives: {tn}')
print(f'False Negatives: {fn}')



In [None]:
#Confusion Matrix terms-Random Forest 
print("TP,FP,TN,FN - Random Forest")
tn, fp, fn, tp = confusion_matrix(Y_test, Y_pred_rf).ravel()
print(f'True Positives: {tp}')
print(f'False Positives: {fp}')
print(f'True Negatives: {tn}')
print(f'False Negatives: {fn}')

TP(Decision Tree) ~ TP(Random Forest)- Both models are nearly equal for this
FP(Decision Tree) > FP(Random Forest) - Random Forest is better
TN(Decision Tree) < TN(Random Forest) - Random Forest is better here also
FN(Decision Tree) ~ FN(Random Forest)


Therefore Random forest performs better here also

In [None]:
#Confusion Matrix -DT
confusion_matrix_dt = confusion_matrix(Y_test, Y_pred_dt.round())
print("Confusion Matrix - Decision Tree")
print(confusion_matrix_dt,)

In [None]:
#Confusion Matrix-RF
confusion_matrix_rf = confusion_matrix(Y_test, Y_pred_rf.round())
print("Confusion Matrix - Random Forest")
print(confusion_matrix_rf)

In [None]:
#Report-DT
classification_report_dt = classification_report(Y_test, Y_pred_dt)
print("Classification Report - Decision Tree")
print(classification_report_dt)

In [None]:
#Report-RF
classification_report_rf = classification_report(Y_test, Y_pred_rf)
print("Classification Report - Random Forest")
print(classification_report_rf)

The good precision and F1 score random forest is better.

In [None]:
#Visualise the CM
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix_dt)
disp.plot()
plt.title('Confusion Matrix - DT')
plt.show()



disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix_rf)
disp.plot()
plt.title('Confusion Matrix - RF')
plt.show()

In [None]:
#AOC-ROC-DT


fpr, tpr, threshold = metrics.roc_curve(Y_test, Y_pred_dt)
roc_auc = metrics.auc(fpr, tpr)

plt.title('ROC - DT')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()



In [None]:
#AOC-ROC-RF

fpr, tpr, threshold = metrics.roc_curve(Y_test, Y_pred_rf)
roc_auc = metrics.auc(fpr, tpr)

plt.title('ROC - RF')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()





\\







-The  tehhe- T__-- -
-The Data had no null values we checked fir or that 
Most o. The algorithms we isuused decision treaea ss and rafor our prediction ndom forest are inherently insesnsitive toward outliers. We used both a correlation matrix and the VIF criteria to check for multicollinera-arity and removed the featurcombined these features to create a new feature more useful forour analysis._- The Model wehave used and would prefer to use is random forest a (I ominabination odf decision trees, wthe dataaset we have is higly unbalanced  random forest trains its cdecision trees takinfg e using a subset of the data comprising of equalnumber o f fraudulent and legittimate transactions , . Besiddes this rthey are robust and immune to  tpto noiseoutliers therefore we have used random forest I this mprejooject.- The variables we have selected to be included arethe ones which have a high correlation to the isFraud attribute . besuNBesides this t he attributes having high ccorelation to each other are dropped and repl this the attributes having high correlation with eacha tThe are replaced with a single attribute.

. Demonstrate the performance of the model by using best set of tools.
- The set of tools best demostrating the models performance would be a confusion matrix which is pasted below as we need a high precision scoresas to not let a single fraudulent transaction happen even if a few legitimate transactions are stopped.
  Classification Report - Random Forest
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1906351
           1       0.96      0.70      0.81      2435

    accuracy                           1.00   1908786
   macro avg       0.98      0.85      0.91   1908786
weighted avg       1.00      1.00      1.00  - the The key factors for predicting fraudulent transactions in \based on the heatmap are 'Actual-_origin_amount' and other 'Amount' of transaction .O Other factors that would bmake sense but are nit ot included in the dataset arewould be secured request rportal, Aa Transsaaction history of requestor.- These factors make sense as the target of fraudulent transactions would be typically amouccounts woth ith a similar balcance , Iif the person id s nearly vabankrupt or a multi-millionair e most probably they would not be in the targteet list of the fraudsters. The amount makes sense as the is self explantory as an amount to o high would beeasil fall y fall into notivce and a very small amount would bnot be worth the effort fora fraud.
-Th Based on this data the ccompany should introduce a real time fraud detection system in their pipeline which flagsa any such transactions ansd delays them unti further haltsconfirmation .  Other than this the company could enswarn the user anytime a payment is being made to a vendor that is not secure or has a sketchy apayment history . A pipelin e could be made gild be made f A number of flags could be used base od on previous reported cases from any s and include an option to report such transactions(subsequently flagging the vendor) uch combination of theseunt and otherto secure the transactions.
- The best method to check if they work or not would be to intriooduce thseese methods into a subset of the pipleline unaccessible to the pubklic and run a numbberer of cthese transactions ofn the system m and check if the new pipeline is abele to single out the fraudulent transactions . This would ensure that further finetuning and checks can be made in thhee systmodel before relasing easing tit to the ouupublic.