In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [3]:
df = pd.read_csv('/content/drive/MyDrive/Fraud.csv')

In [4]:
df.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

#### No null values in the dataset

In [5]:
df['nameOrigtype'] = [x[0] for x in df.nameOrig]

In [6]:
df['nameDesttype'] = [x[0] for x in df.nameDest]

In [7]:
df = df.drop(columns=['nameOrig', 'nameDest'])

In [8]:
df1 = pd.get_dummies(df, drop_first=True)

In [9]:
X = df1.drop(columns='isFraud')
vif = pd.DataFrame()
vif["Feature"] = X.columns
vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

print("VIF:")
print(vif)

  vif = 1. / (1. - r_squared_i)


VIF:
           Feature         VIF
0             step    3.008689
1           amount    4.215690
2    oldbalanceOrg  502.777063
3   newbalanceOrig  508.201916
4   oldbalanceDest   72.398423
5   newbalanceDest   84.055073
6   isFlaggedFraud    1.000571
7    type_CASH_OUT    1.870071
8       type_DEBIT    1.016118
9     type_PAYMENT         inf
10   type_TRANSFER    1.406326
11  nameDesttype_M         inf


#### oldbalanceOrg, oldbalanceDest, newbalanceDest and newbalanceOrig have relatively high VIF values, suggesting that they might be correlated with each other which they are as change in one changes the other.So, removing them would be useless as balance of both sides can be a major factor in checking fraud theoretically.
#### type_PAYMENT, type_TRANSFER, and nameDesttype_M have extremely high VIF values (inf). This suggests that these variables are highly correlated with other variables in the dataset and may indicate multicollinearity issues but we will be also check correlation because these are categorical variables which have been converted so their high vif may have other reasons.

In [10]:
df1.corr().isFraud

step              0.031578
amount            0.076688
oldbalanceOrg     0.010154
newbalanceOrig   -0.008148
oldbalanceDest   -0.005885
newbalanceDest    0.000535
isFraud           1.000000
isFlaggedFraud    0.044109
type_CASH_OUT     0.011256
type_DEBIT       -0.002911
type_PAYMENT     -0.025697
type_TRANSFER     0.053869
nameDesttype_M   -0.025697
Name: isFraud, dtype: float64

#### type_TRANSFER has a moderate positive correlation with isFraud, indicating that transactions of type "TRANSFER" are more likely to be fraudulent.
#### amount has a moderate positive correlation with isFraud, suggesting that larger transactions are slightly more likely to be fraudulent.
#### type_PAYMENT and nameDesttype_M have weak negative correlations with isFraud, indicating that transactions of type "PAYMENT" and destination type "M" are slightly less likely to be fraudulent.

In [11]:
df1['DestBalance_Change'] = df1['newbalanceDest']-df1['oldbalanceDest']
df1.drop(columns=['oldbalanceOrg','oldbalanceDest','newbalanceDest'],inplace=True)

In [12]:
sc = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True)
data =['amount','DestBalance_Change','newbalanceOrig']
df1[data] = sc.fit_transform(df1[data])
df1

Unnamed: 0,step,amount,newbalanceOrig,isFraud,isFlaggedFraud,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,nameDesttype_M,DestBalance_Change
0,1,-0.332932,1.111175,0,0,0,0,1,0,1,0.000000
1,1,-0.373762,0.134375,0,0,0,0,1,0,1,0.000000
2,1,-0.382380,0.000000,1,0,0,0,0,1,0,0.000000
3,1,-0.382380,0.000000,1,0,1,0,0,0,0,-0.142061
4,1,-0.323571,0.207169,0,0,0,0,1,0,1,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,1.355693,0.000000,1,0,1,0,0,0,0,2.278134
6362616,743,31.927899,0.000000,1,0,0,0,0,1,0,0.000000
6362617,743,31.927899,0.000000,1,0,1,0,0,0,0,42.328502
6362618,743,3.968274,0.000000,1,0,0,0,0,1,0,0.000000


## Using Robust scaler serves us multiple purposes by standardising our data and also makes the data less sensitive to outliers so we dont have to use the z score technique

In [13]:
df1['Fraud'] = df1['isFraud']
df1.drop(columns=['isFraud'], inplace=True)

In [14]:
X = df1.drop(columns='Fraud')
vif = pd.DataFrame()
vif["Feature"] = X.columns
vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

print("VIF:")
print(vif)

  vif = 1. / (1. - r_squared_i)


VIF:
              Feature       VIF
0                step  2.779888
1              amount  3.982403
2      newbalanceOrig  1.155875
3      isFlaggedFraud  1.000566
4       type_CASH_OUT  1.750768
5          type_DEBIT  1.015123
6        type_PAYMENT       inf
7       type_TRANSFER  1.349593
8      nameDesttype_M       inf
9  DestBalance_Change  3.843211


#### The new column and removal of unuseful columns has drastically reduced the multicollinearity of the variables.

In [15]:
x = df1.drop(columns='Fraud')
y = df1.Fraud
x_train,x_test,y_train,y_test = train_test_split(x,y,train_size=0.8,random_state=42)

In [16]:
model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    max_depth=None,
    eta=0.3,
    n_estimators=1000,
)

model.fit(
    x_train, y_train,
    eval_set=[(x_test, y_test)],
    early_stopping_rounds=10,
    verbose=True
)



[0]	validation_0-auc:0.92090
[1]	validation_0-auc:0.92148
[2]	validation_0-auc:0.96670
[3]	validation_0-auc:0.96687
[4]	validation_0-auc:0.97028
[5]	validation_0-auc:0.97121
[6]	validation_0-auc:0.97149
[7]	validation_0-auc:0.97461
[8]	validation_0-auc:0.97468
[9]	validation_0-auc:0.97468
[10]	validation_0-auc:0.97472
[11]	validation_0-auc:0.97579
[12]	validation_0-auc:0.97523
[13]	validation_0-auc:0.97547
[14]	validation_0-auc:0.97558
[15]	validation_0-auc:0.97713
[16]	validation_0-auc:0.97720
[17]	validation_0-auc:0.97937
[18]	validation_0-auc:0.97943
[19]	validation_0-auc:0.97951
[20]	validation_0-auc:0.97953
[21]	validation_0-auc:0.97970
[22]	validation_0-auc:0.98027
[23]	validation_0-auc:0.98045
[24]	validation_0-auc:0.98083
[25]	validation_0-auc:0.98155
[26]	validation_0-auc:0.98165
[27]	validation_0-auc:0.98272
[28]	validation_0-auc:0.98266
[29]	validation_0-auc:0.98271
[30]	validation_0-auc:0.98281
[31]	validation_0-auc:0.98293
[32]	validation_0-auc:0.98293
[33]	validation_0-au

In [17]:
y_preds = model.predict(x_test)

In [18]:
accuracy = accuracy_score(y_test, y_preds)
precision = precision_score(y_test, y_preds)
recall = recall_score(y_test, y_preds)
f1 = f1_score(y_test, y_preds)

metrics_df = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score'],
    'Score': [accuracy, precision, recall, f1]
})

print(metrics_df)

      Metric     Score
0   Accuracy  0.999618
1  Precision  0.974874
2     Recall  0.718519
3   F1 Score  0.827292


# EXPLANATIONS

## Data cleaning including missing values, outliers, and multicollinearity:

   ##### Missing values: No missing values have been found in the dataset. 
   ##### Outliers: The RobustScaler has been used to make the data less sensitive to outliers. Columns that might have multicollinearity have been identified but manipulated based on their theoretical importance.

## Fraud detection model in elaboration:

   ##### The fraud detection model used is an XGBoost classifier, a powerful ensemble machine learning algorithm. It has been trained as a binary classifier to distinguish between fraudulent (1) and non-fraudulent (0) transactions. The evaluation metric used is AUC (Area Under the ROC Curve) to assess its ability to distinguish between positive and negative cases. Early stopping has been implemented during training to prevent overfitting.

## How did you select variables to be included in the model?:

   1. Variables were selected based on their relevance to the fraud detection task. Transaction-related features like 'step' and 'amount' were included on the basis of correlation.
   2. The 'type' of transaction was included as it can be a significant indicator.
   3. Other engineered features like 'DestBalance_Change' were also included to capture potential patterns in the data.
   4. Columns like 'nameOrig' and 'nameDest' were dropped as they didn't directly contribute to the classification task. Instead their types were taken and it was found that if nameDest was 'M' type then the transaction was less likely to be a fraudulent one.

## Demonstrate the performance of the model by using the best set of tools:

   ##### The model's performance has been assessed using various metrics, including accuracy, precision, recall, and F1 score.Based on these metrics, the model appears to have high accuracy, precision, and a reasonable F1 score, which suggests its effectiveness in identifying fraudulent transactions.

## What are the key factors that predict fraudulent customer?:

   #### Based on the model and feature analysis:
   #### Transactions of type 'TRANSFER' and 'PAYMENT' are less likely to be fraudulent. Larger transaction amounts are more likely to be associated with fraudulent activity. The change in destination balance ('DestBalance_Change') is also an important feature in predicting fraud.

## Do these factors make sense? If yes, How? If not, How not?:

   #### Yes:
   1. Fraudulent transactions often involve larger amounts to maximize the gain.
   2. Transaction types like 'TRANSFER' and 'PAYMENT' might be less susceptible to fraud due to their nature.
   3. The change in destination balance can be indicative of unusual transactions, potentially linked to fraudulent activity.

## What kind of prevention should be adopted while the company updates its infrastructure?:

   #### Prevention measures should include:
1. Continuous monitoring and analysis of transaction data for anomalies.
2. Implementing transaction limits and flagging suspicious transactions for manual review.
3. Enhancing authentication and verification processes for high-risk transactions.
4. Regularly updating and improving fraud detection models to adapt to new fraud patterns.
5. Educating customers and employees about fraud risks and prevention.

## Assuming these actions have been implemented, how would you determine if they work?:

   #### The effectiveness of prevention measures can be assessed by monitoring key performance metrics:
1. Reduction in the number of confirmed fraudulent transactions.
2. Reduction in false positives.
3. Monitoring the model's performance metrics over time.




