In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning category for the remainder of the session.
# NOTE(review): this is a blanket filter, so pandas/scikit-learn diagnostics raised by
# later cells are hidden as well; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Location of the transactions CSV (absolute path, so it only resolves where this
# exact directory layout exists).
DATA_PATH = '/content/drive/MyDrive/Octave - John Keells Codes/Use Cases Test/Q4_Data.csv'

# Read the raw transactions into the working frame used by every later cell.
df = pd.read_csv(DATA_PATH)

In [3]:
# Peek at the first five rows to check the columns were parsed as expected.
df.head(n=5)

Unnamed: 0,Transaction_ID,Transaction_Amount,Transaction_Time,Location,Merchant_Category,Cardholder_Age,Previous_Fraud_Flag,Is_Fraud
0,TX001,1050,2023-03-01 08:15:23,Colombo,Groceries,34,0,0
1,TX002,1187,2023-03-01 09:02:11,Kandy,Electronics,42,0,0
2,TX003,1572,2023-03-01 10:45:05,Galle,Clothing,29,0,0
3,TX004,2435,2023-03-01 11:30:47,Jaffna,Fuel,55,0,0
4,TX005,3150,2023-03-01 12:20:33,Matara,Restaurant,38,0,0


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Transaction_Amount,Cardholder_Age,Previous_Fraud_Flag,Is_Fraud
count,200.0,200.0,200.0,200.0
mean,1997.895,41.305,0.035,0.025
std,985.939128,9.617926,0.184241,0.156517
min,980.0,22.0,0.0,0.0
25%,1208.75,34.0,0.0,0.0
50%,1576.0,41.0,0.0,0.0
75%,2620.0,48.0,0.0,0.0
max,4260.0,63.0,1.0,1.0


In [5]:
# Class balance of the target: number of rows per Is_Fraud value.
df.loc[:, 'Is_Fraud'].value_counts()

Unnamed: 0_level_0,count
Is_Fraud,Unnamed: 1_level_1
0,195
1,5


In [6]:
# Same numeric summary as cell [4]; nothing has modified df in between, so the
# table is expected to be unchanged.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Transaction_Amount,Cardholder_Age,Previous_Fraud_Flag,Is_Fraud
count,200.0,200.0,200.0,200.0
mean,1997.895,41.305,0.035,0.025
std,985.939128,9.617926,0.184241,0.156517
min,980.0,22.0,0.0,0.0
25%,1208.75,34.0,0.0,0.0
50%,1576.0,41.0,0.0,0.0
75%,2620.0,48.0,0.0,0.0
max,4260.0,63.0,1.0,1.0


In [7]:
# Missing-value count per column (isna is the same check as isnull).
df.isna().sum()

Unnamed: 0,0
Transaction_ID,0
Transaction_Amount,0
Transaction_Time,0
Location,0
Merchant_Category,0
Cardholder_Age,0
Previous_Fraud_Flag,0
Is_Fraud,0


In [8]:
# Number of rows that are exact repeats of an earlier row (first occurrence not counted).
df.duplicated(keep='first').sum()

0

In [13]:
# Column dtypes.
# NOTE(review): the output recorded below already lists Transaction_Time as datetime64[ns]
# although the pd.to_datetime conversion sits in the following cell, and the execution
# counter jumps from 8 to 13 -- presumably the conversion was run earlier in that session.
# On a fresh Restart & Run All this cell would likely report `object` instead; confirm.
df.dtypes

Unnamed: 0,0
Transaction_ID,object
Transaction_Amount,int64
Transaction_Time,datetime64[ns]
Location,object
Merchant_Category,object
Cardholder_Age,int64
Previous_Fraud_Flag,int64
Is_Fraud,int64


In [14]:
# Parse the timestamp column once, then derive calendar features from the parsed values.
timestamps = pd.to_datetime(df['Transaction_Time'])
df['Transaction_Time'] = timestamps
df['Hour'] = timestamps.dt.hour        # hour of day, 0-23
df['Day'] = timestamps.dt.dayofweek    # weekday index (Monday=0 ... Sunday=6), not day of month

In [15]:
# Confirm the new Hour and Day columns were appended.
df.head(n=5)

Unnamed: 0,Transaction_ID,Transaction_Amount,Transaction_Time,Location,Merchant_Category,Cardholder_Age,Previous_Fraud_Flag,Is_Fraud,Hour,Day
0,TX001,1050,2023-03-01 08:15:23,Colombo,Groceries,34,0,0,8,2
1,TX002,1187,2023-03-01 09:02:11,Kandy,Electronics,42,0,0,9,2
2,TX003,1572,2023-03-01 10:45:05,Galle,Clothing,29,0,0,10,2
3,TX004,2435,2023-03-01 11:30:47,Jaffna,Fuel,55,0,0,11,2
4,TX005,3150,2023-03-01 12:20:33,Matara,Restaurant,38,0,0,12,2


In [16]:
# Tukey-fence filter on Transaction_Amount: keep rows whose amount lies within
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR], bounds included.
# Note: df is overwritten, so re-running this cell recomputes the quartiles on the
# already-filtered frame.
amount = df['Transaction_Amount']
q1, q3 = np.percentile(amount, [25, 75])
iqr = q3 - q1
lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr

df = df[amount.between(lower_bound, upper_bound)]

In [18]:
# (rows, columns) after the IQR filter. The recorded (200, 10) equals the 200 rows
# reported by describe() earlier, i.e. the filter removed nothing in that run.
df.shape

(200, 10)

In [19]:
# Remove the raw timestamp (its Hour/Day parts were extracted above) and the
# Transaction_ID column so neither reaches the model.
# Reassigning instead of inplace=True avoids mutating a frame that came out of a boolean
# filter, and errors='ignore' keeps the cell idempotent: re-running it after the columns
# are already gone is a no-op rather than a KeyError.
df = df.drop(columns=['Transaction_Time', 'Transaction_ID'], errors='ignore')

In [20]:
# Final look at the modelling frame after the timestamp and ID columns were dropped.
df.head(n=5)

Unnamed: 0,Transaction_Amount,Location,Merchant_Category,Cardholder_Age,Previous_Fraud_Flag,Is_Fraud,Hour,Day
0,1050,Colombo,Groceries,34,0,0,8,2
1,1187,Kandy,Electronics,42,0,0,9,2
2,1572,Galle,Clothing,29,0,0,10,2
3,2435,Jaffna,Fuel,55,0,0,11,2
4,3150,Matara,Restaurant,38,0,0,12,2


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

# ---------------------------------------------------------------------------
# Features / target
# ---------------------------------------------------------------------------
X = df.drop(columns='Is_Fraud')
y = df['Is_Fraud']

# Columns are routed to a transformer by dtype: numbers are standardised,
# object (string) columns are one-hot encoded.
numeric_col = X.select_dtypes(include=['number']).columns
categorical_col = X.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_col),
        # handle_unknown='ignore': a category missing from a training fold is encoded as
        # all zeros at predict time instead of raising.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_col)
    ]
)

# ---------------------------------------------------------------------------
# Hold-out split
# ---------------------------------------------------------------------------
# stratify=y keeps the fraud rate equal in both parts. The target is heavily imbalanced,
# so an unstratified split can leave the test set with no fraud rows at all (ROC AUC is
# then undefined) or leave too few in the training part for SMOTE / stratified CV.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ---------------------------------------------------------------------------
# Pipeline: preprocess -> oversample minority class -> SVC
# ---------------------------------------------------------------------------
# SMOTE sits inside the imblearn pipeline so it is applied to training folds only.
# With k_neighbors=2 every training fold needs at least 3 fraud rows.
# random_state on SVC makes the probability calibration reproducible.
model = ImbPipeline(steps=[('preprocessor', preprocessor),
                        ('smote', SMOTE(sampling_strategy=0.6, random_state=42, k_neighbors=2)),
                        ('classifier', SVC(probability=True, random_state=42))])

# gamma has no effect on the linear kernel, so it is only searched for rbf; this avoids
# refitting identical linear models once per gamma value.
C_VALUES = [1, 10, 100, 1000]
param_grid = [
    {'classifier__kernel': ['linear'], 'classifier__C': C_VALUES},
    {'classifier__kernel': ['rbf'], 'classifier__C': C_VALUES,
     'classifier__gamma': [1, 0.1, 0.001, 0.0001]},
]

# Never ask for more folds than there are fraud rows in the training split; otherwise
# some validation folds contain a single class and their ROC AUC cannot be computed.
minority_train = int(y_train.value_counts().min())
n_splits = max(2, min(5, minority_train))

cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
grid = GridSearchCV(model, param_grid, cv=cv, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train, y_train)

print(f"Best Parameters: {grid.best_params_}")
print(f"Best Score: {grid.best_score_}")

# ---------------------------------------------------------------------------
# Hold-out evaluation
# ---------------------------------------------------------------------------
y_pred = grid.predict(X_test)
# ROC AUC is a ranking metric: feed it the positive-class probability, not the hard 0/1
# labels (which would collapse the curve to a single operating point).
y_score = grid.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(roc_auc_score(y_test, y_score))
