In [26]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier

In [27]:
df = pd.read_csv(r"C:\Users\hi\Desktop\Data Analytics\Data Sets\new\credit_card_fraud_10k.csv")
df.head()

Unnamed: 0,transaction_id,amount,transaction_hour,merchant_category,foreign_transaction,location_mismatch,device_trust_score,velocity_last_24h,cardholder_age,is_fraud
0,1,84.47,22,Electronics,0,0,66,3,40,0
1,2,541.82,3,Travel,1,0,87,1,64,0
2,3,237.01,17,Grocery,0,0,49,1,61,0
3,4,164.33,4,Grocery,0,1,72,3,34,0
4,5,30.53,15,Food,0,0,79,0,44,0


In [28]:
# label encoding merchant category
le = LabelEncoder()
df['merchant_category'] = le.fit_transform(df.merchant_category).astype('int64')

In [29]:
df.columns

Index(['transaction_id', 'amount', 'transaction_hour', 'merchant_category',
       'foreign_transaction', 'location_mismatch', 'device_trust_score',
       'velocity_last_24h', 'cardholder_age', 'is_fraud'],
      dtype='object')

In [30]:
#scalling numeric columns
num_col = ['transaction_id', 'amount', 'transaction_hour', 'merchant_category',
       'foreign_transaction', 'location_mismatch', 'device_trust_score',
       'velocity_last_24h', 'cardholder_age']

preprocessing = ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),num_col)
    ]
)

In [31]:
# defining base model
base_model = [
    ('lr', LogisticRegression(max_iter=1000)),
    ('rf', RandomForestClassifier(random_state=42,max_depth=10,n_estimators=200)),
    ('xgb', XGBClassifier(random_state=42,eval_metric='logloss'))
]

In [32]:
# creating a meta model
meta_model = LogisticRegression(max_iter=1000)

In [33]:
# creating a stacking classifier
stacking_model = StackingClassifier(estimators=base_model, final_estimator=meta_model, cv=5, n_jobs=-1)

In [34]:
# spliting data
from sklearn.model_selection import train_test_split
X = df.drop(columns=['is_fraud'])
y = df['is_fraud']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

In [35]:
#complete pipeline

stacking_pipeline = Pipeline(
    steps=[
        ('preprocessing', preprocessing),
        ('smote',SMOTE(random_state=42)),
        ('classifier',stacking_model)
    ]
)

In [36]:
# training model
stacking_pipeline.fit(X_train, y_train)

In [37]:
#model prediction
y_pred = stacking_pipeline.predict(X_test)

In [38]:
#model evaluation
from sklearn.metrics import classification_report, roc_auc_score

print(classification_report(y_test, y_pred))
print('ROC-AUC:', roc_auc_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1977
           1       0.88      0.91      0.89        23

    accuracy                           1.00      2000
   macro avg       0.94      0.96      0.95      2000
weighted avg       1.00      1.00      1.00      2000

ROC-AUC: 0.9557630137890084
