# Import necessary packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import shap
import lime
import lime.lime_tabular

  from .autonotebook import tqdm as notebook_tqdm


# Load Data

In [2]:
# Read the two CSV inputs; both paths are relative to the notebook's
# working directory. The generator keeps the loop variable out of the
# notebook namespace.
processed_fraud_data, creditcard_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/processed_fraud_data.csv',
        '../data/raw/creditcard.csv',
    )
)


In [3]:
# For each event timestamp column ('signup_time', 'purchase_time'):
#   1. parse the stored strings into pandas datetimes (in place), then
#   2. derive '<event>_hour' (hour of day) and '<event>_day' (day of week)
#      columns from the parsed values.
for _event in ('signup', 'purchase'):
    _time_col = f'{_event}_time'
    processed_fraud_data[_time_col] = pd.to_datetime(processed_fraud_data[_time_col])

    _parsed = processed_fraud_data[_time_col].dt
    processed_fraud_data[f'{_event}_hour'] = _parsed.hour
    processed_fraud_data[f'{_event}_day'] = _parsed.dayofweek

In [4]:
# Separate the fraud dataset into features (everything except 'class')
# and the target label ('class').
fraud_X = processed_fraud_data.drop(columns=['class'])
fraud_y = processed_fraud_data['class']

# Hold out 30% of the rows for testing. The split is seeded for
# reproducibility and stratified on the label so that the train and test
# sets keep the same class proportions as the full dataset -- this matters
# when one class is rare (presumably the case for fraud labels; verify with
# fraud_y.value_counts()).
fraud_X_train, fraud_X_test, fraud_y_train, fraud_y_test = train_test_split(
    fraud_X,
    fraud_y,
    test_size=0.3,
    random_state=42,
    stratify=fraud_y,
)

In [5]:
# Column-wise preprocessing for the fraud dataset.
# - Numeric columns are standardised with StandardScaler.
# - Categorical columns are one-hot encoded. The hour / day-of-week columns
#   derived from the timestamps are integers but are deliberately treated
#   as categories here.
# Columns not listed in either group are dropped (ColumnTransformer's
# default remainder='drop').
numeric_features = ['purchase_value', 'age']
categorical_features = ['source', 'browser', 'sex', 'signup_hour', 'signup_day', 'purchase_hour', 'purchase_day']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        # handle_unknown='ignore': a category value that never appeared in
        # the training split is encoded as all zeros at transform time
        # instead of raising, so predicting on held-out data cannot fail
        # just because a rare category landed only in the test split.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

In [None]:
# Build the fraud-data pipeline: column preprocessing followed by a
# Random Forest classifier (the model chosen as best for this task), and
# fit it on the training split.
# random_state is fixed (same seed as the train/test split) so that the
# fitted forest -- and any explanation computed from it later -- is
# reproducible across kernel restarts.
rf_pipeline_fraud = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])
rf_pipeline_fraud.fit(fraud_X_train, fraud_y_train)