In [1]:
#using conda env mlops_2
import mlflow
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_wine
from sklearn.metrics import accuracy_score
import tensorflow_hub as hub
from sklearn.model_selection import train_test_split
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

#also needed to conda install fsspec
# conda install -c conda-forge huggingface_hub



2025-03-21 16:34:15.752391: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## preprocessing

In [2]:
# Text dataset, read directly from the Hugging Face Hub via the hf:// scheme
# (the install notes in the imports cell list fsspec and huggingface_hub for this).
fraud = pd.read_csv(
    filepath_or_buffer="hf://datasets/amitkedia/Financial-Fraud-Dataset/Final_Dataset.csv"
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:

# Convert the "Fraud" column to binary labels.
# The mapping also sends 1 -> 1 and 0 -> 0, so re-running this cell on an
# already-converted column is a no-op instead of turning every label into NaN.
label_map = {"yes": 1, "no": 0, 1: 1, 0: 0}
fraud_labels = fraud["Fraud"].map(label_map)

# Any value outside the mapping becomes NaN; stop here rather than letting
# NaN labels reach the models further down.
if fraud_labels.isna().any():
    unexpected = fraud.loc[fraud_labels.isna(), "Fraud"].unique().tolist()
    raise ValueError(f'Unexpected values in "Fraud" column: {unexpected}')

fraud["Fraud"] = fraud_labels.astype(int)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    fraud["Fillings"], fraud["Fraud"], test_size=0.2, random_state=42
)

In [5]:
# Preview the first rows of the fraud DataFrame
fraud.head()

Unnamed: 0,Fillings,Fraud
0,nanitem 14 exhibits financial statements repor...,1
1,item 14 principal accounting fees services mat...,0
2,item 14 exhibits financial statements schedule...,1
3,item 14 exhibits financial statement schedules...,1
4,item 14 exhibits financial statement schedules...,0


In [7]:
# #might be faster than above embedding method:

# # Load the Universal Sentence Encoder model
# use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# # Generate embeddings for the "Fillings" column
# embeddings = use_model(fraud["Fillings"].tolist()).numpy()

# # Add embeddings to the DataFrame
# fraud["Embeddings"] = list(embeddings)

# # Check the result
# print(fraud["Embeddings"].head())

: 

In [None]:
# Batched variant of the commented-out embedding cell above (might be faster).

# Fetch the Universal Sentence Encoder from TensorFlow Hub and keep it under
# the module-level name `use_model`.
use_model = hub.load(
    "https://tfhub.dev/google/universal-sentence-encoder/4"
)

# Function to generate embeddings in batches
def generate_embeddings_in_batches(data, batch_size=32, model=None):
    """Embed a sequence of texts batch by batch and stack the results.

    Args:
        data: Sliceable sequence of texts (for example a list of strings).
        batch_size: Number of texts passed to the encoder per call; must be
            a positive integer.
        model: Optional callable that takes a batch of texts and returns an
            object exposing ``.numpy()``. When omitted, the notebook-level
            ``use_model`` is used.

    Returns:
        A 2-D NumPy array with one row per input text, in input order. For
        empty ``data`` an array of shape ``(0, 0)`` is returned, because the
        embedding width is unknown without calling the encoder.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        # range() rejects a zero step and yields nothing for a negative one;
        # fail with a message that names the actual problem.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    if len(data) == 0:
        # np.vstack raises on an empty list of arrays, so short-circuit.
        return np.empty((0, 0), dtype=np.float32)

    # Resolve the global lazily so callers that pass their own encoder do not
    # need `use_model` to exist.
    encoder = use_model if model is None else model

    embeddings = [
        encoder(data[start:start + batch_size]).numpy()
        for start in range(0, len(data), batch_size)
    ]
    return np.vstack(embeddings)

# Embed every text in the "Fillings" column; the helper feeds the encoder
# one batch at a time.
embeddings = generate_embeddings_in_batches(
    fraud["Fillings"].to_list()
)

# Store one embedding row per DataFrame row
fraud["Embeddings"] = [row_vector for row_vector in embeddings]

# Quick look at the new column
print(fraud["Embeddings"].head())

In [None]:


# Define the file name for the embeddings
embeddings_file = 'embeddings.csv'

# Use the CSV as a cache: load it when present, otherwise create it from the
# `embeddings` array computed earlier in this notebook. No cell shown here
# writes this file, so without the save step the load could only succeed if
# the file had been created by hand.
try:
    embeddings_df = pd.read_csv(embeddings_file)
    print("Loaded embeddings from CSV.")
except FileNotFoundError:
    if "embeddings" not in globals():
        # Nothing to fall back on: the generation cell has not been run.
        print(f"{embeddings_file} not found. Please generate embeddings first.")
        raise
    embeddings_df = pd.DataFrame(embeddings)
    # index=False keeps the row index out of the file so that reloading it
    # yields only the embedding columns.
    embeddings_df.to_csv(embeddings_file, index=False)
    print(f"{embeddings_file} not found. Saved the generated embeddings to it.")

# A cache built from a different version of the dataset would not line up
# with `fraud`; report that explicitly instead of relying on pandas' generic
# length-mismatch error.
if len(embeddings_df) != len(fraud):
    raise ValueError(
        f"{embeddings_file} has {len(embeddings_df)} rows but the fraud "
        f"DataFrame has {len(fraud)}; delete the file and regenerate it."
    )

# Attach the embeddings to the fraud DataFrame, one list of floats per row
fraud["Embeddings"] = embeddings_df.values.tolist()

# Check the result
print(fraud["Embeddings"].head())

In [None]:


# Tokenize the filings and convert them to a bag-of-words representation.
# This cell has to run: the model-selection and tuning cells below read
# X_train_bow / X_test_bow, which are defined nowhere else.
vectorizer = CountVectorizer(stop_words='english')

# Fit the vectorizer only on the training data (no leakage here)
X_train_bow = vectorizer.fit_transform(X_train)

# Now transform the test data using the already fitted vectorizer (no leakage here either)
X_test_bow = vectorizer.transform(X_test)

# Inspect only the shapes. The matrices are sparse, and converting them to a
# dense DataFrame with .toarray() can exhaust memory for a large vocabulary.
print("Training Data BOW shape:", X_train_bow.shape)
print("Test Data BOW shape:", X_test_bow.shape)

# # Dense preview, kept for reference (memory hungry, enable only on small data):
# X_train_bow_df = pd.DataFrame(X_train_bow.toarray(), columns=vectorizer.get_feature_names_out())
# print("Training Data BOW Representation:")
# print(X_train_bow_df)

# X_test_bow_df = pd.DataFrame(X_test_bow.toarray(), columns=vectorizer.get_feature_names_out())
# print("Test Data BOW Representation:")
# print(X_test_bow_df)


## model selection

In [None]:


mlflow.set_experiment("fraud_detection")
# Define models to train
models = {
    "Logistic Regression": LogisticRegression(),
    # Fixed seed so the forest (and its accuracy) is reproducible across runs
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier()
}

# One MLflow run per model. Sharing a single run would log three values under
# the same "accuracy" key, making the models impossible to compare in the UI.
# The context manager also closes the run if fitting raises, so a failure
# does not leave an active run behind for the next cell.
for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        # Train the model
        model.fit(X_train_bow, y_train)

        # Evaluate the model on the held-out test split
        accuracy = model.score(X_test_bow, y_test)

        # Log model and metrics
        mlflow.log_param("model_name", model_name)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.sklearn.log_model(model, model_name)

    print(f"{model_name}: Accuracy = {accuracy:.4f}")

## tuning

In [None]:
# Hyperparameter tuning for XGBoost with Bayesian optimization using hyperopt

# Define the objective function
def objective(params):
    """Score one hyperparameter sample for hyperopt.

    The sample is scored with 3-fold cross-validated accuracy on the training
    data only. Scoring on the test split would pick hyperparameters that fit
    the test set, so the test accuracy reported afterwards would be
    optimistically biased.

    Args:
        params: Mapping with the keys ``max_depth``, ``min_child_weight``,
            ``gamma``, ``subsample``, ``colsample_bytree`` and
            ``learning_rate``, as sampled from the search space.

    Returns:
        A dict with ``loss`` (the negated mean CV accuracy, because hyperopt
        minimizes) and ``status`` set to ``STATUS_OK``.
    """
    # Local import: cross_val_score is not part of the notebook's import cell.
    from sklearn.model_selection import cross_val_score

    # Create the model with the suggested hyperparameters
    model = xgb.XGBClassifier(
        max_depth=int(params['max_depth']),
        min_child_weight=int(params['min_child_weight']),
        gamma=params['gamma'],
        subsample=params['subsample'],
        colsample_bytree=params['colsample_bytree'],
        learning_rate=params['learning_rate'],
        use_label_encoder=False,
        eval_metric='logloss'
    )

    # Fit and evaluate on training folds only; the test split stays untouched
    cv_scores = cross_val_score(model, X_train_bow, y_train, cv=3, scoring='accuracy')
    accuracy = cv_scores.mean()

    # Return the accuracy as the objective to minimize (negative because we want to maximize accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK}

# Search space sampled by hyperopt. hp.randint(label, low, high) excludes
# `high`, so the integer ranges below stop one short of the upper bound.
space = dict(
    max_depth=hp.randint('max_depth', 3, 10),                # integers 3..9
    min_child_weight=hp.randint('min_child_weight', 1, 6),   # integers 1..5
    gamma=hp.uniform('gamma', 0, 0.5),                       # floats in [0, 0.5]
    subsample=hp.uniform('subsample', 0.5, 1.0),             # floats in [0.5, 1.0]
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1.0),  # floats in [0.5, 1.0]
    learning_rate=hp.uniform('learning_rate', 0.01, 0.2),    # floats in [0.01, 0.2]
)

# Trials records every evaluation so the search history can be inspected later
trials = Trials()

# Run 50 evaluations of the objective with the TPE algorithm
best = fmin(
    objective,
    space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
)

print("Best Hyperparameters:", best)

In [None]:
# i like xgboost, tuning it to limit overfitting

import mlflow
import mlflow.sklearn
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Set experiment
mlflow.set_experiment("fraud_detection_xgboost")

# For XGBoost tuning - with stronger focus on regularization
xgb_param_grid = {
    'max_depth': [2, 3, 4],  # Reduced depth to prevent overfitting
    'learning_rate': [0.01, 0.05],  # Lower learning rates
    'n_estimators': [50, 75],  # Fewer estimators
    'subsample': [0.5, 0.7],  # More aggressive subsampling
    'colsample_bytree': [0.5, 0.7],  # More aggressive column sampling
    'reg_alpha': [0.5, 1, 5],  # L1 regularization
    'reg_lambda': [1, 5, 10],  # L2 regularization
    'min_child_weight': [3, 5]  # Require more observations per node
}

# Use GridSearchCV for hyperparameter tuning.
# NOTE: the grid has 864 combinations, i.e. 4320 fits with cv=5.
grid_search = GridSearchCV(
    estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    param_grid=xgb_param_grid,
    cv=5,
    scoring='f1',
    verbose=1
)

# Track everything inside a context manager so the run is closed even when
# the (long) grid search raises; a dangling active run would make the next
# mlflow.start_run() call fail.
with mlflow.start_run(run_name="XGBoost_Anti_Overfit"):
    # Fit the grid search
    print("Starting XGBoost anti-overfitting tuning...")
    grid_search.fit(X_train_bow, y_train)

    # Get best model
    best_xgb = grid_search.best_estimator_

    # Log best parameters
    mlflow.log_params(grid_search.best_params_)

    # Hard predictions for the threshold-based metrics
    y_pred = best_xgb.predict(X_test_bow)
    # Positive-class probabilities for ROC AUC: the metric ranks samples by
    # score, and feeding it 0/1 predictions collapses the curve to one point.
    y_proba = best_xgb.predict_proba(X_test_bow)[:, 1]

    # Calculate metrics
    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, y_proba)
    }

    # Log metrics
    mlflow.log_metrics(metrics)

    # Log model
    mlflow.sklearn.log_model(best_xgb, "XGBoost_Anti_Overfit")

# Print results
print("XGBoost Anti-Overfitting Results:")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Accuracy: {metrics['accuracy']:.4f}")
print(f"Precision: {metrics['precision']:.4f}")
print(f"Recall: {metrics['recall']:.4f}")
print(f"F1 Score: {metrics['f1']:.4f}")
print(f"ROC AUC: {metrics['roc_auc']:.4f}")

# Feature importance, most important first (argsort is ascending, so reverse
# before taking the first ten)
feature_importance = best_xgb.feature_importances_
top_features_idx = feature_importance.argsort()[::-1][:10]  # Top 10 features
top_features_importance = feature_importance[top_features_idx]

print("\nTop 10 Important Features:")
for idx, importance in zip(top_features_idx, top_features_importance):
    print(f"Feature {idx}: {importance}")

## Fine-tuning BERT for fraud detection

In [None]:
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
)
import torch

# Tokenizer and two-label sequence classifier, both from the same
# pretrained checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

# Encode the train and test texts with identical settings (truncation and
# padding enabled), train split first
train_encodings, test_encodings = (
    tokenizer(texts.tolist(), truncation=True, padding=True)
    for texts in (X_train, X_test)
)

# Dataset wrapper around the tokenizer output and the labels
class FraudDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs encoded inputs with their labels.

    ``encodings`` is a mapping from field name to a per-sample indexable
    collection; ``labels`` is an indexable collection with one entry per
    sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of labels defines the dataset size
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one sample: every encoding field at position idx as a tensor,
        # plus the matching label under the 'labels' key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Create datasets
train_dataset = FraudDataset(train_encodings, y_train.tolist())
test_dataset = FraudDataset(test_encodings, y_test.tolist())

# Set training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Give BERT its own experiment; otherwise the model is filed under whichever
# experiment an earlier cell selected last.
mlflow.set_experiment("fraud_detection_bert")

# Train, evaluate and log inside one explicit run. Calling log_model with no
# active run makes MLflow open an implicit run that is never closed, which
# then breaks the next mlflow.start_run() in the session.
with mlflow.start_run(run_name="bert-fraud-detection"):
    # Train the model
    trainer.train()

    # eval_dataset was supplied above but never used; evaluate so the run
    # carries held-out metrics alongside the model.
    eval_metrics = trainer.evaluate()
    mlflow.log_metrics(
        {
            name: float(value)
            for name, value in eval_metrics.items()
            if isinstance(value, (int, float))
        }
    )
    print(f"Evaluation metrics: {eval_metrics}")

    # Log the Hugging Face model
    mlflow.pytorch.log_model(model, "bert-fraud-detection")