In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch

from datasets import load_dataset
from transformers import AutoTokenizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc
from huggingface_hub import login, HfApi




In [2]:
# Authenticate this session with the Hugging Face Hub; in a notebook this
# renders an interactive token-entry widget instead of reading a token from code.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
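# NOTE: a Hugging Face access token was pasted here in plain text and has been redacted. Revoke that token, and never store credentials in the notebook: authenticate through login() or the HF_TOKEN environment variable instead.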

In [4]:
# Read the pre-split CSV files and expose them as the 'train' and 'test' splits.
dataset = load_dataset(
    'csv',
    data_files={
        'train': 'processed_data/train.csv',
        'test': 'processed_data/test.csv',
    },
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [5]:
def tokenize_function(example):
    """Tokenize the 'text' field of `example` with the module-level `tokenizer`.

    Inputs are padded to the tokenizer's maximum length and longer inputs are
    truncated. `tokenizer` is looked up at call time, so it only has to exist
    before this function is first invoked (here: by `dataset.map`).

    Args:
        example: Mapping with a 'text' entry (a batch of rows when used with
            `map(..., batched=True)`).

    Returns:
        Whatever the tokenizer returns for that text.
    """
    texts = example['text']
    return tokenizer(texts, padding='max_length', truncation=True)

# Load the BERT tokenizer that `tokenize_function` reads at call time.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Tokenize every split in batches, then have the datasets return torch tensors.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
)
tokenized_datasets.set_format(type='torch')

# Keep a local copy of the tokenizer files; the returned tuple of written
# paths is the cell's displayed output.
tokenizer.save_pretrained('./fake_review_detection_model/bert_tokenizer')


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

('./fake_review_detection_model/bert_tokenizer\\tokenizer_config.json',
 './fake_review_detection_model/bert_tokenizer\\special_tokens_map.json',
 './fake_review_detection_model/bert_tokenizer\\vocab.txt',
 './fake_review_detection_model/bert_tokenizer\\added_tokens.json',
 './fake_review_detection_model/bert_tokenizer\\tokenizer.json')

In [6]:
def compute_metrics(pred):
    """Score a set of predictions for the binary classification task.

    Args:
        pred: Object exposing `predictions` (per-class scores; the argmax over
            the last axis is used as the predicted class) and `label_ids`
            (the reference labels).

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1', where
        precision/recall/f1 use sklearn's `average='binary'` setting.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The fourth element of the result tuple (support) is not reported.
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    accuracy = accuracy_score(y_true, y_pred)

    return {
        'accuracy': accuracy,
        'precision': prf[0],
        'recall': prf[1],
        'f1': prf[2],
    }


In [None]:
# Fine-tune bert-base-uncased for binary fake-review detection, evaluate it on
# the test split, and publish the result to the Hugging Face Hub.
model_name = 'bert-base-uncased'
repo_name = f"fake-review-detector-{model_name}"
hub_model_id = f"jesmine0820/{repo_name}"

# Create the destination repo (no-op if it already exists). Use the full
# `hub_model_id` so this is exactly the repo the Trainer pushes to.
api = HfApi()
api.create_repo(repo_id=hub_model_id, repo_type="model", exist_ok=True)

print(f"\nTraining with: {model_name}")
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

training_args = TrainingArguments(
    # Dedicated directory per model so separate training runs in this notebook
    # do not share (and overwrite) each other's checkpoints and uploads.
    output_dir=f"./training_output/{repo_name}",
    do_eval=True,
    eval_steps=1000,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    push_to_hub=True,
    hub_model_id=hub_model_id,
    hub_strategy="every_save"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics
)

trainer.train()

# `do_eval`/`eval_steps` do not schedule evaluation on their own (the training
# log only ever showed a "Training Loss" column), so run the evaluation
# explicitly; this is what actually invokes `compute_metrics`.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Publish the tokenizer the data was encoded with, so the Hub repo is usable
# on its own.
tokenizer.push_to_hub(hub_model_id)

# The original referenced `trainer.push_to_hub` without calling it, so the
# model was never uploaded. Call it.
trainer.push_to_hub()


Training with: bert-base-uncased


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


In [None]:
# Fine-tune roberta-base for binary fake-review detection, evaluate it on the
# test split, and publish the result to the Hugging Face Hub.
model_name = 'roberta-base'
repo_name = f"fake-review-detector-{model_name}"
hub_model_id = f"jesmine0820/{repo_name}"

# Create the destination repo (no-op if it already exists). Use the full
# `hub_model_id` so this is exactly the repo the Trainer pushes to.
api = HfApi()
api.create_repo(repo_id=hub_model_id, repo_type="model", exist_ok=True)

# `tokenized_datasets` was encoded with the bert-base-uncased tokenizer, whose
# token ids do not correspond to RoBERTa's vocabulary. Encode the raw text
# again with RoBERTa's own tokenizer, under separate names so the BERT
# encoding stays available to the other cells.
roberta_tokenizer = AutoTokenizer.from_pretrained(model_name)
roberta_datasets = dataset.map(
    lambda batch: roberta_tokenizer(batch['text'], padding='max_length', truncation=True),
    batched=True,
)
roberta_datasets.set_format(type='torch')

print(f"\nTraining with: {model_name}")
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

training_args = TrainingArguments(
    # Dedicated directory per model so separate training runs in this notebook
    # do not share (and overwrite) each other's checkpoints and uploads.
    output_dir=f"./training_output/{repo_name}",
    do_eval=True,
    eval_steps=1000,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    push_to_hub=True,
    hub_model_id=hub_model_id,
    hub_strategy="every_save"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=roberta_datasets['train'],
    eval_dataset=roberta_datasets['test'],
    compute_metrics=compute_metrics
)

trainer.train()

# `do_eval`/`eval_steps` do not schedule evaluation on their own, so run the
# evaluation explicitly; this is what actually invokes `compute_metrics`.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Publish the matching tokenizer so the Hub repo is usable on its own.
roberta_tokenizer.push_to_hub(hub_model_id)

# The original referenced `trainer.push_to_hub` without calling it, so the
# model was never uploaded. Call it.
trainer.push_to_hub()


In [None]:
# Fine-tune google/electra-base-discriminator for binary fake-review detection,
# evaluate it on the test split, and publish the result to the Hugging Face Hub.
model_name = 'google/electra-base-discriminator'

# The checkpoint id contains a '/', which would give `hub_model_id` two
# slashes (namespace/prefix-google/name) and is not a valid Hub repo id.
# Flatten it before building the repo name.
repo_name = f"fake-review-detector-{model_name.replace('/', '-')}"
hub_model_id = f"jesmine0820/{repo_name}"

# Create the destination repo (no-op if it already exists). Use the full
# `hub_model_id` so this is exactly the repo the Trainer pushes to.
api = HfApi()
api.create_repo(repo_id=hub_model_id, repo_type="model", exist_ok=True)

# Encode the raw text with this checkpoint's own tokenizer so the token ids
# are guaranteed to match the model, instead of relying on the
# bert-base-uncased encoding stored in `tokenized_datasets`.
electra_tokenizer = AutoTokenizer.from_pretrained(model_name)
electra_datasets = dataset.map(
    lambda batch: electra_tokenizer(batch['text'], padding='max_length', truncation=True),
    batched=True,
)
electra_datasets.set_format(type='torch')

print(f"\nTraining with: {model_name}")
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

training_args = TrainingArguments(
    # Dedicated directory per model so separate training runs in this notebook
    # do not share (and overwrite) each other's checkpoints and uploads.
    output_dir=f"./training_output/{repo_name}",
    do_eval=True,
    eval_steps=1000,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    push_to_hub=True,
    hub_model_id=hub_model_id,
    hub_strategy="every_save"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=electra_datasets['train'],
    eval_dataset=electra_datasets['test'],
    compute_metrics=compute_metrics
)

trainer.train()

# `do_eval`/`eval_steps` do not schedule evaluation on their own, so run the
# evaluation explicitly; this is what actually invokes `compute_metrics`.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Publish the matching tokenizer so the Hub repo is usable on its own.
electra_tokenizer.push_to_hub(hub_model_id)

# The original referenced `trainer.push_to_hub` without calling it, so the
# model was never uploaded. Call it.
trainer.push_to_hub()


In [None]:
def predict_and_visualize(trainer, test_dataset, label_names, title_prefix="Binary"):
    """Predict on a dataset, then draw a confusion matrix and a ROC curve.

    Args:
        trainer: Object whose `predict(test_dataset)` returns an output with
            `predictions` (raw per-class scores) and `label_ids`.
        test_dataset: Dataset forwarded unchanged to `trainer.predict`.
        label_names: Class names shown on the confusion-matrix axes.
        title_prefix: Text placed in front of both plot titles.

    Returns:
        Tuple `(preds, probs)`: the predicted class index per row and the
        softmax probabilities per row, both as NumPy arrays.
    """
    # Turn the raw scores into probabilities (softmax across dim 1).
    output = trainer.predict(test_dataset)
    logits = torch.tensor(output.predictions)
    probs = torch.nn.functional.softmax(logits, dim=1).numpy()
    preds = probs.argmax(axis=1)
    y_true = output.label_ids

    # Confusion matrix of reference labels against predicted classes.
    matrix_display = ConfusionMatrixDisplay(
        confusion_matrix=confusion_matrix(y_true, preds),
        display_labels=label_names,
    )
    matrix_display.plot(cmap=plt.cm.Blues)
    plt.title(f"{title_prefix} Classification - Confusion Matrix")
    plt.show()

    # ROC curve, using the probability in column 1 as the score.
    fpr, tpr, _ = roc_curve(y_true, probs[:, 1])
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
    plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(f"{title_prefix} Classification - ROC Curve")
    plt.legend(loc="lower right")
    plt.grid()
    plt.show()

    return preds, probs

# Evaluate on the held-out test split and plot the results.
# NOTE(review): `trainer` is whichever model was trained most recently in this
# kernel, and the dataset passed here must have been encoded with that model's
# tokenizer. Confirm both before trusting the plots.
binary_preds, binary_probs = predict_and_visualize(
    trainer=trainer,
    test_dataset=tokenized_datasets['test'],
    label_names=["Real", "Fake"],
    title_prefix="Fake Detection",
)


In [None]:
# from huggingface_hub import upload_file

# repo_id = "jesmine0820/fake_review_detection_model"

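# # Upload the fine-tuned bert-base-uncased model (NOTE: upload_file sends a single file; if this path is a directory, use upload_folder instead)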
# upload_file(
#     path_or_fileobj="final_models/bert-base-uncased",
#     path_in_repo="bert-base-uncased",
#     repo_id=repo_id,
#     repo_type="model"
# )

# # Upload UMAP model
# upload_file(
#     path_or_fileobj="final_models/umap_model.pkl",
#     path_in_repo="umap_model.pkl",
#     repo_id=repo_id,
#     repo_type="model"
# )
