In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch

from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import ParameterGrid
from huggingface_hub import login, HfApi

  from .autonotebook import tqdm as notebook_tqdm





In [4]:
# Authenticate this session with the Hugging Face Hub. The training cell further down
# creates model repos and pushes weights, so this has to run first.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Read the two pre-split CSV files into DataFrames (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('processed_data/train.csv', 'processed_data/test.csv')
)

# Notebook-level tokenizer for the bert-base-uncased checkpoint; tokenize_texts reads this global.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def tokenize_texts(texts, max_length=128, text_tokenizer=None):
    """Tokenize a batch of raw texts into fixed-length encodings.

    Args:
        texts: The texts to encode. May be a pandas Series / NumPy array
            (anything exposing ``.tolist()``), any other iterable of strings,
            or a single string (treated as a batch of one).
        max_length: Each sequence is padded or truncated to this many tokens.
            Defaults to 128, the value this notebook trains with.
        text_tokenizer: Tokenizer callable to use. When ``None`` the
            notebook-level ``tokenizer`` is used, which keeps existing
            ``tokenize_texts(series)`` calls working unchanged. Pass the
            tokenizer that belongs to the checkpoint being trained when it is
            not the notebook-level one.

    Returns:
        The object returned by the tokenizer call; PyTorch tensors are
        requested via ``return_tensors='pt'``.
    """
    # Only fall back to the global when no tokenizer was supplied, so the
    # function is usable before/without the notebook-level one.
    encode = tokenizer if text_tokenizer is None else text_tokenizer

    if isinstance(texts, str):
        # A bare string would otherwise be split into characters by list().
        text_list = [texts]
    elif hasattr(texts, "tolist"):
        text_list = texts.tolist()
    else:
        text_list = list(texts)

    return encode(
        text_list,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )

# Encode the 'text' column of each split with tokenize_texts (train first, then test).
train_encodings, test_encodings = (
    tokenize_texts(frame['text']) for frame in (train_df, test_df)
)

# Wrap each split's 'label' column in a tensor, in the same train/test order.
train_labels, test_labels = (
    torch.tensor(frame['label'].values) for frame in (train_df, test_df)
)

class ReviewDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name (whatever keys the tokenizer
            produced) to an indexable container with one row per example.
        labels: Indexable container with one label per example.

    Raises:
        ValueError: If any encoding field does not have exactly one row per
            label. Checking here surfaces a train/label misalignment
            immediately instead of part-way through training.
    """

    def __init__(self, encodings, labels):
        n_labels = len(labels)
        for key, column in encodings.items():
            if len(column) != n_labels:
                raise ValueError(
                    f"encodings[{key!r}] has {len(column)} rows "
                    f"but there are {n_labels} labels"
                )
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, defined by the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict: every encoding field at ``idx`` plus ``"labels"``."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

# Pair each split's encodings with its labels; the test split serves as the evaluation set.
train_dataset, eval_dataset = (
    ReviewDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (test_encodings, test_labels),
    )
)

In [9]:
import os

# Set the WANDB_DISABLED flag for this process so Weights & Biases logging stays off.
os.environ.update(WANDB_DISABLED="true")

In [None]:
# Metrics function
def compute_metrics(pred):
    """Compute accuracy and weighted F1 / precision / recall for one evaluation pass.

    Args:
        pred: Evaluation result exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores, or a tuple whose first element
            holds them).

    Returns:
        dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``; the last three use ``average="weighted"``.
    """
    labels = pred.label_ids

    # NOTE(review): when a model returns extra outputs, predictions arrives as a
    # tuple; this assumes the logits are its first element — confirm for any
    # non-standard model.
    scores = pred.predictions[0] if isinstance(pred.predictions, tuple) else pred.predictions
    preds = scores.argmax(-1)

    # zero_division=0 keeps the value scikit-learn already falls back to when a
    # class is never predicted, but without emitting a warning on every epoch.
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted", zero_division=0),
        "precision": precision_score(labels, preds, average="weighted", zero_division=0),
        "recall": recall_score(labels, preds, average="weighted", zero_division=0),
    }

# Hyperparameter grid
# Expanded with ParameterGrid in the training loop, so every combination is tried
# for every model: 2 learning rates x 1 batch size x 2 epoch counts = 4 runs each.
param_grid = {
    "learning_rate": [2e-5, 3e-5],
    "per_device_train_batch_size": [32],
    "num_train_epochs": [3, 5],
}

# Checkpoints to fine-tune.
#   model_name: identifier passed to from_pretrained in the training loop.
#   repo_name:  Hub repository name (the loop prefixes it with the "jesmine0820/"
#               namespace); it is also used in the local output directory name.
model_configs = [
    {
        "model_name": "bert-base-uncased",
        "repo_name": "fake-review-detector-bert-base-uncased"
    },
    {
        "model_name": "roberta-base",
        "repo_name": "fake-review-detector-roberta-base"
    },
    {
        "model_name": "google/electra-base-discriminator",
        "repo_name": "fake-review-detector-google"
    }
]

# Store all results: one metrics dict per model, taken from its best hyperparameter run.
results = []


def _build_split_dataset(model_tokenizer, frame):
    """Tokenize frame['text'] with model_tokenizer and pair it with frame['label'].

    Uses the same fixed-length settings as tokenize_texts (pad / truncate to 128 tokens).
    """
    encodings = model_tokenizer(
        frame['text'].tolist(),
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='pt',
    )
    return ReviewDataset(encodings, torch.tensor(frame['label'].values))


for config in model_configs:
    model_name = config["model_name"]
    repo_name = config["repo_name"]
    hub_model_id = f"jesmine0820/{repo_name}"

    print(f"\n----------- Training with: {model_name}")

    # Create repo under the same fully-qualified id that is pushed to below.
    api = HfApi()
    api.create_repo(repo_id=hub_model_id, repo_type="model", exist_ok=True)

    # Token ids are only meaningful for the tokenizer a checkpoint was pre-trained
    # with, so each model gets its own encodings instead of reusing the
    # bert-base-uncased ones built earlier (roberta-base, for one, uses a
    # different vocabulary).
    model_tokenizer = AutoTokenizer.from_pretrained(model_name)
    model_train_dataset = _build_split_dataset(model_tokenizer, train_df)
    # NOTE(review): early stopping and best-run selection both read the test
    # split, so the reported scores are optimistic; consider carving a
    # validation split out of train_df.
    model_eval_dataset = _build_split_dataset(model_tokenizer, test_df)

    # -inf (not 0) so that even a run scoring 0.0 accuracy is recorded.
    best_accuracy = float("-inf")
    best_metrics = None

    for params in ParameterGrid(param_grid):
        print(f"\nTrying params: {params}")

        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

        # Every grid dimension is part of the path, so runs that differ only in
        # epoch count no longer write checkpoints into the same directory.
        run_dir = (
            f"./results/{repo_name.replace('/', '_')}"
            f"_{params['learning_rate']}"
            f"_{params['per_device_train_batch_size']}"
            f"_{params['num_train_epochs']}"
        )

        training_args = TrainingArguments(
            output_dir=run_dir,
            eval_strategy="epoch",
            save_strategy="epoch",
            learning_rate=params["learning_rate"],
            per_device_train_batch_size=params["per_device_train_batch_size"],
            per_device_eval_batch_size=32,
            num_train_epochs=params["num_train_epochs"],
            weight_decay=0.01,
            logging_dir="./logs",
            # No automatic pushes: all runs of a model share one Hub repo, so
            # pushing on every save would let a later, worse run overwrite the
            # best one. Only the explicit push below uploads anything.
            push_to_hub=False,
            hub_model_id=hub_model_id,
            report_to="none",  # disable wandb
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=model_train_dataset,
            eval_dataset=model_eval_dataset,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
        )

        trainer.train()
        eval_metrics = trainer.evaluate()

        if eval_metrics["eval_accuracy"] > best_accuracy:
            best_accuracy = eval_metrics["eval_accuracy"]
            # Copy rather than annotate the trainer's dict in place.
            best_metrics = {
                **eval_metrics,
                "model_name": model_name,
                "repo_name": repo_name,
                "best_params": params,
            }
            # Put the matching tokenizer files next to the weights so they are
            # uploaded with them and the Hub repo is self-contained.
            model_tokenizer.save_pretrained(run_dir)
            trainer.push_to_hub()

    # An empty grid leaves nothing to report; avoid appending None.
    if best_metrics is not None:
        results.append(best_metrics)


----------- Training with: bert-base-uncased

Trying params: {'learning_rate': 2e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 32}


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


In [None]:
# Output final result
# `results` holds one dict per model (its best run's evaluation metrics plus
# model_name / repo_name / best_params), so this frame has one row per model.
df_results = pd.DataFrame(results)
print("\n >> Final Evaluation Results:\n")
# Bare last expression so the notebook renders the table; head() shows at most five rows.
df_results.head()

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load tokenizer and model
model_id = "jesmine0820/fake-review-detector-bert-base-uncased"
# The tokenizer is loaded from the base checkpoint, not from model_id.
# NOTE(review): switch to model_id once the Hub repo is known to contain tokenizer files.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(model_id)
model.eval()

# Label mapping
# NOTE(review): assumes label 0 means a genuine review and 1 a fake one in
# processed_data/*.csv — confirm against the dataset before trusting the names.
id2label = {0: "true", 1: "fake"}

# Sample test reviews
texts = [
    # True reviews
    "I received my order yesterday and everything works perfectly. Highly recommended.",
    "The phone arrived earlier than expected, and the packaging was great. Would buy again!",
    "Customer service was responsive and helped me resolve an issue within minutes.",
    "I've been using this laptop for two weeks, and performance is solid for the price.",

    # Fake reviews
    "Best product ever! Life changing! I will never use anything else again!",
    "Amazing amazing amazing. Five stars five stars five stars!",
    "This is the most wonderful thing I’ve ever purchased. 100% satisfied.",
    "Good product. Fast delivery. Good product. Fast delivery.",

    # Edge cases
    "It’s okay, but not as good as expected.",
    "Worked for a few days, then stopped. I might return it.",
    "Not sure if it's authentic, but it looks fine to me."
]

# Predict and print results
for i, text in enumerate(texts, 1):
    # Truncate at 128 tokens, the same limit the training data was tokenized
    # with. No padding is needed because each review is scored on its own.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Row 0 is the only row because a single text was passed.
    probs = torch.nn.functional.softmax(logits, dim=1)
    predicted_class_id = torch.argmax(probs, dim=1).item()
    confidence = probs[0][predicted_class_id].item()
    # Fall back to the raw class id if it has no name in the mapping.
    predicted_label = id2label.get(predicted_class_id, str(predicted_class_id))

    print(f"\nExample {i}")
    print(f"Text: {text}")
    print(f"Predicted Label: {predicted_label}")
    print(f"Confidence: {confidence:.4f}")
    print(f"Probabilities: {probs.tolist()[0]}")
