In [None]:
!pip install datasets

In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import RobertaTokenizer

In [None]:
# Load dataset
df = pd.read_csv("fake_reviews_dataset.csv")

# Extract relevant columns first, then drop rows with missing values in them.
# Selecting before dropna() means a NaN in an unrelated column no longer
# discards an otherwise usable review; .copy() makes the result an explicit,
# independent frame for the column assignments below.
df = df[["category", "rating", "label", "text_"]].dropna().copy()  # Ensure these column names match your dataset

# Convert labels to binary (0 = Fake, 1 = Genuine)
# Vectorised comparison: "cg" (any case, surrounding whitespace ignored) -> 0,
# everything else -> 1. astype(str) keeps this from failing on non-string labels.
df["label"] = (df["label"].astype(str).str.strip().str.lower() != "cg").astype(int)

# Concatenate category, rating, and review text for better context
df["input_text"] = (
    df["category"].astype(str)
    + " "
    + df["rating"].astype(str)
    + " "
    + df["text_"].astype(str)
)

# Train-test split (stratified so both splits keep the same class ratio)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["input_text"].tolist(),
    df["label"].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)

In [None]:
# Preview the first 10 rows; a bare last expression is rendered with the
# notebook's rich table display instead of print()'s plain-text dump.
df.head(10)

In [None]:
# Tokenization
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def tokenize_function(texts, max_length=512):
    """Encode input text with the notebook-level RoBERTa `tokenizer`.

    Args:
        texts: The text (or list of texts) passed straight to the tokenizer.
        max_length: Length every sequence is truncated and padded to.
            Defaults to 512, the value this notebook used originally.

    Returns:
        The tokenizer's encoding object; later cells read its
        "input_ids" and "attention_mask" entries.
    """
    return tokenizer(texts, truncation=True, padding="max_length", max_length=max_length)

train_encodings = tokenize_function(train_texts)
val_encodings = tokenize_function(val_texts)

In [None]:

# Convert to Hugging Face Dataset format
def _to_hf_dataset(encodings, labels):
    """Bundle token ids, attention masks and labels into a `Dataset`."""
    columns = {key: encodings[key] for key in ("input_ids", "attention_mask")}
    columns["labels"] = labels
    return Dataset.from_dict(columns)

train_dataset = _to_hf_dataset(train_encodings, train_labels)
val_dataset = _to_hf_dataset(val_encodings, val_labels)

In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments


In [None]:

# Load pre-trained RoBERTa model
# The class names mirror the label encoding used when the labels were
# binarised (0 = Fake, 1 = Genuine). Storing them in the model config keeps
# that mapping with the saved model instead of only in this notebook.
id2label = {0: "FAKE", 1: "GENUINE"}
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=2,
    id2label=id2label,
    label2id={name: idx for idx, name in id2label.items()},
)


In [None]:

# Training arguments
import inspect

# transformers renamed `evaluation_strategy` to `eval_strategy` and later
# dropped the old keyword. Pick whichever name the installed version accepts
# so this cell runs on both older and newer releases.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    # Evaluate once per epoch, matching save_strategy above.
    **{_eval_strategy_key: "epoch"},
)

from sklearn.metrics import accuracy_score
import numpy as np

def compute_metrics(eval_pred):
    """Compute accuracy from an evaluation result.

    `eval_pred` unpacks into (logits, labels); the predicted class for each
    row is the index of its largest logit along axis 1.

    Returns:
        A dict with a single "accuracy" entry.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=1)
    return {"accuracy": accuracy_score(references, predicted)}

# Wire the model, training arguments, datasets and metric function together.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": val_dataset,
    "compute_metrics": compute_metrics,  # Added accuracy calculation
}
trainer = Trainer(**trainer_kwargs)


In [None]:
# Train the model
# Starts training via the `trainer` object built in the previous cell. The
# call is the cell's last expression, so its return value becomes the cell output.
trainer.train()

In [None]:
# Save the trained model
# The model and the tokenizer are written to the same directory.
export_dir = "roberta_fake_review_model"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)


In [None]:
# Archive the saved model directory with the standard library instead of the
# external `zip` binary, so the cell also works where `zip` is not installed.
import shutil

shutil.make_archive(
    "roberta_fake_review_model",  # archive path without the ".zip" suffix
    "zip",
    root_dir=".",
    base_dir="roberta_fake_review_model",  # keep the folder as the top-level entry
)
