<a href="https://colab.research.google.com/github/indusree123/Fake-review-Detection/blob/main/BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Attach Google Drive to the Colab runtime so files under it can be read
# with ordinary filesystem paths.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Read the review dataset from the mounted Drive (file is decoded as latin1)
reviews_data = pd.read_csv(
    "/content/drive/MyDrive/amazonreviews.csv",
    encoding="latin1",
)

# Cast the verified_purchase column to int so it can serve as the class label
# (the original notebook describes this column as boolean)
purchase_flags = reviews_data['verified_purchase']
reviews_data['verified_purchase'] = purchase_flags.astype(int)

# Hold out 20% of the rows for testing; the seed is fixed for repeatability
features = reviews_data[['review_text']]
target = reviews_data['verified_purchase']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Tokenizer matching the bert-base-uncased checkpoint used for the model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize and prepare data for BERT
def tokenize_data(text_list, labels, max_length=64):
    """Encode review texts for BERT and convert their labels to a tensor.

    Uses the module-level ``tokenizer``. All texts are encoded in a single
    batched tokenizer call rather than one ``encode_plus`` call per row.

    Args:
        text_list: pandas Series of review texts. Missing values are replaced
            with the empty string and remaining values are converted to str.
        labels: pandas Series of class labels, in the same row order as
            ``text_list``.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 64, the value previously hard-coded here.

    Returns:
        A tuple ``(input_ids, attention_masks, labels)`` of torch tensors.
        The first two have one row per text and ``max_length`` columns.
    """
    texts = text_list.fillna('').astype(str).tolist()
    label_tensor = torch.tensor(labels.values)

    # An empty split would otherwise fail inside the tokenizer / torch.cat;
    # return correctly shaped empty tensors instead.
    if not texts:
        empty_ids = torch.empty((0, max_length), dtype=torch.long)
        empty_masks = torch.empty((0, max_length), dtype=torch.long)
        return empty_ids, empty_masks, label_tensor

    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    return encoded['input_ids'], encoded['attention_mask'], label_tensor


# Encode both splits into tensors (token ids, attention masks, labels)
train_tensors = tokenize_data(X_train['review_text'], y_train)
test_tensors = tokenize_data(X_test['review_text'], y_test)
X_train_input_ids, X_train_attention_masks, y_train_labels = train_tensors
X_test_input_ids, X_test_attention_masks, y_test_labels = test_tensors

# Wrap the tensors in datasets and serve them in mini-batches
batch_size = 32

# Training batches are drawn through a RandomSampler
train_data = TensorDataset(*train_tensors)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Test batches are drawn through a SequentialSampler, so predictions come out
# in the same order as y_test_labels
test_data = TensorDataset(*test_tensors)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(
    test_data,
    batch_size=batch_size,
    sampler=test_sampler,
)

# Instantiate bert-base-uncased with a two-label sequence-classification head;
# attention maps and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)

# Use CUDA when torch reports it as available, otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
model.to(device)

# Define optimizer and learning rate scheduler.
# torch.optim.AdamW replaces the deprecated transformers.AdamW. weight_decay is
# set to 0.0 explicitly because torch's default is non-zero, whereas the
# deprecated class defaulted to no weight decay (NOTE(review): confirm against
# the installed transformers version).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
    eps=1e-8,
    weight_decay=0.0,
)
epochs = 3

# The scheduler is stepped once per batch, so the schedule spans every batch
# of every epoch; no warmup steps are used.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Fine-tuning: each epoch runs one pass over the training batches followed by
# one pass over the test batches, then prints both accuracies.
for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    total_train_accuracy = 0.0

    for train_batch in train_dataloader:
        ids, mask, gold = (tensor.to(device) for tensor in train_batch)

        optimizer.zero_grad()
        train_out = model(input_ids=ids, attention_mask=mask, labels=gold)
        train_out.loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        # Count correct predictions in this batch (logits from the same
        # forward pass that produced the loss)
        hits = torch.eq(train_out.logits.argmax(dim=1), gold)
        total_train_accuracy += hits.float().sum().item()

    avg_train_accuracy = total_train_accuracy / len(X_train_input_ids)

    # ---- evaluation pass ----
    model.eval()
    total_eval_accuracy = 0.0

    for eval_batch in test_dataloader:
        ids, mask, gold = (tensor.to(device) for tensor in eval_batch)

        with torch.no_grad():
            eval_out = model(input_ids=ids, attention_mask=mask, labels=gold)

        hits = torch.eq(eval_out.logits.argmax(dim=1), gold)
        total_eval_accuracy += hits.float().sum().item()

    avg_eval_accuracy = total_eval_accuracy / len(X_test_input_ids)

    print(f"Epoch {epoch+1}/{epochs}: Train Accuracy: {avg_train_accuracy}, Eval Accuracy: {avg_eval_accuracy}")


# Final evaluation: collect the predicted class for every test example, then
# score the predictions against y_test_labels.
model.eval()
predictions = []

with torch.no_grad():
    for eval_batch in test_dataloader:
        ids, mask, gold = (tensor.to(device) for tensor in eval_batch)
        eval_out = model(input_ids=ids, attention_mask=mask, labels=gold)
        batch_preds = eval_out.logits.argmax(dim=1)
        # Move predictions to host memory as NumPy values before storing them
        predictions.extend(batch_preds.cpu().detach().numpy())

overall_accuracy = accuracy_score(y_test_labels, predictions)
print(f"Overall Accuracy: {overall_accuracy}")

# Precision, recall and F1 with binary averaging; the fourth returned value
# (support) is discarded.
precision, recall, f1_score, _ = precision_recall_fscore_support(
    y_test_labels,
    predictions,
    average='binary',
)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1_score}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3: Train Accuracy: 0.7831149927219796, Eval Accuracy: 0.8052325581395349
Epoch 2/3: Train Accuracy: 0.8500727802037845, Eval Accuracy: 0.8313953488372093
Epoch 3/3: Train Accuracy: 0.8646288209606987, Eval Accuracy: 0.8488372093023255
Overall Accuracy: 0.8488372093023255
Precision: 0.8554913294797688
Recall: 0.8457142857142858
F1-score: 0.8505747126436781


In [None]:
import matplotlib.pyplot as plt

# Pie chart of how many reviews fall into each verified_purchase class.
colors = ['#79BAEC', '#FED8B1']
plt.figure(figsize=(4, 4))
# The notebook never defines `df` (the cell raised NameError); the loaded
# dataset is held in `reviews_data`.
label = reviews_data['verified_purchase'].value_counts()
plt.pie(
    label.values,
    colors=colors,
    labels=label.index,
    autopct='%1.1f%%',
    startangle=90,
)
plt.title('True and False Reviews Count', fontsize=15)
plt.show()


NameError: name 'df' is not defined

<Figure size 400x400 with 0 Axes>