In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from tqdm import tqdm

file_path = '/content/drive/MyDrive/fake_reviews_dataset.csv'
df = pd.read_csv(file_path)

# Preprocessing
# Here, we are using CountVectorizer to convert text into numerical features
vectorizer = CountVectorizer(max_features=5000)
X_text = vectorizer.fit_transform(df['text_']).toarray()

# Label encoding for 'category' and 'label'
le_category = LabelEncoder()
df['category'] = le_category.fit_transform(df['category'])

le_label = LabelEncoder()
df['label'] = le_label.fit_transform(df['label'])

# Split the data into training and testing sets
X_train_text, X_test_text, y_train, y_test = train_test_split(X_text, df['label'].values, test_size=0.2, random_state=42)
X_train_category, X_test_category = train_test_split(df['category'].values, test_size=0.2, random_state=42)
X_train_rating, X_test_rating = train_test_split(df['rating'].values, test_size=0.2, random_state=42)

# Define a custom dataset class
class FakeReviewDataset(Dataset):
    def __init__(self, text, category, rating, labels):
        self.text = torch.tensor(text, dtype=torch.float32)
        self.category = torch.tensor(category, dtype=torch.float32)
        self.rating = torch.tensor(rating, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'text': self.text[idx],
            'category': self.category[idx],
            'rating': self.rating[idx],
            'labels': self.labels[idx]
        }

# Create DataLoader for training and testing
batch_size = 32
train_dataset = FakeReviewDataset(X_train_text, X_train_category, X_train_rating, y_train)
test_dataset = FakeReviewDataset(X_test_text, X_test_category, X_test_rating, y_test)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Define the neural network model
class FakeReviewModel(nn.Module):
    def __init__(self, input_dim):
        super(FakeReviewModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 2)

    def forward(self, x_text, x_category, x_rating):
        x = torch.cat([x_text, x_category.unsqueeze(1), x_rating.unsqueeze(1)], dim=1)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Initialize the model, loss function, and optimizer
input_dim = X_text.shape[1] + 2  # Features from text, category, and rating
model = FakeReviewModel(input_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training the model
num_epochs = 10
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        text = batch['text'].to(device)
        category = batch['category'].to(device)
        rating = batch['rating'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(text, category, rating)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_loader)
    print(f'Training Loss: {average_loss:.4f}')

# Evaluate the model
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in tqdm(test_loader, desc='Evaluating'):
        text = batch['text'].to(device)
        category = batch['category'].to(device)
        rating = batch['rating'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(text, category, rating)
        _, predicted = torch.max(outputs, 1)

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f'Test Accuracy: {accuracy * 100:.2f}%')

Epoch 1/10: 100%|██████████| 1011/1011 [00:03<00:00, 277.64it/s]


Training Loss: 0.2585


Epoch 2/10: 100%|██████████| 1011/1011 [00:02<00:00, 380.26it/s]


Training Loss: 0.1513


Epoch 3/10: 100%|██████████| 1011/1011 [00:02<00:00, 378.51it/s]


Training Loss: 0.0858


Epoch 4/10: 100%|██████████| 1011/1011 [00:02<00:00, 380.04it/s]


Training Loss: 0.0410


Epoch 5/10: 100%|██████████| 1011/1011 [00:03<00:00, 291.60it/s]


Training Loss: 0.0225


Epoch 6/10: 100%|██████████| 1011/1011 [00:02<00:00, 378.19it/s]


Training Loss: 0.0147


Epoch 7/10: 100%|██████████| 1011/1011 [00:02<00:00, 373.72it/s]


Training Loss: 0.0087


Epoch 8/10: 100%|██████████| 1011/1011 [00:02<00:00, 372.73it/s]


Training Loss: 0.0065


Epoch 9/10: 100%|██████████| 1011/1011 [00:02<00:00, 344.38it/s]


Training Loss: 0.0080


Epoch 10/10: 100%|██████████| 1011/1011 [00:03<00:00, 328.46it/s]


Training Loss: 0.0053


Evaluating: 100%|██████████| 253/253 [00:00<00:00, 616.65it/s]

Test Accuracy: 91.42%





In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import ParameterGrid

# Load the dataset
file_path = '/content/drive/MyDrive/fake_reviews_dataset.csv'
df = pd.read_csv(file_path)

# Preprocessing
vectorizer = CountVectorizer(max_features=5000)
X_text = vectorizer.fit_transform(df['text_']).toarray()

le_category = LabelEncoder()
df['category'] = le_category.fit_transform(df['category'])

le_label = LabelEncoder()
df['label'] = le_label.fit_transform(df['label'])

X_train_text, X_test_text, y_train, y_test = train_test_split(X_text, df['label'].values, test_size=0.2, random_state=42)
X_train_category, X_test_category = train_test_split(df['category'].values, test_size=0.2, random_state=42)
X_train_rating, X_test_rating = train_test_split(df['rating'].values, test_size=0.2, random_state=42)

class FakeReviewDataset(Dataset):
    def __init__(self, text, category, rating, labels):
        self.text = torch.tensor(text, dtype=torch.float32)
        self.category = torch.tensor(category, dtype=torch.float32)
        self.rating = torch.tensor(rating, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'text': self.text[idx],
            'category': self.category[idx],
            'rating': self.rating[idx],
            'labels': self.labels[idx]
        }

batch_size = 32
train_dataset = FakeReviewDataset(X_train_text, X_train_category, X_train_rating, y_train)
test_dataset = FakeReviewDataset(X_test_text, X_test_category, X_test_rating, y_test)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

class FakeReviewModel(nn.Module):
    def __init__(self, input_dim):
        super(FakeReviewModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 2)

    def forward(self, x_text, x_category, x_rating):
        x = torch.cat([x_text, x_category.unsqueeze(1), x_rating.unsqueeze(1)], dim=1)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

def train_model(model, train_loader, criterion, optimizer, num_epochs, device):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for i, batch in enumerate(train_loader):
            text = batch['text'].to(device)
            category = batch['category'].to(device)
            rating = batch['rating'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(text, category, rating)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        average_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch + 1}/{num_epochs}, Training Loss: {average_loss:.4f}')

def evaluate_model(model, test_loader, device):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for i, batch in enumerate(test_loader):
            text = batch['text'].to(device)
            category = batch['category'].to(device)
            rating = batch['rating'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(text, category, rating)
            _, predicted = torch.max(outputs, 1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        return None
    else:
        accuracy = correct / total
        print(f'Test Accuracy: {accuracy * 100:.2f}%')
        return accuracy

# Hyperparameters
input_dim = X_text.shape[1] + 2
num_epochs = 10
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Grid search for the best weight decay parameter
# Hyperparameter tuning for number of epochs, optimizer, and criterion
param_grid = {
    'learning_rate': [0.001, 0.01, 0.1],
    'num_epochs': [10, 15],
}

best_accuracy = 0
best_params = {}

print('Hyperparameter tuning started...')

for params in tqdm(list(ParameterGrid(param_grid)), desc='Hyperparameter tuning'):
    learning_rate = params['learning_rate']
    num_epochs = params['num_epochs']
    optimizer = optim.Adam
    criterion = nn.CrossEntropyLoss()

    model = FakeReviewModel(input_dim).to(device)
    optimizer_instance = optimizer(model.parameters(), lr=learning_rate)

    print(f'Training with parameters: {params}')
    train_model(model, train_loader, criterion, optimizer_instance, num_epochs, device)
    accuracy = evaluate_model(model, test_loader, device)

    if accuracy is not None and accuracy > best_accuracy:
        best_accuracy = accuracy
        best_params = params

print("Best hyperparameters:", best_params)
print("Best accuracy:", best_accuracy)

Hyperparameter tuning started...


Hyperparameter tuning:   0%|          | 0/6 [00:00<?, ?it/s]

Training with parameters: {'learning_rate': 0.001, 'num_epochs': 10}
Epoch 1/10, Training Loss: 0.2606
Epoch 2/10, Training Loss: 0.1530
Epoch 3/10, Training Loss: 0.0895
Epoch 4/10, Training Loss: 0.0461
Epoch 5/10, Training Loss: 0.0240
Epoch 6/10, Training Loss: 0.0119
Epoch 7/10, Training Loss: 0.0117
Epoch 8/10, Training Loss: 0.0068
Epoch 9/10, Training Loss: 0.0035
Epoch 10/10, Training Loss: 0.0048


Hyperparameter tuning:  17%|█▋        | 1/6 [00:29<02:25, 29.16s/it]

Test Accuracy: 91.55%
Training with parameters: {'learning_rate': 0.001, 'num_epochs': 15}
Epoch 1/15, Training Loss: 0.2632
Epoch 2/15, Training Loss: 0.1554
Epoch 3/15, Training Loss: 0.0906
Epoch 4/15, Training Loss: 0.0481
Epoch 5/15, Training Loss: 0.0276
Epoch 6/15, Training Loss: 0.0119
Epoch 7/15, Training Loss: 0.0120
Epoch 8/15, Training Loss: 0.0085
Epoch 9/15, Training Loss: 0.0052
Epoch 10/15, Training Loss: 0.0038
Epoch 11/15, Training Loss: 0.0062
Epoch 12/15, Training Loss: 0.0050
Epoch 13/15, Training Loss: 0.0042
Epoch 14/15, Training Loss: 0.0025
Epoch 15/15, Training Loss: 0.0034


Hyperparameter tuning:  33%|███▎      | 2/6 [01:15<02:37, 39.30s/it]

Test Accuracy: 91.29%
Training with parameters: {'learning_rate': 0.01, 'num_epochs': 10}
Epoch 1/10, Training Loss: 0.2665
Epoch 2/10, Training Loss: 0.1552
Epoch 3/10, Training Loss: 0.1040
Epoch 4/10, Training Loss: 0.0680
Epoch 5/10, Training Loss: 0.0467
Epoch 6/10, Training Loss: 0.0370
Epoch 7/10, Training Loss: 0.0266
Epoch 8/10, Training Loss: 0.0199
Epoch 9/10, Training Loss: 0.0137
Epoch 10/10, Training Loss: 0.0190


Hyperparameter tuning:  50%|█████     | 3/6 [01:44<01:43, 34.47s/it]

Test Accuracy: 91.58%
Training with parameters: {'learning_rate': 0.01, 'num_epochs': 15}
Epoch 1/15, Training Loss: 0.2627
Epoch 2/15, Training Loss: 0.1504
Epoch 3/15, Training Loss: 0.0971
Epoch 4/15, Training Loss: 0.0562
Epoch 5/15, Training Loss: 0.0408
Epoch 6/15, Training Loss: 0.0291
Epoch 7/15, Training Loss: 0.0222
Epoch 8/15, Training Loss: 0.0196
Epoch 9/15, Training Loss: 0.0179
Epoch 10/15, Training Loss: 0.0169
Epoch 11/15, Training Loss: 0.0162
Epoch 12/15, Training Loss: 0.0159
Epoch 13/15, Training Loss: 0.0131
Epoch 14/15, Training Loss: 0.0140
Epoch 15/15, Training Loss: 0.0069


Hyperparameter tuning:  67%|██████▋   | 4/6 [02:25<01:14, 37.19s/it]

Test Accuracy: 91.03%
Training with parameters: {'learning_rate': 0.1, 'num_epochs': 10}
Epoch 1/10, Training Loss: 0.7558
Epoch 2/10, Training Loss: 0.7377
Epoch 3/10, Training Loss: 0.6965
Epoch 4/10, Training Loss: 0.6969
Epoch 5/10, Training Loss: 0.6973
Epoch 6/10, Training Loss: 0.6974
Epoch 7/10, Training Loss: 0.6964
Epoch 8/10, Training Loss: 0.6974
Epoch 9/10, Training Loss: 0.6968
Epoch 10/10, Training Loss: 0.6980


Hyperparameter tuning:  83%|████████▎ | 5/6 [02:53<00:33, 33.77s/it]

Test Accuracy: 49.66%
Training with parameters: {'learning_rate': 0.1, 'num_epochs': 15}
Epoch 1/15, Training Loss: 0.5572
Epoch 2/15, Training Loss: 0.3377
Epoch 3/15, Training Loss: 0.6989
Epoch 4/15, Training Loss: 0.6998
Epoch 5/15, Training Loss: 0.6962
Epoch 6/15, Training Loss: 0.6962
Epoch 7/15, Training Loss: 0.6966
Epoch 8/15, Training Loss: 0.6956
Epoch 9/15, Training Loss: 0.6960
Epoch 10/15, Training Loss: 0.6960
Epoch 11/15, Training Loss: 0.6963
Epoch 12/15, Training Loss: 0.6963
Epoch 13/15, Training Loss: 0.6966
Epoch 14/15, Training Loss: 0.6961
Epoch 15/15, Training Loss: 0.6959


Hyperparameter tuning: 100%|██████████| 6/6 [03:35<00:00, 35.90s/it]

Test Accuracy: 49.66%
Best hyperparameters: {'learning_rate': 0.01, 'num_epochs': 10}
Best accuracy: 0.9157907753184122





In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split, ParameterGrid  # Add ParameterGrid import
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from tqdm import tqdm
from gensim.models import Word2Vec

# Load the dataset
file_path = '/content/drive/MyDrive/fake_reviews_dataset.csv'
df = pd.read_csv(file_path)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_text_tfidf = tfidf_vectorizer.fit_transform(df['text_'])

# Word2Vec Embeddings
sentences = [text.split() for text in df['text_']]
word2vec_model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4, alpha=0.03, min_alpha=0.0007)
word2vec_model.train(sentences, total_examples=len(sentences), epochs=10)

def text_to_word2vec(text):
    words = text.split()
    word_vectors = [word2vec_model.wv[word] for word in words if word in word2vec_model.wv]
    if len(word_vectors) == 0:
        return np.zeros(word2vec_model.vector_size)
    else:
        return np.mean(word_vectors, axis=0)

X_text_word2vec = np.array([text_to_word2vec(text) for text in df['text_']])
X_text_combined = np.concatenate((X_text_tfidf.toarray(), X_text_word2vec), axis=1)

le_category = LabelEncoder()
df['category'] = le_category.fit_transform(df['category'])

le_label = LabelEncoder()
df['label'] = le_label.fit_transform(df['label'])

X_train_text, X_test_text, y_train, y_test = train_test_split(X_text_combined, df['label'].values, test_size=0.2, random_state=42)
X_train_category, X_test_category = train_test_split(df['category'].values, test_size=0.2, random_state=42)
X_train_rating, X_test_rating = train_test_split(df['rating'].values, test_size=0.2, random_state=42)

class FakeReviewDataset(Dataset):
    def __init__(self, text, category, rating, labels):
        self.text = torch.tensor(text, dtype=torch.float32)
        self.category = torch.tensor(category, dtype=torch.float32)
        self.rating = torch.tensor(rating, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'text': self.text[idx],
            'category': self.category[idx],
            'rating': self.rating[idx],
            'labels': self.labels[idx]
        }

batch_size = 32
train_dataset = FakeReviewDataset(X_train_text, X_train_category, X_train_rating, y_train)
test_dataset = FakeReviewDataset(X_test_text, X_test_category, X_test_rating, y_test)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

class FakeReviewModel(nn.Module):
    def __init__(self, input_dim):
        super(FakeReviewModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 2)

    def forward(self, x_text, x_category, x_rating):
        x = torch.cat([x_text, x_category.unsqueeze(1), x_rating.unsqueeze(1)], dim=1)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

def train_model(model, train_loader, criterion, optimizer, num_epochs, device):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for i, batch in enumerate(train_loader):
            text = batch['text'].to(device)
            category = batch['category'].to(device)
            rating = batch['rating'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(text, category, rating)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        average_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch + 1}/{num_epochs}, Training Loss: {average_loss:.4f}')

def evaluate_model(model, test_loader, device):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for i, batch in enumerate(test_loader):
            text = batch['text'].to(device)
            category = batch['category'].to(device)
            rating = batch['rating'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(text, category, rating)
            _, predicted = torch.max(outputs, 1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        return None
    else:
        accuracy = correct / total
        print(f'Test Accuracy: {accuracy * 100:.2f}%')
        return accuracy

# Hyperparameters and Grid Search
input_dim = X_text_combined.shape[1] + 2  # Adjusted for TF-IDF and word2vec
num_epochs = 10
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

param_grid = {
    'learning_rate': [0.001, 0.01, 0.1],
    'num_epochs': [10, 15],
}

best_accuracy = 0
best_params = {}

print('Hyperparameter tuning started...')

for params in tqdm(list(ParameterGrid(param_grid)), desc='Hyperparameter tuning'):
    learning_rate = params['learning_rate']
    num_epochs = params['num_epochs']
    optimizer = optim.Adam
    criterion = nn.CrossEntropyLoss()

    model = FakeReviewModel(input_dim).to(device)
    optimizer_instance = optimizer(model.parameters(), lr=learning_rate)

    print(f'Training with parameters: {params}')
    train_model(model, train_loader, criterion, optimizer_instance, num_epochs, device)
    accuracy = evaluate_model(model, test_loader, device)

    if accuracy is not None and accuracy > best_accuracy:
        best_accuracy = accuracy
        best_params = params

print("Best hyperparameters:", best_params)
print("Best accuracy:", best_accuracy)



Hyperparameter tuning started...


Hyperparameter tuning:   0%|          | 0/6 [00:00<?, ?it/s]

Training with parameters: {'learning_rate': 0.001, 'num_epochs': 10}
Epoch 1/10, Training Loss: 0.2506
Epoch 2/10, Training Loss: 0.1581
Epoch 3/10, Training Loss: 0.1229
Epoch 4/10, Training Loss: 0.0958
Epoch 5/10, Training Loss: 0.0707
Epoch 6/10, Training Loss: 0.0556
Epoch 7/10, Training Loss: 0.0394
Epoch 8/10, Training Loss: 0.0348
Epoch 9/10, Training Loss: 0.0255
Epoch 10/10, Training Loss: 0.0218


Hyperparameter tuning:  17%|█▋        | 1/6 [00:30<02:34, 30.90s/it]

Test Accuracy: 92.96%
Training with parameters: {'learning_rate': 0.001, 'num_epochs': 15}
Epoch 1/15, Training Loss: 0.2470
Epoch 2/15, Training Loss: 0.1561
Epoch 3/15, Training Loss: 0.1179
Epoch 4/15, Training Loss: 0.0938
Epoch 5/15, Training Loss: 0.0687
Epoch 6/15, Training Loss: 0.0501
Epoch 7/15, Training Loss: 0.0383
Epoch 8/15, Training Loss: 0.0336
Epoch 9/15, Training Loss: 0.0252
Epoch 10/15, Training Loss: 0.0202
Epoch 11/15, Training Loss: 0.0212
Epoch 12/15, Training Loss: 0.0171
Epoch 13/15, Training Loss: 0.0131
Epoch 14/15, Training Loss: 0.0159
Epoch 15/15, Training Loss: 0.0144


Hyperparameter tuning:  33%|███▎      | 2/6 [01:11<02:26, 36.51s/it]

Test Accuracy: 92.98%
Training with parameters: {'learning_rate': 0.01, 'num_epochs': 10}
Epoch 1/10, Training Loss: 0.2461
Epoch 2/10, Training Loss: 0.1668
Epoch 3/10, Training Loss: 0.1326
Epoch 4/10, Training Loss: 0.1095
Epoch 5/10, Training Loss: 0.0906
Epoch 6/10, Training Loss: 0.0713
Epoch 7/10, Training Loss: 0.0657
Epoch 8/10, Training Loss: 0.0532
Epoch 9/10, Training Loss: 0.0481
Epoch 10/10, Training Loss: 0.0429


Hyperparameter tuning:  50%|█████     | 3/6 [01:38<01:37, 32.35s/it]

Test Accuracy: 91.57%
Training with parameters: {'learning_rate': 0.01, 'num_epochs': 15}
Epoch 1/15, Training Loss: 0.2475
Epoch 2/15, Training Loss: 0.1686
Epoch 3/15, Training Loss: 0.1343
Epoch 4/15, Training Loss: 0.1070
Epoch 5/15, Training Loss: 0.0915
Epoch 6/15, Training Loss: 0.0734
Epoch 7/15, Training Loss: 0.0629
Epoch 8/15, Training Loss: 0.0513
Epoch 9/15, Training Loss: 0.0488
Epoch 10/15, Training Loss: 0.0388
Epoch 11/15, Training Loss: 0.0352
Epoch 12/15, Training Loss: 0.0339
Epoch 13/15, Training Loss: 0.0293
Epoch 14/15, Training Loss: 0.0239
Epoch 15/15, Training Loss: 0.0248


Hyperparameter tuning:  67%|██████▋   | 4/6 [02:19<01:11, 35.74s/it]

Test Accuracy: 92.11%
Training with parameters: {'learning_rate': 0.1, 'num_epochs': 10}
Epoch 1/10, Training Loss: 0.9236
Epoch 2/10, Training Loss: 0.7256
Epoch 3/10, Training Loss: 0.6995
Epoch 4/10, Training Loss: 0.7191
Epoch 5/10, Training Loss: 0.6959
Epoch 6/10, Training Loss: 0.6966
Epoch 7/10, Training Loss: 0.6972
Epoch 8/10, Training Loss: 0.6964
Epoch 9/10, Training Loss: 0.6958
Epoch 10/10, Training Loss: 0.6971


Hyperparameter tuning:  83%|████████▎ | 5/6 [02:48<00:33, 33.29s/it]

Test Accuracy: 49.66%
Training with parameters: {'learning_rate': 0.1, 'num_epochs': 15}
Epoch 1/15, Training Loss: 0.4395
Epoch 2/15, Training Loss: 0.4379
Epoch 3/15, Training Loss: 0.7113
Epoch 4/15, Training Loss: 0.7018
Epoch 5/15, Training Loss: 0.7210
Epoch 6/15, Training Loss: 0.6976
Epoch 7/15, Training Loss: 0.6969
Epoch 8/15, Training Loss: 0.6973
Epoch 9/15, Training Loss: 0.6962
Epoch 10/15, Training Loss: 0.6969
Epoch 11/15, Training Loss: 0.6977
Epoch 12/15, Training Loss: 0.6966
Epoch 13/15, Training Loss: 0.6977
Epoch 14/15, Training Loss: 0.6967
Epoch 15/15, Training Loss: 0.6972


Hyperparameter tuning: 100%|██████████| 6/6 [03:29<00:00, 34.93s/it]

Test Accuracy: 49.66%
Best hyperparameters: {'learning_rate': 0.001, 'num_epochs': 15}
Best accuracy: 0.9297638184740942





In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from tqdm import tqdm
from transformers import BertModel, BertTokenizer

# Load the dataset
file_path = '/content/drive/MyDrive/fake_reviews_dataset.csv'
df = pd.read_csv(file_path)

# Label Encoding
le_category = LabelEncoder()
df['category'] = le_category.fit_transform(df['category'])

le_label = LabelEncoder()
df['label'] = le_label.fit_transform(df['label'])

# Train-test split
X_train_text, X_test_text, y_train, y_test = train_test_split(df['text_'].values, df['label'].values, test_size=0.2, random_state=42)
X_train_category, X_test_category = train_test_split(df['category'].values, test_size=0.2, random_state=42)
X_train_rating, X_test_rating = train_test_split(df['rating'].values, test_size=0.2, random_state=42)

# Tokenization and Dataset Preparation for BERT
class FakeReviewBERTDataset(Dataset):
    def __init__(self, text, category, rating, labels, max_length):
        self.text = text
        self.category = category
        self.rating = rating
        self.labels = labels
        self.max_length = max_length
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert the text to string format
        text_str = ' '.join([str(val) for val in self.text[idx]])

        inputs = self.tokenizer.encode_plus(
            text_str,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'category': torch.tensor(self.category[idx], dtype=torch.float32),
            'rating': torch.tensor(self.rating[idx], dtype=torch.float32),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

max_length = 128  # Adjust as needed
train_dataset = FakeReviewBERTDataset(X_train_text, X_train_category, X_train_rating, y_train, max_length)
test_dataset = FakeReviewBERTDataset(X_test_text, X_test_category, X_test_rating, y_test, max_length)

# DataLoaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# BERT Model
class FakeReviewBERTModel(nn.Module):
    def __init__(self):
        super(FakeReviewBERTModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.fc = nn.Linear(self.bert.config.hidden_size + 2, 2)

    def forward(self, input_ids, attention_mask, x_category, x_rating):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = torch.cat([pooled_output, x_category.unsqueeze(1), x_rating.unsqueeze(1)], dim=1)
        x = self.fc(x)
        return x

# Training and Evaluation Functions
def train_model(model, train_loader, criterion, optimizer, num_epochs, device):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for i, batch in enumerate(train_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            category = batch['category'].to(device)
            rating = batch['rating'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask, category, rating)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        average_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch + 1}/{num_epochs}, Training Loss: {average_loss:.4f}')

def evaluate_model(model, test_loader, device):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for i, batch in enumerate(test_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            category = batch['category'].to(device)
            rating = batch['rating'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask, category, rating)
            _, predicted = torch.max(outputs, 1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        return None
    else:
        accuracy = correct / total
        print(f'Test Accuracy: {accuracy * 100:.2f}%')
        return accuracy

# Hyperparameters and Initialization
num_epochs = 10
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = FakeReviewBERTModel().to(device)
optimizer = optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

# Training and Evaluation
train_model(model, train_loader, criterion, optimizer, num_epochs, device)
evaluate_model(model, test_loader, device)

Epoch 1/10, Training Loss: 0.4169
Epoch 2/10, Training Loss: 0.2334
Epoch 3/10, Training Loss: 0.1729
Epoch 4/10, Training Loss: 0.1309
Epoch 5/10, Training Loss: 0.0984
Epoch 6/10, Training Loss: 0.0765
Epoch 7/10, Training Loss: 0.0617
Epoch 8/10, Training Loss: 0.0498
Epoch 9/10, Training Loss: 0.0421
Epoch 10/10, Training Loss: 0.0352
Test Accuracy: 94.83%


0.9483121058488932

In [None]:
torch.save(model.state_dict(), '/content/drive/MyDrive/Bert_amazon.pth')

In [None]:
model = FakeReviewBERTModel().to(device)
model.load_state_dict(torch.load('/content/drive/MyDrive/Bert_amazon.pth'))

# Continue training using the loaded weights
optimizer = optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

# Additional epochs for further training
additional_epochs = 5

# Training and Evaluation
train_model(model, train_loader, criterion, optimizer, additional_epochs, device)
evaluate_model(model, test_loader, device)

Epoch 1/5, Training Loss: 0.0330
Epoch 2/5, Training Loss: 0.0268
Epoch 3/5, Training Loss: 0.0288
Epoch 4/5, Training Loss: 0.0208
Epoch 5/5, Training Loss: 0.0226
Test Accuracy: 94.55%


0.9454680351180907