In [69]:
import torch
import pandas as pd

In [70]:
data1 = pd.read_csv("./datasets/fake_review_data/fake_reviews_dataset.csv")
data2 = pd.read_csv("./datasets/Amazon_Reviews_full-product/amazon_reviews.txt",delimiter="\t",header=0)

In [71]:
data1.head()

Unnamed: 0,category,rating,label,text
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...


In [72]:
data2.head()

Unnamed: 0,DOC_ID,LABEL,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_ID,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT
0,1,__label1__,4,N,PC,B00008NG7N,"Targus PAUK10U Ultra Mini USB Keypad, Black",useful,"When least you think so, this product will sav..."
1,2,__label1__,4,Y,Wireless,B00LH0Y3NM,Note 3 Battery : Stalion Strength Replacement ...,New era for batteries,Lithium batteries are something new introduced...
2,3,__label1__,3,N,Baby,B000I5UZ1Q,"Fisher-Price Papasan Cradle Swing, Starlight",doesn't swing very well.,I purchased this swing for my baby. She is 6 m...
3,4,__label1__,4,N,Office Products,B003822IRA,Casio MS-80B Standard Function Desktop Calculator,Great computing!,I was looking for an inexpensive desk calcolat...
4,5,__label1__,4,N,Beauty,B00PWSAXAM,Shine Whitening - Zero Peroxide Teeth Whitenin...,Only use twice a week,I only use it twice a week and the results are...


In [73]:
data1.loc[data1["label"] == "CG", "label"] = 1
data1.loc[data1["label"] == "OR", "label"] = 0

In [74]:
data2.loc[data2["LABEL"] == "__label1__", "LABEL"] = 1
data2.loc[data2["LABEL"] == "__label2__", "LABEL"] = 0

In [75]:
DEVICE = torch.device("mps")

In [76]:
from torch import Tensor
from torch.nn import Module,Dropout,Linear,Embedding,LayerNorm,BatchNorm1d
from torch.utils.data import DataLoader, Dataset

In [77]:
class CategoryReviewDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        review = row["text"] + "[EOS]"
        label = row["label"]
        category = " ".join(row["category"].split("_")[:-1])
        rating = row["rating"]
        
        reviewEncoding = self.tokenizer(review, return_tensors="pt")
        reviewEncoding['category'] = self.tokenizer(category, return_tensors="pt")['input_ids']
        reviewEncoding['rating'] = torch.tensor(rating, dtype=torch.float32)
        reviewEncoding["label"] = torch.tensor(label)
        return reviewEncoding

In [78]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    text, categories, ratings, labels = [], [], [], []
    for sample in batch:
        text.append(sample['input_ids'].squeeze(0))
        categories.append(sample['category'].squeeze(0))
        ratings.append(sample['rating'])
        labels.append(sample["label"])
    text = pad_sequence(text,batch_first=True,padding_value=0)
    categories = pad_sequence(categories,batch_first=True,padding_value=0)
    return text, categories, torch.stack(ratings), torch.stack(labels)

In [79]:
class PositionalEncodings(Module):
    def __init__(self, d_model:int, max_len:int=512):
        super().__init__()
        self.d_model = d_model
        self.max_len = max_len

        pe = torch.zeros((max_len, d_model))
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(torch.log(torch.tensor(10000.0)) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x:Tensor):
        return (self.pe[:x.size(1), :]).unsqueeze(0)

In [80]:
from typing import Any

class FakeReviewIdentifier(Module):
    def __init__(self,
                 model,
                 d_model:int,
                 num_classes:int,
                 vocab_size:int,
                 dropout_rate:float):
        
        super().__init__()
        self.embedding = Embedding(vocab_size, d_model)
        self.positionalEncoding = PositionalEncodings(d_model)

        self.model = model
        self.dropout = Dropout(dropout_rate)
        self.layerNorm = LayerNorm(d_model)

        self.fc1 = Linear(d_model,d_model//2)
        self.fc2 = Linear(d_model//2,num_classes)

        self.fcNum1 = Linear(1,d_model//2)
        self.fcNum2 = Linear(d_model//2,d_model)

    def forward(self, text, category, rating, attention_mask):
        xOut = self.model(text, attention_mask=attention_mask)
        xOut = self.layerNorm(xOut)
        xOut = xOut.pooler_output
        xOut = self.dropout(xOut)

        xCatOut = self.model(category)
        xCatOut = self.layerNorm(xCatOut)
        xCatOut = xCatOut.pooler_output
        xCatOut = self.dropout(xCatOut)

        xRatOut = self.fcNum1(rating)
        xRatOut = self.fcNum2(xRatOut)

        x = torch.cat([xOut,xRatOut],dim=1)
        out = self.fc1(x)
        out = self.fc2(out)

        return out

In [81]:
train_data = data1.sample(frac=0.8)
val_data = data1.drop(train_data.index)
test_data = val_data.sample(frac=0.5)

In [82]:
from transformers import AutoTokenizer,AutoModel

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")

train_loader = DataLoader(CategoryReviewDataset(train_data, tokenizer), batch_size=128, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(CategoryReviewDataset(val_data, tokenizer), batch_size=128, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(CategoryReviewDataset(test_data, tokenizer), batch_size=128, shuffle=True, collate_fn=collate_fn)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [83]:
from torch.optim import Adam
from torch.nn import CrossEntropyLoss

preTrainedModel = AutoModel.from_pretrained("bert-base-cased")

for param in preTrainedModel.parameters():
    param.requires_grad = False

d_model = preTrainedModel.config.hidden_size
num_classes = 2
vocab_size = len(tokenizer)
dropout_rate = 0.1

model = FakeReviewIdentifier(preTrainedModel,d_model,num_classes,vocab_size,dropout_rate)

optimizer = Adam(model.parameters(), lr=1e-3)
loss_fn = CrossEntropyLoss()

In [84]:
def epoch(loader, model, loss_fn, optimizer, is_train:bool=False):
    model.train(is_train)
    total_loss = 0
    for text, category, rating, labels in loader:
        text = text.to(DEVICE)
        category = category.to(DEVICE)
        
        rating = rating.to(DEVICE)
        labels = labels.to(DEVICE)

        with torch.set_grad_enabled(is_train):
            out = model(text, category, rating, attention_mask=(text != 0))
            loss = loss_fn(out, labels)
            if is_train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            total_loss += loss.item()
    return total_loss / len(loader)

In [85]:
num_epochs = 100
train_losses, val_losses = [], []
patience = 5
min_val_loss = 1e9
bad_epochs = 0

for currEpoch in range(num_epochs):
    model.to(DEVICE)
    
    train_loss = epoch(train_loader, model, loss_fn, optimizer, is_train=True)
    val_loss = epoch(val_loader, model, loss_fn, optimizer, is_train=False)

    print(f"Epoch {currEpoch+1} | Train Loss: {train_loss:.4f} Val Loss: {val_loss:.4f}")

    train_losses.append(train_loss)
    val_losses.append(val_loss)

    if val_loss < min_val_loss:
        min_val_loss = val_loss
        bad_epochs = 0
    else:
        bad_epochs += 1
    if bad_epochs == patience:
        break

RuntimeError: MPS backend out of memory (MPS allocated: 14.23 GB, other allocations: 3.16 GB, max allowed: 18.13 GB). Tried to allocate 1.40 GB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).