In [119]:
import torch
import pandas as pd

In [120]:
# Locations of the two labelled review corpora, relative to the notebook.
FAKE_REVIEWS_CSV = "./datasets/fake_review_data/fake_reviews_dataset.csv"
AMAZON_REVIEWS_TSV = "./datasets/Amazon_Reviews_full-product/amazon_reviews.txt"

# data1 is comma-separated (read_csv default); data2 is tab-separated with a
# header row on its first line.
data1 = pd.read_csv(FAKE_REVIEWS_CSV)
data2 = pd.read_csv(AMAZON_REVIEWS_TSV, sep="\t", header=0)

In [121]:
# Preview the first five rows of the fake-review corpus.
data1.head(n=5)

Unnamed: 0,category,rating,label,text
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...


In [122]:
# Preview the first five rows of the Amazon review corpus.
data2.head(n=5)

Unnamed: 0,DOC_ID,LABEL,RATING,VERIFIED_PURCHASE,PRODUCT_CATEGORY,PRODUCT_ID,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT
0,1,__label1__,4,N,PC,B00008NG7N,"Targus PAUK10U Ultra Mini USB Keypad, Black",useful,"When least you think so, this product will sav..."
1,2,__label1__,4,Y,Wireless,B00LH0Y3NM,Note 3 Battery : Stalion Strength Replacement ...,New era for batteries,Lithium batteries are something new introduced...
2,3,__label1__,3,N,Baby,B000I5UZ1Q,"Fisher-Price Papasan Cradle Swing, Starlight",doesn't swing very well.,I purchased this swing for my baby. She is 6 m...
3,4,__label1__,4,N,Office Products,B003822IRA,Casio MS-80B Standard Function Desktop Calculator,Great computing!,I was looking for an inexpensive desk calcolat...
4,5,__label1__,4,N,Beauty,B00PWSAXAM,Shine Whitening - Zero Peroxide Teeth Whitenin...,Only use twice a week,I only use it twice a week and the results are...


In [123]:
# Encode the string labels of data1 as integer class ids: "CG" -> 1, "OR" -> 0.
# Rows carrying any other label value are left untouched.
for raw_label, class_id in (("CG", 1), ("OR", 0)):
    data1.loc[data1["label"] == raw_label, "label"] = class_id

In [124]:
# Encode the string labels of data2 as integer class ids:
# "__label1__" -> 1, "__label2__" -> 0. Other values are left untouched.
for raw_label, class_id in (("__label1__", 1), ("__label2__", 0)):
    data2.loc[data2["LABEL"] == raw_label, "LABEL"] = class_id

In [125]:
# Compute device used for the model and every batch. Apple's MPS backend is
# preferred (as in the original notebook); on machines without it we fall back
# to CUDA and finally CPU so the notebook still runs end to end.
if torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
elif torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

In [126]:
from torch import Tensor
from torch.nn import Module,Dropout,Linear,Embedding,LayerNorm,BatchNorm1d
from torch.utils.data import DataLoader, Dataset

In [127]:
class CategoryReviewDataset(Dataset):
    """Dataset of tokenized reviews with their category, rating and label.

    Each row of ``data`` must provide the columns ``"text"``, ``"label"``,
    ``"category"`` and ``"rating"``.

    Args:
        data: pandas DataFrame holding the rows described above.
        tokenizer: callable invoked as ``tokenizer(text, return_tensors="pt", ...)``
            whose result supports item access/assignment and has an
            ``"input_ids"`` entry.
        max_length: maximum number of review tokens kept. Longer reviews are
            truncated so they never exceed the positional-encoding table
            (``PositionalEncodings`` defaults to 512 positions) and so batch
            memory stays bounded. Pass ``None`` to disable truncation.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and attach category ids, rating and label.

        Returns:
            The tokenizer output for the review, extended with the keys
            ``"category"`` (token ids of the category name), ``"rating"``
            (float32 scalar tensor) and ``"label"`` (int64 scalar tensor).
        """
        row = self.data.iloc[idx]
        # NOTE(review): "[EOS]" is appended as plain text; whether the tokenizer
        # treats it as a special token depends on its vocabulary - confirm.
        review = row["text"] + "[EOS]"
        label = row["label"]
        # "Home_and_Kitchen_5" -> "Home and Kitchen": drop the trailing piece
        # after the last underscore and join the rest with spaces.
        category = " ".join(row["category"].split("_")[:-1])
        rating = row["rating"]

        reviewEncoding = self.tokenizer(
            review,
            return_tensors="pt",
            truncation=self.max_length is not None,
            max_length=self.max_length,
        )
        reviewEncoding['category'] = self.tokenizer(category, return_tensors="pt")['input_ids']
        reviewEncoding['rating'] = torch.tensor(rating, dtype=torch.float32)
        # Class-index targets must be integer typed; coerce explicitly so an
        # object- or float-typed label column cannot yield a float target.
        reviewEncoding["label"] = torch.tensor(int(label), dtype=torch.long)
        return reviewEncoding

In [128]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """Merge dataset samples into padded batch tensors.

    Args:
        batch: list of samples, each exposing ``'input_ids'`` and ``'category'``
            tensors with a leading singleton dimension, plus scalar
            ``'rating'`` and ``"label"`` tensors.

    Returns:
        Tuple ``(text, categories, ratings, labels)``. ``text`` and
        ``categories`` are right-padded with 0 to the longest sequence in the
        batch (batch dimension first); ``ratings`` and ``labels`` are stacked
        along a new leading dimension.
    """
    # Drop the leading singleton dimension so every sequence is 1-D.
    token_seqs = [item['input_ids'].squeeze(0) for item in batch]
    category_seqs = [item['category'].squeeze(0) for item in batch]
    rating_list = [item['rating'] for item in batch]
    label_list = [item["label"] for item in batch]

    padded_text = pad_sequence(token_seqs, batch_first=True, padding_value=0)
    padded_categories = pad_sequence(category_seqs, batch_first=True, padding_value=0)
    return padded_text, padded_categories, torch.stack(rating_list), torch.stack(label_list)

In [129]:
class PositionalEncodings(Module):
    """Fixed sinusoidal positional-encoding table.

    A ``(max_len, d_model)`` table is precomputed once and registered as the
    buffer ``pe``: even columns hold sines, odd columns hold cosines of the
    position scaled by geometrically decreasing frequencies.

    Args:
        d_model: width of each encoding vector (odd values are supported).
        max_len: number of positions in the table.
    """

    def __init__(self, d_model:int, max_len:int=512):
        super().__init__()
        self.d_model = d_model
        self.max_len = max_len

        pe = torch.zeros((max_len, d_model))
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(torch.log(torch.tensor(10000.0)) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer cosine column than sine
        # columns, so trim the frequency vector to match.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x:Tensor):
        """Return the encodings for the first ``x.size(1)`` positions.

        Only the size of dimension 1 of ``x`` is used; its values are ignored.

        Returns:
            Tensor of shape ``(1, x.size(1), d_model)``.

        Raises:
            ValueError: if ``x.size(1)`` exceeds ``max_len``. Slicing alone
                would silently return fewer rows than requested.
        """
        seq_len = x.size(1)
        if seq_len > self.max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len={self.max_len} of the positional table"
            )
        return (self.pe[:seq_len, :]).unsqueeze(0)

In [130]:
from typing import Any

class FakeReviewIdentifier(Module):
    """Classifier combining review text, product category and star rating.

    Review and category token ids share one embedding table plus positional
    encodings, and are handed to ``model`` together with the attention mask.
    The rating is projected to ``d_model`` by a small MLP and appended as one
    extra sequence position. The sequence is max-pooled over positions and
    classified by a two-layer head.

    Args:
        model: module called as ``model(text_emb, category_emb, attnMask)``.
        d_model: embedding / hidden width.
        num_classes: number of output logits.
        vocab_size: size of the shared token embedding table.
        dropout_rate: dropout probability applied to the pooled features.
    """

    def __init__(self,
                 model,
                 d_model:int,
                 num_classes:int,
                 vocab_size:int,
                 dropout_rate:float):
        
        super().__init__()
        self.embedding = Embedding(vocab_size, d_model)
        self.positionalEncoding = PositionalEncodings(d_model)

        self.model = model
        self.dropout = Dropout(dropout_rate)
        # Not used in forward(); kept so the parameter set and state_dict keys
        # stay the same as before.
        self.layerNorm = LayerNorm(d_model)

        self.fc1 = Linear(d_model,d_model//2)
        self.fc2 = Linear(d_model//2,num_classes)

        self.fcNum1 = Linear(1,d_model//2)
        self.fcNum2 = Linear(d_model//2,d_model)

    def forward(self, text, category, rating, attnMask):
        """Compute class logits.

        Assumes ``text`` and ``category`` are ``(batch, seq)`` integer ids and
        ``rating`` is ``(batch, 1)`` float, as produced by this notebook's
        ``collate_fn`` / ``epoch`` - confirm for other callers.

        Returns:
            Tensor of shape ``(batch, num_classes)``.
        """
        x = self.embedding(text) + self.positionalEncoding(text)
        xCategory = self.embedding(category) + self.positionalEncoding(category)

        xOut = self.model(x,xCategory,attnMask)

        # ReLU between the two projections: without it the stacked Linear
        # layers collapse into a single linear map.
        xRatOut = self.fcNum2(torch.relu(self.fcNum1(rating)))

        # Append the rating as one extra position: (batch, seq + 1, d_model).
        xRatOut = xRatOut.unsqueeze(1)
        x = torch.cat([xOut,xRatOut],dim=1)

        # Max-pool over positions. torch.max(dim=...) returns (values, indices);
        # the pooled *values* are the features. Taking the indices instead would
        # feed position numbers to the head and cut the gradient to the encoder.
        x,_ = torch.max(x,dim=1)
        x = self.dropout(x)

        out = self.fc2(torch.relu(self.fc1(x)))

        return out

In [131]:
# 80 / 10 / 10 train / validation / test split of data1.
# The seed makes the split reproducible across kernel restarts.
SPLIT_SEED = 42

train_data = data1.sample(frac=0.8, random_state=SPLIT_SEED)
holdout_data = data1.drop(train_data.index)

# Half of the held-out rows become the test set; they are then removed from the
# validation set so the two never share a row (previously every test row was
# also a validation row).
test_data = holdout_data.sample(frac=0.5, random_state=SPLIT_SEED)
val_data = holdout_data.drop(test_data.index)

In [132]:
from transformers import AutoTokenizer,AutoModel

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")


def _build_loader(frame):
    """Wrap one dataframe split in a shuffled DataLoader of 64-sample padded batches."""
    return DataLoader(
        CategoryReviewDataset(frame, tokenizer),
        batch_size=64,
        shuffle=True,
        collate_fn=collate_fn,
    )


# One loader per split; all three share the same tokenizer and batching setup.
train_loader = _build_loader(train_data)
val_loader = _build_loader(val_data)
test_loader = _build_loader(test_data)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [133]:
class CategoryAttention(Module):
    """Multi-head scaled dot-product attention with separate query and key/value inputs.

    Args:
        d_model: model width; must be divisible by ``n_heads``.
        n_heads: number of attention heads.
    """

    def __init__(self,
                 d_model:int=512,
                 n_heads:int=8):
        super().__init__()
        assert d_model % n_heads == 0, "d_model must be divisible by n_head"
        self.d_model = d_model
        self.n_heads = n_heads
        self.head_dim = d_model//n_heads

        self.Wk = Linear(d_model,d_model)
        self.Wq = Linear(d_model,d_model)
        self.Wv = Linear(d_model,d_model)
        self.Wo = Linear(d_model,d_model)

    def forward(self,query,key,value,mask=None):
        """Attend from ``query`` positions to ``key``/``value`` positions.

        Args:
            query: tensor of shape ``(batch, q_len, d_model)``.
            key: tensor of shape ``(batch, k_len, d_model)``.
            value: tensor of shape ``(batch, k_len, d_model)``.
            mask: optional tensor broadcastable to
                ``(batch, n_heads, q_len, k_len)``; positions where it equals 0
                are excluded from attention.

        Returns:
            Tensor of shape ``(batch, q_len, d_model)``.
        """
        batch_size = query.size(0)

        Q = self.Wq(query)
        K = self.Wk(key)
        V = self.Wv(value)

        # Split the model dimension into heads: (batch, heads, seq, head_dim).
        Q = Q.view(batch_size,-1,self.n_heads,self.head_dim).permute(0,2,1,3)
        K = K.view(batch_size,-1,self.n_heads,self.head_dim).permute(0,2,1,3)
        V = V.view(batch_size,-1,self.n_heads,self.head_dim).permute(0,2,1,3)

        # Scaled scores: (batch, heads, q_len, k_len).
        energy = torch.matmul(Q,K.transpose(-2,-1)) / (self.head_dim ** 0.5)
        if mask is not None:
            energy = energy.masked_fill(mask==0,float("-1e20"))

        # Normalize over the key positions (last dim) so each query's weights
        # sum to 1. Normalizing over dim=1 would mix the heads instead.
        attnWeights = torch.softmax(energy,dim=-1)
        out = torch.matmul(attnWeights,V)
        # Merge the heads back: (batch, q_len, d_model).
        out = out.permute(0,2,1,3).contiguous().view(batch_size,-1,self.d_model)

        out = self.Wo(out)
        return out

In [134]:
import torch.nn.functional as F

class TransformerEncoderLayer(Module):
    """One encoder block: cross-attention followed by a position-wise feed-forward net.

    Args:
        d_model: width of the token representations.
        n_heads: number of attention heads passed to ``CategoryAttention``.
        dim_feedforward: hidden width of the feed-forward network.
        dropout: probability used by all three dropout modules.
        activation: callable applied to the first feed-forward projection.
    """

    def __init__(self,
                 d_model:int=512,
                 n_heads:int=8,
                 dim_feedforward:int=2048,
                 dropout:float=0.1,
                 activation:Any = F.relu_):

        super().__init__()
        # Sub-modules are created in a fixed order so parameter initialisation
        # and state_dict layout stay stable.
        self.cat_attn = CategoryAttention(d_model=d_model, n_heads=n_heads)

        self.fc1 = Linear(in_features=d_model, out_features=dim_feedforward)
        self.dropout = Dropout(p=dropout)

        self.fc2 = Linear(in_features=dim_feedforward, out_features=d_model)
        self.norm1 = LayerNorm(normalized_shape=d_model)
        self.norm2 = LayerNorm(normalized_shape=d_model)

        self.dropout1 = Dropout(p=dropout)
        self.dropout2 = Dropout(p=dropout)

        self.activation = activation

    def forward(self,
                sent1:Tensor,
                sent2:Tensor,
                attnMask:Tensor=None):
        """Let ``sent1`` attend to ``sent2`` and refine the result.

        ``sent1`` supplies the queries; ``sent2`` supplies both keys and values.

        Returns:
            Tensor with the same shape as ``sent1``.
        """
        # Attention sub-layer with a residual connection around it.
        attended = self.cat_attn(sent1, sent2, sent2, attnMask)
        residual = sent1 + self.dropout1(attended)

        # Feed-forward sub-layer operates on the normalized residual stream.
        hidden = self.fc1(self.norm1(residual))
        hidden = self.dropout(self.activation(hidden))
        projected = self.fc2(hidden)

        # The second residual adds onto the un-normalized stream, then normalizes.
        return self.norm2(residual + self.dropout2(projected))

In [135]:
from torch.nn import ModuleList

class TransformerEncoder(Module):
    """Stack of ``num_layers`` independently constructed encoder layers.

    Args:
        encoder_layer: layer class (or factory), called as
            ``encoder_layer(d_model, n_heads, dim_feedforward, dropout, activation)``
            once per layer.
        num_layers: how many layers to build.
        d_model, n_heads, dim_feedforward, dropout, activation: forwarded
            unchanged to every layer constructor.
    """

    def __init__(self,
                 encoder_layer:Module,
                 num_layers:int=8,
                 d_model:int=512,
                 n_heads:int=8,
                 dim_feedforward:int=2048,
                 dropout:float=0.1,
                 activation:Any=F.relu_):

        super().__init__()
        layer_args = (d_model, n_heads, dim_feedforward, dropout, activation)
        stack = []
        for _ in range(num_layers):
            stack.append(encoder_layer(*layer_args))
        self.layers = ModuleList(stack)

    def forward(self,
                sent1:Tensor,
                sent2:Tensor,
                mask:Any=None):
        """Run ``sent1`` through every layer in order.

        ``sent2`` and ``mask`` are passed unchanged to each layer; only the
        first argument is updated from one layer to the next.
        """
        hidden = sent1
        for block in self.layers:
            hidden = block(hidden, sent2, mask)
        return hidden

In [136]:
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
import torch.nn.functional as F

# --- Hyperparameters ---------------------------------------------------------
# NOTE(review): 32 layers with 64-sample batches is a large configuration and
# may contribute to the MPS out-of-memory error recorded at the end of this
# notebook - confirm before the next run.
d_model = 256            # embedding / hidden width
n_heads = 32             # attention heads per layer (head_dim = d_model // n_heads)
num_layers = 32          # stacked encoder layers
num_classes = 2          # output logits
vocab_size = len(tokenizer)
dropout_rate = 0.2
dim_feedforward = 1024   # hidden width of each feed-forward block
activation = F.relu

# --- Model -------------------------------------------------------------------
baseModel = TransformerEncoder(
    encoder_layer=TransformerEncoderLayer,
    num_layers=num_layers,
    d_model=d_model,
    n_heads=n_heads,
    dim_feedforward=dim_feedforward,
    dropout=dropout_rate,
    activation=activation,
)

model = FakeReviewIdentifier(
    model=baseModel,
    d_model=d_model,
    num_classes=num_classes,
    vocab_size=vocab_size,
    dropout_rate=dropout_rate,
)

# --- Optimisation ------------------------------------------------------------
optimizer = Adam(params=model.parameters(), lr=2e-3)
loss_fn = CrossEntropyLoss()

In [137]:
# Release cached MPS allocator memory before training starts. Skipped when the
# MPS backend is unavailable so the cell also runs on other machines.
if torch.backends.mps.is_available():
    torch.mps.empty_cache()

In [138]:
def epoch(loader, model, loss_fn, optimizer, is_train:bool=False):
    """Run one full pass over ``loader`` and report mean loss and accuracy.

    Args:
        loader: sized iterable yielding ``(text, category, rating, labels)``
            tensor batches.
        model: module called as ``model(text, category, rating, attnMask=None)``
            and returning per-class logits.
        loss_fn: callable ``loss_fn(logits, labels)`` returning a scalar tensor.
        optimizer: optimizer stepped after every batch when ``is_train`` is
            true; unused otherwise.
        is_train: enables train mode, gradient tracking and parameter updates.

    Returns:
        ``(mean_loss, mean_accuracy)``, both averaged over the batches.

    Raises:
        ValueError: if ``loader`` has no batches.
    """
    num_batches = len(loader)
    if num_batches == 0:
        raise ValueError("loader has no batches")

    # Batches must live on the same device as the model; fall back to the
    # notebook-wide DEVICE only for a model without parameters.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else DEVICE

    model.train(is_train)
    total_loss = 0.0
    total_acc = 0.0
    for text, category, rating, labels in loader:
        text = text.to(device)
        category = category.to(device)

        # The model expects one rating per row as a column vector.
        rating = rating.view(-1,1).to(device)
        labels = labels.to(device)

        with torch.set_grad_enabled(is_train):
            out = model(text, category, rating, attnMask=None)
            loss = loss_fn(out, labels)
            preds = out.argmax(dim=1)
            acc = (preds==labels).sum().item()/labels.size(0)
            if is_train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            total_loss += loss.item()
            total_acc += acc
    # Average the accumulated accuracy, not just the last batch's value.
    return total_loss / num_batches, total_acc / num_batches

In [139]:
# Training with early stopping: stop once the validation loss has failed to
# improve for `patience` consecutive epochs.
num_epochs = 100
train_losses, val_losses = [], []
patience = 10
min_val_loss = float("inf")
bad_epochs = 0

# Moving the model is loop-invariant, so do it once before training.
model.to(DEVICE)

for currEpoch in range(num_epochs):
    train_loss,train_acc = epoch(train_loader, model, loss_fn, optimizer, is_train=True)
    val_loss,val_acc = epoch(val_loader, model, loss_fn, optimizer, is_train=False)

    print(f"Epoch {currEpoch+1}")
    print(f"Train Loss: {train_loss:.4f} Train Accuracy: {train_acc:.4f} | Val Loss: {val_loss:.4f} Val Accuracy: {val_acc:.4f}")

    train_losses.append(train_loss)
    val_losses.append(val_loss)

    if val_loss < min_val_loss:
        min_val_loss = val_loss
        bad_epochs = 0
    else:
        bad_epochs += 1
    # `>=` rather than `==` so the stop still triggers if the counter ever
    # starts above the threshold (e.g. patience lowered between runs).
    if bad_epochs >= patience:
        break

RuntimeError: MPS backend out of memory (MPS allocated: 18.14 GB, other allocations: 704.00 KB, max allowed: 18.13 GB). Tried to allocate 125.25 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).