In [1]:
import os
import polars as pl
import py_vncorenlp
import torch
from transformers import (
    AutoModel,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModel,
    logging,
)
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader

logging.set_verbosity_error()

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
EPOCHS = 6
N_SPLITS = 10

phobert = AutoModel.from_pretrained("vinai/phobert-base-v2")
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
model = AutoModelForSequenceClassification.from_pretrained(
    "NlpHUST/vibert4news-base-cased", num_labels=5
)


def seed_everything(seed_value):
    torch.manual_seed(seed_value)
    torch.manual_seed(seed_value)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = True


seed_everything(86)

In [2]:
dataset = (
    pl.read_parquet(
        "hf://datasets/khangnghiem/public/transform/company_reviews/transformed_company_reviews.parquet"
    )
    .select("segmented_review", "review_rating")
    .with_columns(
        pl.col("segmented_review")
        .list.join(" ")
        .map_elements(tokenizer.encode, return_dtype=pl.List(pl.Int32))
    )
)
print(dataset.sample(5))
# Define the lengths for train, validation, and test splits
total_length = len(dataset)
train_length = int(0.8 * total_length)  # 80% for training
val_length = int(0.1 * total_length)  # 10% for validation
test_length = total_length - train_length - val_length  # Remaining 10% for testing

# Split the dataset
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [train_length, val_length, test_length]
)


shape: (5, 2)
┌──────────────────┬───────────────┐
│ segmented_review ┆ review_rating │
│ ---              ┆ ---           │
│ list[i32]        ┆ f64           │
╞══════════════════╪═══════════════╡
│ [0, 6676, … 2]   ┆ 4.0           │
│ [0, 48640, … 2]  ┆ 5.0           │
│ [0, 290, … 2]    ┆ 5.0           │
│ [0, 2174, … 2]   ┆ 4.0           │
│ [0, 6676, … 2]   ┆ 1.0           │
└──────────────────┴───────────────┘


In [3]:
from sklearn.metrics import f1_score


# Define a simple PyTorch Dataset
class ReviewDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        review = self.data[idx]["segmented_review"]
        label = self.data[idx]["review_rating"] - 1  # Adjust labels to be 0-indexed
        return torch.tensor(review, dtype=torch.long), torch.tensor(
            label, dtype=torch.long
        )


# Create DataLoader for training, validation, and testing
train_loader = DataLoader(ReviewDataset(train_dataset), batch_size=16, shuffle=True)
val_loader = DataLoader(ReviewDataset(val_dataset), batch_size=16)
test_loader = DataLoader(ReviewDataset(test_dataset), batch_size=16)


# Define the model
class SentimentClassifier(nn.Module):
    def __init__(self, base_model, num_labels):
        super(SentimentClassifier, self).__init__()
        self.base_model = base_model
        self.classifier = nn.Linear(base_model.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.classifier(outputs.last_hidden_state[:, 0, :])  # Use [CLS] token
        return logits


sentiment_model = SentimentClassifier(phobert, num_labels=5).to(device)

# Define optimizer and loss function
optimizer = AdamW(sentiment_model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()

# Training loop
for epoch in range(EPOCHS):
    sentiment_model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids, labels = batch
        input_ids, labels = input_ids.to(device), labels.to(device)

        optimizer.zero_grad()
        logits = sentiment_model(input_ids)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}/{EPOCHS}, Loss: {total_loss / len(train_loader)}")

# Validation loop
sentiment_model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for batch in val_loader:
        input_ids, labels = batch
        input_ids, labels = input_ids.to(device), labels.to(device)

        logits = sentiment_model(input_ids)
        preds = torch.argmax(logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

f1 = f1_score(all_labels, all_preds, average="weighted")
print(f"Validation F1 Score: {f1}")

RuntimeError: stack expects each tensor to be equal size, but got [1, 143] at entry 0 and [1, 242] at entry 1

In [None]:
model.eval()
sentence = "Chúng_tôi là những nghiên_cứu_viên ."

input_ids = torch.tensor([tokenizer.encode(sentence)])

with torch.no_grad():
    features = phobert(input_ids)  # Models outputs are now tuples

## With TensorFlow 2.0+:
# from transformers import TFAutoModel
# phobert = TFAutoModel.from_pretrained("vinai/phobert-base")


config.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/537M [00:00<?, ?B/s]

In [None]:
def predict_sentiment(text):
    preprocessed = text
    inputs = tokenizer(preprocessed, return_tensors="pt", padding=True, truncation=True)

    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)
        predicted = torch.argmax(probs, dim=1).item()

    return predicted, probs


BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 3.8362e-02,  7.0703e-01, -1.3202e-01,  ..., -9.7446e-02,
           2.5193e-01,  3.4828e-01],
         [ 2.1041e-01,  2.3984e-01,  9.1066e-03,  ..., -3.2366e-04,
          -1.7492e-01,  4.0127e-02],
         [ 2.3745e-01,  9.8413e-03, -1.6509e-01,  ..., -4.3378e-02,
          -7.5783e-02,  4.6839e-02],
         ...,
         [ 2.3041e-01,  3.7583e-01,  1.7601e-02,  ...,  7.8471e-02,
           1.8661e-01,  5.2052e-02],
         [-2.3020e-01,  5.0276e-01,  1.0913e-01,  ..., -7.3261e-02,
           1.4339e-01,  1.8320e-01],
         [ 1.8813e-01,  6.2870e-01, -2.4809e-01,  ..., -4.8115e-02,
           1.6404e-01,  4.7204e-01]]]), pooler_output=tensor([[ 6.3029e-02, -1.1201e-01, -5.8864e-02,  3.0363e-02, -1.7679e-01,
          1.0654e-01, -8.5272e-02, -1.4876e-02, -1.6943e-01,  1.4987e-01,
         -2.0341e-01,  1.7840e-01, -5.9578e-02, -3.6399e-01, -5.2988e-02,
         -1.2848e-01, -3.5642e-02,  1.3621e-01,  5.565