In [1]:
!pip install transformers==4.16 --quiet
!pip install underthesea --quiet
!pip install seaborn --quiet
!pip install numpy
!pip install torch
!pip install pandas
!pip install pyvi
!pip install gensim

[0m

In [2]:
import json
import re
import string

import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, classification_report
from underthesea import word_tokenize, text_normalize

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch import optim

from transformers import AutoModel, AutoTokenizer, get_scheduler
from tqdm.auto import tqdm
from functools import partial

import seaborn as sns
import matplotlib.pyplot as plt
import requests
import gc
import random

from pyvi import ViTokenizer
from gensim.utils import simple_preprocess

from torch.utils.data import TensorDataset

In [3]:
tqdm.pandas()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
# from transformers import DistilBertModel, DistilBertConfig
# config = DistilBertConfig(
#     num_labels=7,
#     vocab_size=64000,
#     max_position_embeddings=258
# )
# class StudentModel(nn.Module):
#     def __init__(self, n_classes, drop_out=0.2):
#         super(StudentModel, self).__init__()
#         self.distillbert = DistilBertModel(config)
#         self.dense = nn.Linear(768, 768)
#         self.activation = nn.Tanh()
#         self.l1 = torch.nn.Linear(768, 256)
#         self.d1 = torch.nn.Dropout(drop_out)
#         self.l2 = torch.nn.Linear(256, n_classes)
#     def forward(self, input_ids=None, attention_mask=None, inputs_embeds=None, labels=None):
#         if inputs_embeds is None:
#             output = self.distillbert(input_ids=input_ids, attention_mask=attention_mask)
#         else:
#             output = self.distillbert(inputs_embeds=inputs_embeds, attention_mask=attention_mask)
#         output = output[0][:, 0, :]
#         output = self.dense(output)
#         output = self.activation(output)
#         output = self.l1(output)
#         output = self.d1(output)
#         output = self.l2(output)
#         return output
# student_model = StudentModel(n_classes=7)
# student_model.to(device)

In [5]:
class StudentModel(nn.Module):
    def __init__(self, n_classes, drop_out=0.1):
        super(StudentModel, self).__init__()
        self.distillbert = AutoModel.from_pretrained("vinai/phobert-base")
        self.l1 = torch.nn.Linear(768, 256)
        self.l2 = torch.nn.Linear(256, n_classes)
        self.d1 = torch.nn.Dropout(drop_out)
    def forward(self,attention_mask, input_ids = None, inputs_embeds = None, labels=None):
        if inputs_embeds is None:
            output = self.distillbert(input_ids=input_ids, attention_mask=attention_mask)
        else:
            output = self.distillbert(inputs_embeds=inputs_embeds, attention_mask=attention_mask)
        output = output[1]
        output = self.l1(output)
        output = self.d1(output)
        output = self.l2(output)
        return output

student_model = StudentModel(n_classes=7)
student_model.to(device)

  state_dict = torch.load(resolved_archive_file, map_location="cpu")
Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


StudentModel(
  (distillbert): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(258, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): La

In [6]:
class TeacherModel(nn.Module):
    def __init__(self, n_classes, drop_out=0.1):
        super(TeacherModel, self).__init__()
        self.bert = AutoModel.from_pretrained("vinai/phobert-base-v2")
        self.l1 = torch.nn.Linear(768, 256)
        self.l2 = torch.nn.Linear(256, n_classes)
        self.d1 = torch.nn.Dropout(drop_out)
    def forward(self, inputs_embeds, attention_mask, labels=None):
        output = self.bert(inputs_embeds=inputs_embeds, attention_mask=attention_mask)
        output = output[1]
        output = self.l1(output)
        output = self.d1(output)
        output = self.l2(output)
        return output

teacher_model = TeacherModel(n_classes=7)
teacher_model.to(device)
teacher_model.load_state_dict(torch.load('/workspace/teacher_model.pth'), strict = False)

Some weights of the model checkpoint at vinai/phobert-base-v2 were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  teacher_mode

_IncompatibleKeys(missing_keys=['bert.embeddings.position_ids'], unexpected_keys=[])

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertModel, BertTokenizer, DistilBertModel, DistilBertConfig


hidden_size = 768 
embedding_size = hidden_size
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


teacher = teacher_model
student = student_model


optimizer_student = optim.Adam(student.parameters(), lr=1e-5)

def generate_pseudo_samples(batch_size, seq_len, hidden_size, mu=0, sigma=1):
    return torch.normal(mu, sigma, size=(batch_size, seq_len, hidden_size), device=device)

def loss_knowledge_distillation(student_logits, teacher_logits):
    return nn.KLDivLoss()(torch.log_softmax(student_logits, dim=-1), torch.softmax(teacher_logits, dim=-1))


num_epochs = 5000
batch_size = 32
seq_len = 120  
best_loss = 5
for epoch in range(num_epochs):

    pseudo_samples = generate_pseudo_samples(batch_size, seq_len, hidden_size)
    attention_mask = torch.ones(pseudo_samples.shape[:2], dtype=torch.long).to(device)
    random_class_indices = torch.randint(0, 7, (batch_size,), device=device)
    one_hot_targets = F.one_hot(random_class_indices, num_classes=7).float().to(device)

    for i in range(25): 
        pseudo_samples.requires_grad_(True)
        teacher_output = teacher(inputs_embeds=pseudo_samples, attention_mask=attention_mask)

        loss = nn.CrossEntropyLoss()(teacher_output, one_hot_targets)
        gradient = torch.autograd.grad(loss, pseudo_samples)[0]
        if i < 10:
            pseudo_samples = pseudo_samples - 0.01 * gradient
        elif 10 < i < 20:
            pseudo_samples = pseudo_samples - 0.005 * gradient
        else:
            pseudo_samples = pseudo_samples - 0.001 * gradient

    optimizer_student.zero_grad()
    student_output_final = student(inputs_embeds=pseudo_samples,attention_mask=attention_mask)
    teacher_logits = teacher(inputs_embeds=pseudo_samples, attention_mask=attention_mask)
    loss_kd = loss_knowledge_distillation(student_output_final, teacher_logits)
    loss_kd.backward()
    optimizer_student.step()
    if loss_kd < best_loss :
        best_loss = loss_kd
        torch.save(student_model.state_dict(), 'student_model_PhoBERTbase.pth')
    if epoch % 100 == 0 :
        print(f"Epoch {epoch}/{num_epochs}")
        print(f"Loss KD: {loss_kd.item()}")




Epoch 0/5000
Loss KD: 0.09338464587926865
Epoch 100/5000
Loss KD: 0.019221965223550797
Epoch 200/5000
Loss KD: 0.015342439524829388
Epoch 300/5000
Loss KD: 0.013766463845968246
Epoch 400/5000
Loss KD: 0.016366884112358093
Epoch 500/5000
Loss KD: 0.019581317901611328
Epoch 600/5000
Loss KD: 0.01222248189151287
Epoch 700/5000
Loss KD: 0.016264837235212326
Epoch 800/5000
Loss KD: 0.01744380034506321
Epoch 900/5000
Loss KD: 0.017902299761772156
Epoch 1000/5000
Loss KD: 0.019484158605337143
Epoch 1100/5000
Loss KD: 0.019944138824939728
Epoch 1200/5000
Loss KD: 0.01846480928361416
Epoch 1300/5000
Loss KD: 0.011035662144422531
Epoch 1400/5000
Loss KD: 0.018236376345157623
Epoch 1500/5000
Loss KD: 0.016027409583330154
Epoch 1600/5000
Loss KD: 0.014961634762585163
Epoch 1700/5000
Loss KD: 0.015059608966112137
Epoch 1800/5000
Loss KD: 0.015872897580266
Epoch 1900/5000
Loss KD: 0.01942596770823002
Epoch 2000/5000
Loss KD: 0.013578692451119423
Epoch 2100/5000
Loss KD: 0.015193152241408825
Epoch 22

In [8]:
class_names = ['Enjoyment', 'Disgust', 'Sadness', 'Anger', 'Surprise', 'Fear', 'Other']

In [34]:
class StudentModel(nn.Module):
    def __init__(self, n_classes, drop_out=0.1):
        super(StudentModel, self).__init__()
        self.bert = AutoModel.from_pretrained("vinai/phobert-base")
        self.l1 = torch.nn.Linear(768, 256)
        self.l2 = torch.nn.Linear(256, n_classes)
        self.d1 = torch.nn.Dropout(drop_out)

    def forward(self, input_ids, attention_mask, labels=None):
        output = self.bert(input_ids, attention_mask=attention_mask)
        output = output[1]
        output = self.l1(output)
        output = self.d1(output)
        output = self.l2(output)
        return output

In [None]:
student_model = StudentModel(n_classes=7)
student_model.to(device)
student_model.load_state_dict(torch.load('/workspace/student_model_PhoBERTbase.pth'), strict = False)

In [36]:
def infer(text, tokenizer, max_len=120):
    print(f'Text: {text}')
    text = ' '.join(simple_preprocess(text))
    text = ViTokenizer.tokenize(text)

    encoded_review = tokenizer.encode_plus(
        text,
        max_length=max_len,
        truncation=True,
        add_special_tokens=True,
        padding='max_length',
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors='pt',
    )

    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)

    output = student_model(input_ids, attention_mask)
    print(output.shape)
    _, y_pred = torch.max(output, dim=1)

    print(f'Sentiment: {class_names[y_pred]}')

In [37]:
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [33]:
infer('ngày hôm nay thật đẹp ', tokenizer)

Text: ngày hôm nay thật đẹp 
torch.Size([1, 7])
Sentiment: Enjoyment


In [13]:
!pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.9/250.9 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
[0m

In [14]:
def get_data(path):
    df = pd.read_excel(path, sheet_name=None)['Sheet1']
    df.columns = ['index', 'Emotion', 'Sentence']
    # unused column
    df.drop(columns=['index'], inplace=True)
    return df
test_df = get_data('/workspace/test_nor_811.xlsx')

In [15]:
class SentimentDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=120):
        self.df = df
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """
        To customize dataset, inherit from Dataset class and implement
        __len__ & __getitem__
        __getitem__ should return
            data:
                input_ids
                attention_masks
                text
                targets
        """
        row = self.df.iloc[index]
        text, label = self.get_input_data(row)

        # Encode_plus will:
        # (1) split text into token
        # (2) Add the '[CLS]' and '[SEP]' token to the start and end
        # (3) Truncate/Pad sentence to max length
        # (4) Map token to their IDS
        # (5) Create attention mask
        # (6) Return a dictionary of outputs
        encoding = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_masks': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(label, dtype=torch.long),
        }


    def labelencoder(self,text):
        if text=='Enjoyment':
            return 0
        elif text=='Disgust':
            return 1
        elif text=='Sadness':
            return 2
        elif text=='Anger':
            return 3
        elif text=='Surprise':
            return 4
        elif text=='Fear':
            return 5
        else:
            return 6

    def get_input_data(self, row):
        # Preprocessing: {remove icon, special character, lower}
        text = row['Sentence']
        text = ' '.join(simple_preprocess(text))
        text = ViTokenizer.tokenize(text)
        label = self.labelencoder(row['Emotion'])

        return text, label

In [16]:
test_dataset = SentimentDataset(test_df, tokenizer, max_len=50)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True, num_workers=2)

In [38]:
student_model.eval()
losses = []
correct = 0
with torch.no_grad():
    data_loader = test_loader
    for data in data_loader:
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_masks'].to(device)
        targets = data['targets'].to(device)

        outputs = student_model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        _, pred = torch.max(outputs, dim=1)

        correct += torch.sum(pred == targets)
        losses.append(loss.item())
print(f'Test Accuracy: {correct.double()/len(test_loader.dataset)} Loss: {np.mean(losses)}')

Test Accuracy: 0.6161616161616161 Loss: 3.285768985748291
