In [None]:
! pip install -qq transformers
! pip install -qq --upgrade xlrd

In [None]:
import numpy as np
np.random.seed(0)
import pandas as pd

import os
from tqdm import tqdm

from collections import defaultdict
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup

import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP

from _thread import start_new_thread
from functools import wraps
import traceback

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import copy
import time

In [None]:
from google.colab import drive

# Mount Google Drive and build the project folder from the same mount point.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)
# Use an absolute path: a relative 'drive/...' only resolves while the working
# directory is /content, and breaks silently after any os.chdir / %cd.
root = f'{MOUNT_POINT}/MyDrive/Colab Notebooks/BertLSTMCLF'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Run on the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
class SentimentClassifier(nn.Module):
    """BERT encoder followed by a bidirectional LSTM and a small MLP head.

    Args:
        hidden_dim: Hidden size of each LSTM direction; also the width of the
            head's inner linear layers.
        n_layers: Number of stacked LSTM layers.
        n_classes: Number of logits produced per input sequence.
    """

    def __init__(self, hidden_dim, n_layers, n_classes):
        super().__init__()

        # Pretrained Chinese BERT supplies the per-token representations.
        self.bert = BertModel.from_pretrained("bert-base-chinese")

        # Bidirectional, batch-first LSTM that reads the BERT token states.
        self.lstm = nn.LSTM(
            input_size=self.bert.config.hidden_size,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            batch_first=True,
            bidirectional=True,
        )

        # Registered on the module, but forward() does not apply it.
        self.drop = nn.Dropout(p=0.5)

        # The head consumes the final hidden state of every (layer, direction)
        # pair concatenated together: 2 directions * n_layers * hidden_dim.
        head_in_features = hidden_dim * 2 * n_layers
        self.out = nn.Sequential(
            nn.Linear(head_in_features, hidden_dim),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ELU(),
            nn.Linear(hidden_dim, n_classes),
        )

    def forward(self, encoded_input):
        """Return unnormalised class scores of shape [batch_size, n_classes].

        Args:
            encoded_input: Mapping of keyword arguments forwarded verbatim to
                the BERT encoder (e.g. the dict produced by the tokenizer).
        """
        # [batch_size, sequence_length, bert_hidden_size]
        token_states = self.bert(**encoded_input).last_hidden_state
        batch_size = token_states.size(0)

        # Re-compact the LSTM weights; silences the "RNN module weights are not
        # part of single contiguous chunk of memory" UserWarning.
        self.lstm.flatten_parameters()

        # The sequence is fed unpacked (no pack_padded_sequence), so padded
        # positions are processed by the LSTM as well.
        # final_hidden -> [num_directions * num_layers, batch_size, hidden_dim]
        _, (final_hidden, _) = self.lstm(token_states)

        # Put the batch axis first and flatten the remaining axes:
        # [batch_size, num_directions * num_layers * hidden_dim]
        flat_hidden = final_hidden.transpose(0, 1).reshape(batch_size, -1)

        return self.out(flat_hidden)

In [None]:
# Load the source workbook from the project folder `root` into a DataFrame.
excel_path = f"{root}/final.xls"
data = pd.read_excel(excel_path)

In [None]:
# Class index -> mood label (joy, anger, disgust, low spirits).
# The labels are also used as the column names of the exported CSV.
moods = {0: '喜悦', 1: '愤怒', 2: '厌恶', 3: '低落'}

# Build the classifier and restore the fine-tuned weights.
# The constructor arguments must describe the same architecture as the
# checkpoint, otherwise load_state_dict() raises on mismatched keys/shapes.
model = SentimentClassifier(hidden_dim=256, n_layers=2, n_classes=len(moods))
# map_location=device lets a checkpoint that was saved from a GPU be restored on
# a CPU-only runtime; without it torch.load fails when CUDA is unavailable.
model.load_state_dict(torch.load(f'{root}/3_0.6505.pt', map_location=device))
model = model.to(device)
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path = "bert-base-chinese") # bert-base-uncased

# Inference only: switch off training-time behaviour such as dropout.
model.eval()
# Score every row of `data`.
# The three text columns are joined, split into lines, and lines of five
# characters or fewer are dropped. The remaining lines are classified as one
# batch and their softmax distributions are averaged into a single vector.
TEXT_COLUMNS = ('translation', 'appreciation', 'background')

eval_moods = []  # one mood vector per row, in row order
with torch.no_grad():
    for row_idx, row_content in tqdm(data.iterrows(), total=len(data)):
        # Empty spreadsheet cells are read as NaN; treat them as empty text
        # instead of letting `str + float` raise a TypeError.
        fields = (row_content[col] for col in TEXT_COLUMNS)
        merged = '\n'.join('' if pd.isna(value) else str(value) for value in fields)
        text = [segment for segment in merged.split('\n') if len(segment) > 5]

        # Any non-empty batch can be scored. The previous `len(text) > 1` check
        # silently gave rows with exactly one usable line an all-zero vector.
        if text:
            encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors="pt", max_length=256)
            encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
            output = model(encoded_input).softmax(dim=1).mean(0).cpu().numpy()
            eval_moods.append(output)
        else:
            # No usable text: keep the row aligned with `data` using an
            # all-zero placeholder as wide as the label set.
            eval_moods.append(np.zeros(len(moods)))

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
6798it [32:14,  3.51it/s]


In [None]:
# Sanity-check the classifier on a few hand-written sentences.
# Each sentence is scored as its own single-item batch; the input and the
# resulting probability vector are printed.
sample_texts = (
    ['我好难过啊，今天好倒霉'],
    ['我好开心啊！！！今天中奖了！'],
    ['想不通怎么可以这样呢？这是人做的事吗？'],
    ['这也太恶心了'],
)

for text in sample_texts:
    print(text)
    with torch.no_grad():
        encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors="pt", max_length=256)
        encoded_input = {name: tensor.to(device) for name, tensor in encoded_input.items()}
        output = model(encoded_input).softmax(dim=1).mean(0)
        print(output)

['我好难过啊，今天好倒霉']
tensor([0.0050, 0.0343, 0.4493, 0.5114], device='cuda:0')
['我好开心啊！！！今天中奖了！']
tensor([0.8622, 0.1034, 0.0173, 0.0170], device='cuda:0')
['想不通怎么可以这样呢？这是人做的事吗？']
tensor([0.1087, 0.8396, 0.0273, 0.0245], device='cuda:0')
['这也太恶心了']
tensor([0.0864, 0.5080, 0.2079, 0.1977], device='cuda:0')


In [None]:
# Write the per-row mood vectors to CSV, one column per mood label.
mood_table = pd.DataFrame(eval_moods, columns=[*moods.values()])
mood_table.to_csv(f'{root}/BertBiLSTM_Eval_Moods.csv')