In [1]:
# from google.colab import drive
# drive.mount('/content/drive')
# %cd "/content/drive/MyDrive"

In [2]:
# %cd "/content/drive/MyDrive/DM-Kaggle"

In [2]:
import torch
import numpy as np
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
labels = {'joy':0,
      'anger':1,
      'sadness':2,
      'anticipation':3,
      'disgust':4,
      'trust':5,
      'fear':6,
      'surprise':7
      }

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.labels = [labels[label] for label in df['emotion']]
        self.texts = [tokenizer(text, padding='max_length', max_length = 128, truncation=True, return_tensors="pt")
                      for text in df['text']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y

In [4]:
import pandas as pd

In [5]:
train_data_path = './train_raw_data.csv'
chunk_size = 1000
chunks = []
for chunk in pd.read_csv(train_data_path, chunksize=chunk_size, iterator=True):
    chunks.append(chunk)
train_df = pd.concat(chunks, ignore_index=True)

In [6]:
train_df = train_df.dropna(axis=0, subset=['text', 'emotion'])

In [7]:
test_data_path = './test_raw_data.csv'
chunk_size = 1000
chunks = []
for chunk in pd.read_csv(test_data_path, chunksize=chunk_size, iterator=True):
    chunks.append(chunk)
test_df = pd.concat(chunks, ignore_index=True)

In [8]:
np.random.seed(112)
df_train, df_val = np.split(train_df.sample(frac=1, random_state=42), [int(.8*len(train_df))])
df_test = test_df

print(len(df_train),len(df_val), len(df_test))

  return bound(*args, **kwds)


1161422 290356 414817


In [9]:
from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):
    def __init__(self, dropout=0.5):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 8)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)
        return final_layer

In [11]:
from torch.optim import Adam
from tqdm import tqdm
import os

model_dir = './checkpoints'
def train(model, train_data, val_data, learning_rate, epochs):
  # 通过Dataset类获取训练和验证集
    train, val = Dataset(train_data), Dataset(val_data)
    # DataLoader根据batch_size获取数据，训练时选择打乱样本
    train_dataloader = torch.utils.data.DataLoader(train, batch_size=64, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=64)
  # 判断是否使用GPU
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    # 定义损失函数和优化器
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()
    # 开始进入训练循环
        min_loss = 999
    for epoch_num in range(epochs):
      # 定义两个变量，用于存储训练集的准确率和损失
        total_acc_train = 0
        total_loss_train = 0
      # 进度条函数tqdm
        for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)
        # 通过模型得到输出
                output = model(input_id, mask)
                # 计算损失
                batch_loss = criterion(output, train_label)
                total_loss_train += batch_loss.item()
                # 计算精度
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc
        # 模型更新
                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
        if batch_loss < min_loss:
                min_loss = batch_loss
                torch.save(model.state_dict(), os.path.join(model_dir , 'GPT_best.pt'))
                torch.save(model, os.path.join(model_dir , 'GPT_best_model.pt'))
            # ------ 验证模型 -----------
            # 定义两个变量，用于存储验证集的准确率和损失
        total_acc_val = 0
        total_loss_val = 0
      # 不需要计算梯度
        with torch.no_grad():
                # 循环获取数据集，并用训练好的模型进行验证
                for val_input, val_label in val_dataloader:
          # 如果有GPU，则使用GPU，接下来的操作同训练
                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label)
                    total_loss_val += batch_loss.item()

                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc

        print(
                f'''Epochs: {epoch_num + 1}
              | Train Loss: {total_loss_train / len(train_data): .3f}
              | Train Accuracy: {total_acc_train / len(train_data): .3f}
              | Val Loss: {total_loss_val / len(val_data): .3f}
              | Val Accuracy: {total_acc_val / len(val_data): .3f}''')

In [12]:
EPOCHS = 5
model = BertClassifier()
LR = 1e-6
train(model, df_train, df_val, LR, EPOCHS)

100%|██████████| 18148/18148 [1:23:40<00:00,  3.61it/s]


Epochs: 1
              | Train Loss:  0.021
              | Train Accuracy:  0.517
              | Val Loss:  0.019
              | Val Accuracy:  0.578


100%|██████████| 18148/18148 [1:23:34<00:00,  3.62it/s]


Epochs: 2
              | Train Loss:  0.018
              | Train Accuracy:  0.597
              | Val Loss:  0.017
              | Val Accuracy:  0.605


100%|██████████| 18148/18148 [1:24:13<00:00,  3.59it/s]


Epochs: 3
              | Train Loss:  0.017
              | Train Accuracy:  0.621
              | Val Loss:  0.017
              | Val Accuracy:  0.617


100%|██████████| 18148/18148 [1:24:52<00:00,  3.56it/s]


Epochs: 4
              | Train Loss:  0.016
              | Train Accuracy:  0.637
              | Val Loss:  0.016
              | Val Accuracy:  0.625


100%|██████████| 18148/18148 [1:25:52<00:00,  3.52it/s]


Epochs: 5
              | Train Loss:  0.015
              | Train Accuracy:  0.651
              | Val Loss:  0.016
              | Val Accuracy:  0.630


In [13]:
torch.save(model.state_dict(), os.path.join(model_dir , 'BERT_final.pt'))
torch.save(model, os.path.join(model_dir , 'BERT_final_model.pt'))

In [14]:
model_save = model

In [10]:
model = torch.load("./checkpoints/BERT_final_model.pt")

In [21]:
class TestDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        # self.labels = [labels[label] for label in df['emotion']]
        # self.ids = df['tweet_id']
        self.texts = [tokenizer(text, padding='max_length', max_length = 128, truncation=True, return_tensors="pt")
                      for text in df['text']]

    # def classes(self):
    #     return self.texts

    def __len__(self):
        return len(self.texts)

    # def get_batch_ids(self, idx):
    #     # Fetch a batch of labels
    #     return np.array(self.ids[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        # batch_ids = self.get_batch_ids(idx)
        return batch_texts

In [11]:
reverse_labels = {value: key for key, value in labels.items()}
print(reverse_labels)

{0: 'joy', 1: 'anger', 2: 'sadness', 3: 'anticipation', 4: 'disgust', 5: 'trust', 6: 'fear', 7: 'surprise'}


In [24]:
# def evaluate(model, test_data):
#     test = TestDataset(test_data)
#     test_dataloader = torch.utils.data.DataLoader(test, batch_size=1)
#     use_cuda = torch.cuda.is_available()
#     device = torch.device("cuda" if use_cuda else "cpu")
#     if use_cuda:
#         model = model.cuda()

#     total_acc_test = 0
#     with torch.no_grad():
#         for test_input in tqdm(test_dataloader):
#             mask = test_input['attention_mask'].to(device)
#             input_id = test_input['input_ids'].squeeze(1).to(device)
#             output = model(input_id, mask)
#             emotion_encoding = output.argmax(dim=1)
#             emotion = reverse_labels.get(emotion_encoding)
#             # print(test_id)
#             print(test_input)
#             print(emotion)
#             break
            
    
# evaluate(model, test_df)

In [12]:
from tqdm import trange

In [41]:
def evaluate(model, test_data):
    # test = TestDataset(test_data)
    # test_dataloader = torch.utils.data.DataLoader(test, batch_size=1)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()

    prediction_list = []
    with torch.no_grad():
        for index in trange(len(test_data)):
            try:
                test_input = tokenizer(test_data['text'][index], padding='max_length', max_length = 128, truncation=True, return_tensors="pt")
                mask = test_input['attention_mask'].to(device)
                input_id = test_input['input_ids'].squeeze(1).to(device)
                output = model(input_id, mask)
                emotion_encoding = output.argmax(dim=1)
                emotion = reverse_labels.get(emotion_encoding.cpu().item())
            except: 
                emotion = 'joy'
            dict = {'id': test_data['tweet_id'][index], 'emotion': emotion}
            prediction_list.append(dict)
            # print(test_id)
    prediction = pd.DataFrame(prediction_list)
    return prediction

In [42]:
test_df['text'][0]

'Confident of your obedience, I write to you, knowing that you will do even more than I ask. (Philemon 1:21) 3/4 #bibleverse <LH> <LH>'

In [43]:
prediction = evaluate(model, test_df)

100%|██████████| 414817/414817 [19:03<00:00, 362.88it/s] 


In [32]:
print(len(test_df))

414817


In [22]:
print(prediction.head())

         id       emotion
0  0x28b412  anticipation
1  0x2de201         trust
2  0x218443           joy
3  0x2939d5  anticipation
4  0x26289a         trust


In [34]:
prediction_ver1 = prediction

In [44]:
print(len(prediction_ver1))

411972


In [28]:
prediction_ver1 = prediction_ver1.drop_duplicates(subset=['id'])

In [45]:
prediction_ver1 = prediction_ver1.dropna(subset=['id'])
prediction_ver1 = prediction_ver1.dropna(subset=['emotion'])

In [40]:
print(len(prediction))

414817


In [46]:
prediction_ver1.to_csv('submission.csv', index=False)

In [24]:
! pip install -q kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [47]:
! kaggle competitions submit -c dm2023-isa5810-lab2-homework -f submission.csv -m "BERT"

100%|██████████████████████████████████████| 6.19M/6.19M [00:03<00:00, 2.01MB/s]
Successfully submitted to DM2023 ISA5810 Lab2 Homework