**Connect with Google Drive**

In [1]:
!/opt/bin/nvidia-smi

Mon Mar  7 17:03:04 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
dataPath = '/content/gdrive/MyDrive/四上/AI_cup/'
bertpath = '/content/gdrive/MyDrive/四上/AI_cup/forbert/'

**Data processing**  

In [4]:
import pandas as pd
dataset = pd.read_csv(dataPath+'task1_trainset.csv', dtype=str)
dataset.drop('Title',axis=1,inplace=True)
dataset.drop('Categories',axis=1,inplace=True)
dataset.drop('Created Date',axis=1, inplace=True)
dataset.drop('Authors',axis=1,inplace=True)

In [5]:
dataset['Abstract'] = ['[CLS]' + sent + '[SEP]' for sent in dataset['Abstract']]
dataset

Unnamed: 0,Id,Abstract,Task 1
0,D00001,[CLS]Rapid popularity of Internet of Things (I...,BACKGROUND OBJECTIVES METHODS METHODS RESULTS ...
1,D00002,"[CLS]In this paper, we address the problem of ...",OBJECTIVES OTHERS METHODS/RESULTS RESULTS RESULTS
2,D00003,[CLS]High quality upsampling of sparse 3D poin...,BACKGROUND OBJECTIVES METHODS METHODS METHODS ...
3,D00004,[CLS]Internet is the main source of informatio...,BACKGROUND BACKGROUND BACKGROUND OBJECTIVES OB...
4,D00005,[CLS]Automated Facial Expression Recognition (...,BACKGROUND BACKGROUND BACKGROUND BACKGROUND ME...
...,...,...,...
6995,D06996,[CLS]We have witnessed the discovery of many t...,BACKGROUND BACKGROUND OBJECTIVES METHODS METHO...
6996,D06997,[CLS]State-of-the-art slot filling models for ...,BACKGROUND OBJECTIVES METHODS METHODS CONCLUSI...
6997,D06998,[CLS]Real-time semantic segmentation plays an ...,BACKGROUND OBJECTIVES OBJECTIVES METHODS CONCL...
6998,D06999,[CLS]We propose a neural embedding algorithm c...,OBJECTIVES METHODS METHODS RESULTS RESULTS CON...


In [6]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
trainset, validset = train_test_split(dataset, test_size=0.1, random_state=42)

trainset.to_csv(bertpath+'trainset.csv',index=False)
validset.to_csv(bertpath+'validset.csv',index=False)

In [7]:
# Do the same things for test data
dataset = pd.read_csv(dataPath+'task1_private_testset.csv', dtype=str)
dataset.drop('Title',axis=1,inplace=True)
dataset.drop('Categories',axis=1,inplace=True)
dataset.drop('Created Date',axis=1, inplace=True)
dataset.drop('Authors',axis=1,inplace=True)
dataset['Abstract'] = ['[CLS]' + sent + '[SEP]' for sent in dataset['Abstract']]
dataset

Unnamed: 0,Id,Abstract
0,T20001,[CLS]Many real-world analytics problems involv...
1,T20002,[CLS]The spectrum of a first-order logic sente...
2,T20003,"[CLS]In this paper, we propose a novel suffici..."
3,T20004,[CLS]As the features from the traditional Loca...
4,T20005,[CLS]Designing search algorithms for finding g...
...,...,...
19995,T39996,[CLS]We are investigating on-line model-based ...
19996,T39997,"[CLS]In recent years, several Scientometrics a..."
19997,T39998,[CLS]A method for segmenting water bodies in o...
19998,T39999,[CLS]Building intelligent agents that can comm...


In [8]:
dataset.to_csv(bertpath+'testset.csv',index=False)

In [9]:
# colab doesn't have all package in nltk, we need to download by ourselves.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
import pandas as pd
from nltk.tokenize import word_tokenize
import numpy as np
import spacy
nlp = spacy.load('en_core_web_sm')

### Bert Pretrained Tokeniztion

In [11]:
!pip install transformers==4.5.0



In [12]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased' , do_lower_case = True)
# tokenizer = RobertaTokenizer.from_pretrained('roberta-base' , do_lower_case = True)
tokenize = tokenizer.tokenize
# tokenizer_texts = [tokenizer.tokenize(sent) for sent in training['Sentence']]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [13]:
def collect_words(data_path):
  df = pd.read_csv(data_path, dtype=str)
  tokens = set()
  for i in df.iterrows():
    sents = i[1]['Abstract'].split('$$$')
    sents = ' '.join(sents)
    tokens |= set(tokenize(sents))
  return tokens

words = set()
words |= collect_words(bertpath+'trainset.csv')

print(words)
print(len(words))

14665


In [14]:
PAD_TOKEN = 0
UNK_TOKEN = 1
word_dict = {'<pad>':PAD_TOKEN,'<unk>':UNK_TOKEN}
for word in words:
  word_dict[word]=len(word_dict)

In [15]:
import pickle
with open(bertpath+'dicitonary.pkl','wb') as f:
  pickle.dump(word_dict, f)

In [16]:
"""
load word dictionary
"""
import pickle
with open(bertpath+'dicitonary.pkl','rb') as f:
  word_dict = pickle.load(f)

In [17]:
def get_dataset(data_path, word_dict, n_workers=4):

  dataset = pd.read_csv(data_path, dtype=str)
  formatData = []
  for (idx,data) in dataset.iterrows():
    processed = {}
    processed['Abstract'] = [sentence_to_indices(sent, word_dict) for sent in data['Abstract'].split('$$$')]
    if 'Task 1' in data:
      processed['Label'] = [label_to_onehot(label) for label in data['Task 1'].split(' ')]
    formatData.append(processed)
  
  return formatData
  
def label_to_onehot(labels):
  label_dict = {'BACKGROUND': 0, 'OBJECTIVES':1, 'METHODS':2, 'RESULTS':3, 'CONCLUSIONS':4, 'OTHERS':5}
  onehot = [0,0,0,0,0,0]
  for l in labels.split('/'):
    onehot[label_dict[l]] = 1
  return onehot
        
def sentence_to_indices(sentence, word_dict):
  tokenized_text = tokenize(sentence)
  return [tokenizer.convert_tokens_to_ids(tk) for tk in tokenized_text]

In [18]:
print('[INFO] Start processing trainset...')
train = get_dataset(bertpath+'trainset.csv', word_dict)
print('[INFO] Start processing validset...')
valid = get_dataset(bertpath+'validset.csv', word_dict)
print('[INFO] Start processing testset...')
test = get_dataset(bertpath+'testset.csv',word_dict)
# sss = get_dataset(dataPath+'task1_public_testset.csv',word_dict)

[INFO] Start processing trainset...
[INFO] Start processing validset...
[INFO] Start processing testset...


In [19]:
print(len(train))
print(train[0])

6300
{'Abstract': [[101, 1996, 2001, 8043, 8602, 12046, 2030, 3011, 2693, 2099, 1005, 1055, 3292, 1006, 7861, 2094, 1007, 2003, 1037, 6179, 6994, 1999, 6747, 1010, 3698, 4083, 1998, 3274, 2671, 2007, 2116, 5097, 2000, 6897, 2030, 2966, 12126, 1010, 2426, 2500, 1012], [2926, 1999, 1996, 2422, 1997, 6233, 3375, 2951, 1010, 1996, 22334, 1997, 2122, 12103, 3081, 15502, 3665, 2003, 2411, 1996, 14879, 5387, 1012], [4427, 2011, 2023, 4119, 1010, 1037, 3528, 1997, 2047, 8107, 2000, 15502, 3665, 2038, 2042, 3818, 1999, 3522, 2086, 1998, 2247, 2007, 2122, 2047, 4725, 3310, 1996, 2342, 2005, 1037, 15902, 7831, 1012], [1999, 2023, 3259, 1010, 2057, 8970, 1037, 6847, 10665, 2005, 16246, 15502, 3665, 1010, 2170, 11089, 10665, 1010, 2029, 2003, 2881, 2000, 3710, 2004, 1037, 8699, 3074, 1997, 3471, 1010, 2073, 16246, 15502, 3665, 4725, 2064, 2022, 7718, 1010, 4102, 2000, 2028, 2178, 1010, 1998, 2716, 2000, 2037, 6537, 2006, 2312, 1011, 4094, 12107, 1012], [2009, 3774, 1997, 1037, 3528, 1997, 3897, 157

In [20]:
from torch.utils.data import Dataset
import torch
class AbstractDataset(Dataset):
  def __init__(self, data, pad_idx, max_len = 500):
    self.data = data
    self.pad_idx = pad_idx
    self.max_len = max_len
  def __len__(self):
    return len(self.data)

  def __getitem__(self, index):
    return self.data[index]
      
  def collate_fn(self, datas):
    # get max length in this batch
    max_sent = max([len(data['Abstract']) for data in datas])
    max_len = max([min(len(sentence), self.max_len) for data in datas for sentence in data['Abstract']])
    batch_abstract = []
    batch_label = []
    sent_len = []
    for data in datas:
      # padding abstract to make them in same length
      pad_abstract = []
      for sentence in data['Abstract']:
        if len(sentence) > max_len:
          pad_abstract.append(sentence[:max_len])
        else:
          pad_abstract.append(sentence+[self.pad_idx]*(max_len-len(sentence)))
      sent_len.append(len(pad_abstract))
      pad_abstract.extend([[self.pad_idx]*max_len]*(max_sent-len(pad_abstract)))
      batch_abstract.append(pad_abstract)

      # gather labels
      if 'Label' in data:
          pad_label = data['Label']
          pad_label.extend([[0]*6]*(max_sent-len(pad_label)))
          batch_label.append(pad_label)

    return torch.LongTensor(batch_abstract), torch.FloatTensor(batch_label), sent_len

In [21]:
PAD_TOKEN = tokenizer.pad_token_id
PAD_TOKEN

0

In [24]:
trainData = AbstractDataset(train, PAD_TOKEN, max_len = 64)
validData = AbstractDataset(valid, PAD_TOKEN, max_len = 64)
testData = AbstractDataset(test, PAD_TOKEN, max_len = 64)

**Model**  

In [30]:
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertModel

class simpleNet(nn.Module):
  def __init__(self, vocabulary_size):
    super(simpleNet, self).__init__()
    self.embedding_size = 3072
    self.hidden_dim = 512
    self.bert = BertModel.from_pretrained('bert-base-uncased'
                ,output_hidden_states=True)
    self.bert.eval()
    self.sent_rnn = nn.GRU(self.embedding_size,
                            self.hidden_dim,
                            bidirectional=True,
                            num_layers = 2,
                            dropout = 0.5,
                            batch_first=True)
    self.l1 = nn.Linear(self.hidden_dim*2, self.hidden_dim)
    self.l2 = nn.Linear(self.hidden_dim, 6)

  def create_attention_masks(self,ids):
    attention_masks = []

    for id in ids:
      id_mask = [float(i>0) for i in id]
      attention_masks.append(id_mask)
    return torch.tensor(attention_masks)

  def embedding(self,x):
    batched_sents = []
    for sents in x:
      sent_embeddings = []
      attention_masks = self.create_attention_masks(sents)
      attention_masks = attention_masks.to(device)

      with torch.no_grad():
        # print(sents.shape,attention_masks.shape)
        last_hidden_state , _ , hidden_states = self.bert(sents,attention_masks)

        token_embeddings = torch.stack(hidden_states[:-1],dim = 0)

        token_embeddings = token_embeddings.permute(1,2,0,3)

        for tks in token_embeddings:
          token_vecs = []
          for tk in tks:
            cat_vec = torch.cat((tk[-1] , tk[-2] , tk[-3] , tk[-4]) , dim = 0)
            token_vecs.append(cat_vec)
          token_vecs = torch.stack(token_vecs , 0)
          sent_embeddings.append(token_vecs)
        sent_embeddings = torch.stack(sent_embeddings , 0)
        print(sent_embeddings.shape)
      batched_sents.append(sent_embeddings)
    batched_sents = torch.stack(batched_sents , 0)
    return batched_sents

  def forward(self, x):
    print(x.shape)
    # x: (batch,sent,word)
    x = self.embedding(x)
    # print(x.shape)
    # x: (batch,sent,word,feature)
    b,s,w,e = x.shape
    x = x.view(b,s*w,e)
    # x: (batch,sent*word,feature)
    x, __ = self.sent_rnn(x)
    # x: (batch,sent*word,hidden_state*2)
    x = x.view(b,s,w,-1)
    # print(x)
    # x: (batch,sent,word,hidden_state*2)
    x = torch.max(x,dim=2)[0]
    # x: (batch,sent,hidden_state*2)
    x = torch.relu(self.l1(x))
    x = torch.sigmoid(self.l2(x))
    # x: (batch,sent,6)
    return x

In [31]:
device='cuda'

In [32]:
class F1():
  def __init__(self):
    self.threshold = 0.5
    self.n_precision = 0
    self.n_recall = 0
    self.n_corrects = 0
    self.name = 'F1'

  def reset(self):
    self.n_precision = 0
    self.n_recall = 0
    self.n_corrects = 0

  def update(self, predicts, groundTruth):
    predicts = (predicts > self.threshold).float()
    self.n_precision += torch.sum(predicts).data.item()
    self.n_recall += torch.sum(groundTruth).data.item()
    self.n_corrects += torch.sum(groundTruth * predicts).data.item()

  def get_score(self):
    recall = self.n_corrects / self.n_recall
    precision = self.n_corrects / (self.n_precision + 1e-20) #prevent divided by zero
    return 2 * (recall * precision) / (recall + precision + 1e-20)

  def print_score(self):
    score = self.get_score()
    return '{:.5f}'.format(score)


In [33]:
import os
from tqdm import tqdm_notebook as tqdm
def _run_epoch(epoch, training):
  model.train(training)
  if training:
    description = 'Train'
    dataset = trainData
    shuffle = True
  else:
    description = 'Valid'
    dataset = validData
    
    shuffle = False
  dataloader = DataLoader(dataset=dataset,
                          batch_size=4,
                          shuffle=shuffle,
                          collate_fn=dataset.collate_fn,
                          num_workers=4)

  trange = tqdm(enumerate(dataloader), total=len(dataloader), desc=description)
  loss = 0
  f1_score = F1()
  for i, (x, y, sent_len) in trange:
    opt.zero_grad()

    abstract = x.to(device)
    labels = y.to(device)
    o_labels = model(abstract)
    batch_loss = criteria(o_labels, labels)
    break
  #   if training:
  #     batch_loss.backward()
  #     opt.step()

  #   loss += batch_loss.item()
  #   f1_score.update(o_labels.cpu(), y)

  #   trange.set_postfix(
  #     loss=loss / (i + 1), f1=f1_score.print_score())
  # if training:
  #     history['train'].append({'f1':f1_score.get_score(), 'loss':loss/ len(trange)})
  # else:
  #     history['valid'].append({'f1':f1_score.get_score(), 'loss':loss/ len(trange)})


def save(epoch):
  if not os.path.exists(dataPath+'model'):
    os.makedirs(dataPath+'model')
        
  torch.save(model.state_dict(), dataPath+'model/model.pkl.'+str(epoch))
  with open(dataPath+'model/history.json', 'w') as f:
    json.dump(history, f, indent=4) # json.dumps()用於將dict型別的資料轉成str，因為如果直接將dict型別的資料寫入json檔案中會發生報錯，因此在將資料寫入時需要用到該函式。

In [34]:
from torch.utils.data import DataLoader
import json
model = simpleNet(len(word_dict))
opt = torch.optim.AdamW(model.parameters())
criteria = torch.nn.BCELoss()
model.to(device)
max_epoch = 1
history = {'train':[],'valid':[]}

for epoch in range(max_epoch):
  print('Epoch: {}'.format(epoch))
  _run_epoch(epoch, True)
  # _run_epoch(epoch, False)
  # save(epoch)

Epoch: 0


  cpuset_checked))
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


Train:   0%|          | 0/1575 [00:00<?, ?it/s]

torch.Size([4, 8, 64])


TypeError: ignored

**Plot Learning Curve**  

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

with open(dataPath+'model/history.json', 'r') as f:
    history = json.loads(f.read())
    
train_loss = [l['loss'] for l in history['train']]
valid_loss = [l['loss'] for l in history['valid']]
train_f1 = [l['f1'] for l in history['train']]
valid_f1 = [l['f1'] for l in history['valid']]

plt.figure(figsize=(7,5))
plt.title('Loss')
plt.plot(train_loss, label='train')
plt.plot(valid_loss, label='valid')
plt.legend()
plt.show()

plt.figure(figsize=(7,5))
plt.title('F1 Score')
plt.plot(train_f1, label='train')
plt.plot(valid_f1, label='valid')
plt.legend()
plt.show()

print('Best F1 score ', max([[l['f1'], idx] for idx, l in enumerate(history['valid'])]))

**Predict**  

In [None]:
model.train(False)
dataloader = DataLoader(dataset=testData,
                            batch_size=64,
                            shuffle=False,
                            collate_fn=testData.collate_fn,
                            num_workers=4)
trange = tqdm(enumerate(dataloader), total=len(dataloader), desc='Predict')
prediction = []
for i, (x , y , sent_len) in trange:
  o_labels = model(x.to(device))
  o_labels = o_labels>0.5
  for idx, o_label in enumerate(o_labels):
    prediction.append(o_label[:sent_len[idx]].to('cpu'))
prediction = torch.cat(prediction).detach().numpy().astype(int)

In [None]:
"""
Public = True if you're predicting public test data.
"""
import numpy as np
def SubmitGenerator(prediction, sampleFile, public=True, filename='prediction.csv'):
    sample = pd.read_csv(sampleFile)
    submit = {}
    submit['order_id'] = list(sample.order_id.values)
    redundant = len(sample) - prediction.shape[0]
    if public:
      submit['BACKGROUND'] = list(prediction[:,0]) + [0]*redundant
      submit['OBJECTIVES'] = list(prediction[:,1]) + [0]*redundant
      submit['METHODS'] = list(prediction[:,2]) + [0]*redundant
      submit['RESULTS'] = list(prediction[:,3]) + [0]*redundant
      submit['CONCLUSIONS'] = list(prediction[:,4]) + [0]*redundant
      submit['OTHERS'] = list(prediction[:,5]) + [0]*redundant
    else:
      submit['BACKGROUND'] = [0]*redundant + list(prediction[:,0])
      submit['OBJECTIVES'] = [0]*redundant + list(prediction[:,1])
      submit['METHODS'] = [0]*redundant + list(prediction[:,2])
      submit['RESULTS'] = [0]*redundant + list(prediction[:,3])
      submit['CONCLUSIONS'] = [0]*redundant + list(prediction[:,4])
      submit['OTHERS'] = [0]*redundant + list(prediction[:,5])
    df = pd.DataFrame.from_dict(submit) 
    df.to_csv(filename,index=False)

In [None]:
SubmitGenerator(prediction,dataPath+'task1_sample_submission.csv',False, dataPath+'Bertdropout03ep10AdamW_.csv')