# Libraries

In [11]:
# 引入基本資料處理用函式庫
import numpy as np
import pandas as pd
import os
import warnings
import random

# PyTorch core, its neural-network module and its optimizers. The loss function measures the prediction error; the optimizer adjusts the parameters to make that loss as small as possible.
import torch 
from torch import nn
import torch.optim as optim

# 資料集分割器, 供多重驗證模型使用
from sklearn.model_selection import StratifiedKFold

# 引入單字,單詞分割器
import tokenizers
# 引入主要模型, RoBERTa (Robustly optimized BERT approach)
from transformers import RobertaModel, RobertaConfig

warnings.filterwarnings('ignore')

# Seed

In [12]:
def seed_everything(seed_value):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and, when available,
    all CUDA devices) with the same value and switches cuDNN into its
    deterministic mode so that training runs can be reproduced.

    Args:
        seed_value: Integer seed shared by all generators.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    # Only affects interpreters started after this point (e.g. worker
    # processes); the hash seed of the running interpreter is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    # Seed the GPU generators when a CUDA build of PyTorch is usable.
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)

    # The cuDNN switches are plain global flags, so they are set regardless of
    # whether a GPU is present. benchmark must be False: with benchmark=True
    # cuDNN auto-tunes and may pick different kernels from run to run, which
    # defeats the purpose of seeding.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


# Use 42 as the global seed.
seed = 42
seed_everything(seed)

# Data Loader

關於 `tokenizers.ByteLevelBPETokenizer` 可以參考 [網址](https://github.com/huggingface/tokenizers/blob/master/bindings/python/py_src/tokenizers/implementations/byte_level_bpe.py) <br>
關於 `encoding.offsets` 可以參考 [網址](https://huggingface.co/docs/tokenizers/python/v0.10.0/api/reference.html?highlight=offsets#tokenizers.Encoding.offsets) <br>
關於 `encoding.ids` 可以參考 [網址](https://huggingface.co/docs/tokenizers/python/v0.10.0/api/reference.html?highlight=offsets#tokenizers.Encoding.ids) <br>
關於 `torch.utils.data.DataLoader` 可以參考 [網址](https://pytorch.org/docs/stable/data.html)

In [13]:
from transformers import RobertaTokenizerFast

class TweetDataset(torch.utils.data.Dataset):
    """Dataset that turns tweet rows into fixed-length RoBERTa inputs.

    Every item is a dict with:
      * ``ids``     - token ids laid out as
                      ``<s> sentiment </s></s> tweet </s>`` and padded to
                      ``max_len``;
      * ``masks``   - 1 for real tokens, 0 for padding;
      * ``tweet``   - the normalised tweet string the offsets refer to;
      * ``offsets`` - one ``(start, end)`` character span per token, with
                      ``(0, 0)`` for special, sentiment and padding tokens;
      * ``start_idx`` / ``end_idx`` - only when the dataframe has a
                      ``selected_text`` column: first and last token index of
                      the labelled span.

    Args:
        df: DataFrame with ``text`` and ``sentiment`` columns and optionally
            ``selected_text``.
        max_len: Length every encoded sequence is truncated/padded to.
    """

    # Special-token ids of the roberta-base vocabulary.
    BOS_ID = 0  # <s>
    EOS_ID = 2  # </s>
    PAD_ID = 1  # <pad>

    def __init__(self, df, max_len=96):
        self.df = df
        self.max_len = max_len
        # Targets can only be produced when the label column is present.
        self.labeled = 'selected_text' in df

        # A "fast" tokenizer is required because only those can return the
        # character offset of every token (return_offsets_mapping).
        self.tokenizer = RobertaTokenizerFast.from_pretrained(
            "roberta-base",
            add_prefix_space=True,
        )

    def __getitem__(self, index):
        """Return the encoded sample stored at positional row ``index``."""
        row = self.df.iloc[index]
        ids, masks, tweet, offsets = self.get_input_data(row)
        data = {
            'ids': ids,
            'masks': masks,      # lets the model ignore the padded positions
            'tweet': tweet,
            'offsets': offsets,  # (start, end) character span of each token
        }

        if self.labeled:
            start_idx, end_idx = self.get_target_idx(row, tweet, offsets)
            data['start_idx'] = start_idx
            data['end_idx'] = end_idx

        return data

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return len(self.df)

    def get_input_data(self, row):
        """Encode one row into model inputs.

        Args:
            row: Object exposing ``text`` and ``sentiment`` string attributes.

        Returns:
            Tuple ``(ids, masks, tweet, offsets)``: ``ids`` and ``masks`` are
            integer tensors of length ``max_len``, ``tweet`` is the
            lower-cased, whitespace-normalised text with one leading space,
            and ``offsets`` is a ``(max_len, 2)`` integer tensor.
        """
        # Lower-case, collapse whitespace runs and add one leading space.
        tweet = " " + " ".join(row.text.lower().split())

        # Tokenize the tweet WITHOUT special tokens so the returned offsets
        # index directly into `tweet`; truncation is done by hand below so
        # that the manually added special tokens are taken into account.
        encoding = self.tokenizer(
            tweet,
            add_special_tokens=False,
            return_offsets_mapping=True,
            truncation=False,
        )
        tweet_ids = list(encoding["input_ids"])
        tweet_offsets = [tuple(pair) for pair in encoding["offset_mapping"]]

        # Tokenize the sentiment label (e.g. positive / negative / neutral).
        sentiment_ids = list(
            self.tokenizer(row.sentiment, add_special_tokens=False)["input_ids"]
        )

        # Manual assembly: <s> sentiment </s></s> tweet </s>
        ids = (
            [self.BOS_ID]
            + sentiment_ids
            + [self.EOS_ID, self.EOS_ID]
            + tweet_ids
            + [self.EOS_ID]
        )

        # Everything that is not a tweet token gets the placeholder (0, 0).
        prefix_len = 1 + len(sentiment_ids) + 2  # <s> + sentiment + </s></s>
        offsets = [(0, 0)] * prefix_len + tweet_offsets + [(0, 0)]

        # Too long: cut tweet tokens but keep the closing </s>, so the model
        # never sees a sequence without its end marker.
        if len(ids) > self.max_len:
            ids = ids[:self.max_len - 1] + [self.EOS_ID]
            offsets = offsets[:self.max_len - 1] + [(0, 0)]

        # Too short: pad on the right. The mask is derived from the real
        # length rather than from `ids != PAD_ID`, so it cannot be confused by
        # a genuine token that happens to carry the pad id.
        real_len = len(ids)
        pad_len = self.max_len - real_len
        ids = ids + [self.PAD_ID] * pad_len
        offsets = offsets + [(0, 0)] * pad_len
        mask_values = [1] * real_len + [0] * pad_len

        ids = torch.tensor(ids)
        masks = torch.tensor(mask_values, dtype=torch.long)
        offsets = torch.tensor(offsets)

        return ids, masks, tweet, offsets

    def get_target_idx(self, row, tweet, offsets):
        """Locate the labelled span as token indices.

        The training target is the part of the tweet stored in the
        ``selected_text`` column. It is searched for in ``tweet`` at character
        level and every token whose character span overlaps the match is part
        of the target.

        Args:
            row: Object exposing a ``selected_text`` string attribute.
            tweet: Normalised tweet as returned by ``get_input_data``.
            offsets: Per-token ``(start, end)`` character spans (tensor or
                sequence of pairs).

        Returns:
            Tuple ``(start_idx, end_idx)`` of plain ints: first and last token
            index of the span. When the selected text cannot be mapped to any
            token (not found in the tweet, empty, or cut off by truncation)
            the span of all visible tweet tokens is returned instead of
            raising; ``(0, 0)`` if there are no tweet tokens at all.
        """
        # Same normalisation as for the tweet itself (without the lead space).
        selected = " ".join(row.selected_text.lower().split())

        # Character span [span_start, span_end) of the first occurrence.
        span_start = tweet.find(selected) if selected else -1
        span_end = span_start + len(selected)

        pairs = offsets.tolist() if hasattr(offsets, "tolist") else list(offsets)

        target_idx = []
        if span_start >= 0:
            # A token belongs to the target when its character span and the
            # matched span share at least one character. Placeholder (0, 0)
            # spans are empty and therefore never match.
            target_idx = [
                j for j, (tok_start, tok_end) in enumerate(pairs)
                if max(tok_start, span_start) < min(tok_end, span_end)
            ]

        if not target_idx:
            # Fallback: label the whole visible tweet.
            target_idx = [
                j for j, (tok_start, tok_end) in enumerate(pairs)
                if tok_end > tok_start
            ]
        if not target_idx:
            return 0, 0

        # First token of the span is the start target, last one the end target.
        return target_idx[0], target_idx[-1]

def get_train_val_loaders(df, train_idx, val_idx, batch_size=8):
    """Build the training and validation DataLoaders for one fold.

    Args:
        df: Full training dataframe.
        train_idx: Positional row indices of the training split.
        val_idx: Positional row indices of the validation split.
        batch_size: Batch size used by both loaders (default 8).

    Returns:
        Dict with the keys ``"train"`` and ``"val"`` mapping to the loaders.
    """
    # Training split: shuffled each epoch, loaded by two worker processes;
    # an incomplete final batch is dropped.
    train_loader = torch.utils.data.DataLoader(
        TweetDataset(df.iloc[train_idx]),
        batch_size=batch_size,
        shuffle=True,
        num_workers=2,
        drop_last=True,
    )

    # Validation split: order is kept so predictions line up with the rows.
    val_loader = torch.utils.data.DataLoader(
        TweetDataset(df.iloc[val_idx]),
        batch_size=batch_size,
        shuffle=False,
        num_workers=2,
    )

    return {"train": train_loader, "val": val_loader}

def get_test_loader(df, batch_size=8):
    """Build the DataLoader used for inference on the test set.

    Args:
        df: Test dataframe.
        batch_size: Number of rows per batch (default 8).

    Returns:
        A DataLoader that walks ``df`` in its original order (no shuffling, so
        predictions stay aligned with the rows) using two worker processes.
    """
    test_dataset = TweetDataset(df)
    return torch.utils.data.DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=2,
    )

# Model
Transformers 的 RoBERTa 相關可以參考  [網址](https://huggingface.co/transformers/model_doc/roberta.html) <br>
Config 可以參考 [roberta-base/config.json](https://huggingface.co/roberta-base/resolve/main/config.json)

In [14]:
class TweetModel(nn.Module):
    """RoBERTa encoder with a linear head predicting span start/end logits."""

    def __init__(self):
        super().__init__()

        # Load the roberta-base config and ask the encoder to return the
        # hidden states of every layer, not just the last one.
        roberta_config = RobertaConfig.from_pretrained("roberta-base")
        roberta_config.output_hidden_states = True

        # Pre-trained encoder.
        self.roberta = RobertaModel.from_pretrained(
            "roberta-base", config=roberta_config
        )

        self.dropout = nn.Dropout(0.5)
        # Two outputs per token: one start logit and one end logit.
        self.fc = nn.Linear(roberta_config.hidden_size, 2)
        nn.init.normal_(self.fc.weight, std=0.02)
        # The positional 0.0 of normal_ is the mean; std keeps its default.
        nn.init.normal_(self.fc.bias, mean=0.0)

    def forward(self, input_ids, attention_mask):
        """Return ``(start_logits, end_logits)``, each ``(batch, seq_len)``.

        Args:
            input_ids: Token ids, passed to the encoder as ``input_ids``.
            attention_mask: Padding mask, passed as ``attention_mask``.
        """
        encoded = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )

        # hidden_states is a tuple of per-layer tensors; stack the last four
        # on a new leading axis -> (4, batch, seq_len, hidden) ...
        last_four = torch.stack(encoded.hidden_states[-4:], dim=0)
        # ... and average them -> (batch, seq_len, hidden).
        averaged = last_four.mean(dim=0)

        logits = self.fc(self.dropout(averaged))  # (batch, seq_len, 2)

        # Channel 0 holds the start logits, channel 1 the end logits.
        start_logits = logits[..., 0]
        end_logits = logits[..., 1]

        return start_logits, end_logits

# Loss Function

In [15]:
def loss_fn(start_logits, end_logits, start_positions, end_positions):
    """Training loss: cross-entropy of the start plus that of the end.

    Cross-entropy measures how far the predicted distribution over token
    positions is from the true position. Both the start and the end of the
    span have to be right, so the two losses are simply added.

    Args:
        start_logits: Per-position scores for the span start.
        end_logits: Per-position scores for the span end.
        start_positions: True start token indices.
        end_positions: True end token indices.

    Returns:
        Scalar tensor ``CE(start) + CE(end)``.
    """
    cross_entropy = nn.CrossEntropyLoss()
    loss_for_start = cross_entropy(start_logits, start_positions)
    loss_for_end = cross_entropy(end_logits, end_positions)
    return loss_for_start + loss_for_end

# Evaluation Function
Jaccard index 可參考 [網址](https://zh.wikipedia.org/wiki/雅卡尔指数)

In [16]:
# 藉 start_idx, end_idx, offsets 取出 test 中的 selected_text
def get_selected_text(text, start_idx, end_idx, offsets):
    selected_text = ""
    for ix in range(start_idx, end_idx + 1):
        # 先取出指定範圍
        selected_text += text[offsets[ix][0]: offsets[ix][1]]
        # 確認是否需要加上空白做辨識
        if (ix + 1) < len(offsets) and offsets[ix][1] < offsets[ix + 1][0]:
            selected_text += " "
    return selected_text

# Evaluation metric: word-level Jaccard index (intersection over union).
def jaccard(str1, str2):
    """Return the Jaccard similarity of the word sets of two strings.

    Both strings are lower-cased and split on whitespace; the score is
    ``|A & B| / |A | B|``.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        Float in ``[0, 1]``. Two strings without any words are treated as
        identical and score ``1.0`` (instead of dividing by zero).
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        # Both sets are empty: the ratio is undefined, by convention 1.
        return 1.0
    return len(a & b) / union_size

# Jaccard score of one sample's predicted span against its labelled span.
def compute_jaccard_score(text, start_idx, end_idx, start_logits, end_logits, offsets):
    """Score a single prediction.

    Args:
        text: Normalised tweet the offsets index into.
        start_idx: True start token index.
        end_idx: True end token index.
        start_logits: Per-token scores for the span start.
        end_logits: Per-token scores for the span end.
        offsets: Per-token ``(start, end)`` character spans.

    Returns:
        Jaccard similarity between the labelled text and the predicted text.
    """
    # Reference text for the labelled span.
    true = get_selected_text(text, start_idx, end_idx, offsets)

    # Most likely start and end positions.
    start_pred = np.argmax(start_logits)
    end_pred = np.argmax(end_logits)

    # An inverted span cannot be decoded; fall back to the whole tweet.
    pred = (
        text
        if start_pred > end_pred
        else get_selected_text(text, start_pred, end_pred, offsets)
    )

    return jaccard(true, pred)

# Training Function

In [17]:
from transformers import RobertaConfig, RobertaModel
from tqdm.notebook import tqdm

In [18]:
def train_model(model, dataloaders_dict, criterion, optimizer, num_epochs, filename):
    """Train ``model``, validate after every epoch and save the weights.

    Args:
        model: Network called as ``model(ids, masks)`` and returning
            ``(start_logits, end_logits)``.
        dataloaders_dict: Dict with ``'train'`` and ``'val'`` loaders whose
            batches provide ``ids``, ``masks``, ``tweet``, ``offsets``,
            ``start_idx`` and ``end_idx``.
        criterion: Loss called as
            ``criterion(start_logits, end_logits, start_idx, end_idx)``.
        optimizer: Optimizer updating the model parameters.
        num_epochs: Number of passes over the training loader.
        filename: Path the final ``state_dict`` is saved to.
    """
    # Use the GPU when there is one, otherwise stay on the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    for epoch in tqdm(range(num_epochs)):
        # Each epoch consists of a training pass followed by a validation pass.
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            epoch_loss = 0.0
            epoch_jaccard = 0.0
            # Samples actually processed. This is not len(dataset) when the
            # loader drops an incomplete last batch, so it is counted here.
            samples_seen = 0

            for data in tqdm(dataloaders_dict[phase]):
                # Tensors consumed by the model/loss go to the device.
                ids = data['ids'].to(device)
                masks = data['masks'].to(device)
                tweet = data['tweet']
                offsets = data['offsets'].numpy()
                start_idx = data['start_idx'].to(device)
                end_idx = data['end_idx'].to(device)
                batch_len = len(ids)

                # Clear gradients left over from the previous step.
                optimizer.zero_grad()

                # Gradients are only tracked while training.
                with torch.set_grad_enabled(phase == 'train'):
                    start_logits, end_logits = model(ids, masks)
                    loss = criterion(start_logits, end_logits, start_idx, end_idx)

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # loss is a batch mean; weight it by the batch size so the
                # epoch value is a per-sample average.
                epoch_loss += loss.item() * batch_len
                samples_seen += batch_len

                # Bring everything back to the CPU as NumPy arrays
                # (detach() cuts the tensors from the autograd graph).
                true_starts = start_idx.cpu().detach().numpy()
                true_ends = end_idx.cpu().detach().numpy()
                start_probs = torch.softmax(start_logits, dim=1).cpu().detach().numpy()
                end_probs = torch.softmax(end_logits, dim=1).cpu().detach().numpy()

                # Accumulate the Jaccard score of every sample in the batch.
                for i in range(batch_len):
                    epoch_jaccard += compute_jaccard_score(
                        tweet[i],
                        true_starts[i],
                        true_ends[i],
                        start_probs[i],
                        end_probs[i],
                        offsets[i])

            # Average over the samples that were really seen (guard against
            # an empty loader).
            denominator = max(samples_seen, 1)
            epoch_loss = epoch_loss / denominator
            epoch_jaccard = epoch_jaccard / denominator

            print('Epoch {}/{} | {:^5} | Loss: {:.4f} | Jaccard: {:.4f}'.format(
                epoch + 1, num_epochs, phase, epoch_loss, epoch_jaccard))

    # Persist the trained weights.
    torch.save(model.state_dict(), filename)

# Training

In [21]:
# Number of passes (epochs) over the training data for each fold.
num_epochs = 3
# Mini-batch size: 128 samples per batch.
batch_size = 128
# Stratified K-fold splitter: 10 folds, shuffled, seeded with the global seed.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

In [22]:
%%time

# Load the training csv.
train_df = pd.read_csv('data/train_data.csv')
# Force the text column to str so string methods work on every row.
train_df['text'] = train_df['text'].astype(str)
# Same for the selected_text (label) column.
train_df['selected_text'] = train_df['selected_text'].astype(str)

# Train and validate once per fold; folds are stratified by sentiment and numbered from 1.
for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df.sentiment), start=1): 
    print(f'Fold: {fold}')
    # Every fold starts from a freshly created model.
    model = TweetModel()
    # AdamW optimizer with learning rate 3e-5 and betas (0.9, 0.999).
    optimizer = optim.AdamW(model.parameters(), lr=3e-5, betas=(0.9, 0.999))
    # Loss function handed to the training loop.
    criterion = loss_fn
    # Build the loaders for this fold's train/validation split.
    dataloaders_dict = get_train_val_loaders(train_df, train_idx, val_idx, batch_size)
    
    # Train the model; the weights are saved as roberta_fold{fold}.pth.
    train_model(
        model, 
        dataloaders_dict,
        criterion, 
        optimizer, 
        num_epochs,
        f'roberta_fold{fold}.pth')

Fold: 1


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/195 [00:00<?, ?it/s]

Epoch 1/3 | train | Loss: 2.6985 | Jaccard: 0.6050


  0%|          | 0/22 [00:00<?, ?it/s]

Epoch 1/3 |  val  | Loss: 1.6831 | Jaccard: 0.7094


  0%|          | 0/195 [00:00<?, ?it/s]

Epoch 2/3 | train | Loss: 1.7435 | Jaccard: 0.7010


  0%|          | 0/22 [00:00<?, ?it/s]

Epoch 2/3 |  val  | Loss: 1.6295 | Jaccard: 0.7141


  0%|          | 0/195 [00:00<?, ?it/s]

Epoch 3/3 | train | Loss: 1.5990 | Jaccard: 0.7158


  0%|          | 0/22 [00:00<?, ?it/s]

Epoch 3/3 |  val  | Loss: 1.5922 | Jaccard: 0.7273
Fold: 2


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/195 [00:00<?, ?it/s]

Epoch 1/3 | train | Loss: 2.6710 | Jaccard: 0.6143


  0%|          | 0/22 [00:00<?, ?it/s]

Epoch 1/3 |  val  | Loss: 1.7219 | Jaccard: 0.7066


  0%|          | 0/195 [00:00<?, ?it/s]

Epoch 2/3 | train | Loss: 1.6788 | Jaccard: 0.7093


  0%|          | 0/22 [00:00<?, ?it/s]

Epoch 2/3 |  val  | Loss: 1.6100 | Jaccard: 0.7101


  0%|          | 0/195 [00:00<?, ?it/s]

Epoch 3/3 | train | Loss: 1.5354 | Jaccard: 0.7253


  0%|          | 0/22 [00:00<?, ?it/s]

Epoch 3/3 |  val  | Loss: 1.5868 | Jaccard: 0.7097
Fold: 3


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/195 [00:00<?, ?it/s]

Epoch 1/3 | train | Loss: 2.5829 | Jaccard: 0.6255


  0%|          | 0/22 [00:00<?, ?it/s]

Epoch 1/3 |  val  | Loss: 1.6889 | Jaccard: 0.7179


  0%|          | 0/195 [00:00<?, ?it/s]

Epoch 2/3 | train | Loss: 1.6743 | Jaccard: 0.7088


  0%|          | 0/22 [00:00<?, ?it/s]

Epoch 2/3 |  val  | Loss: 1.6408 | Jaccard: 0.7140


  0%|          | 0/195 [00:00<?, ?it/s]

Epoch 3/3 | train | Loss: 1.5477 | Jaccard: 0.7248


  0%|          | 0/22 [00:00<?, ?it/s]

Epoch 3/3 |  val  | Loss: 1.6738 | Jaccard: 0.7147
Fold: 4


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/195 [00:00<?, ?it/s]

Epoch 1/3 | train | Loss: 2.5432 | Jaccard: 0.6285


  0%|          | 0/22 [00:00<?, ?it/s]

Epoch 1/3 |  val  | Loss: 1.7012 | Jaccard: 0.6994


  0%|          | 0/195 [00:00<?, ?it/s]

Epoch 2/3 | train | Loss: 1.6462 | Jaccard: 0.7109


  0%|          | 0/22 [00:00<?, ?it/s]

Epoch 2/3 |  val  | Loss: 1.6400 | Jaccard: 0.7125


  0%|          | 0/195 [00:00<?, ?it/s]

Epoch 3/3 | train | Loss: 1.5186 | Jaccard: 0.7277


  0%|          | 0/22 [00:00<?, ?it/s]

Epoch 3/3 |  val  | Loss: 1.6197 | Jaccard: 0.7107
Fold: 5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/195 [00:00<?, ?it/s]

Epoch 1/3 | train | Loss: 2.6955 | Jaccard: 0.6197


  0%|          | 0/22 [00:00<?, ?it/s]

Epoch 1/3 |  val  | Loss: 1.8246 | Jaccard: 0.6942


  0%|          | 0/195 [00:00<?, ?it/s]

Epoch 2/3 | train | Loss: 1.7404 | Jaccard: 0.7055


  0%|          | 0/22 [00:00<?, ?it/s]

Epoch 2/3 |  val  | Loss: 1.6571 | Jaccard: 0.7159


  0%|          | 0/195 [00:00<?, ?it/s]

Epoch 3/3 | train | Loss: 1.5928 | Jaccard: 0.7211


  0%|          | 0/22 [00:00<?, ?it/s]

Epoch 3/3 |  val  | Loss: 1.6068 | Jaccard: 0.7160
Fold: 6


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/195 [00:00<?, ?it/s]

Epoch 1/3 | train | Loss: 2.6066 | Jaccard: 0.6188


  0%|          | 0/22 [00:00<?, ?it/s]

Epoch 1/3 |  val  | Loss: 1.6648 | Jaccard: 0.7112


  0%|          | 0/195 [00:00<?, ?it/s]

Epoch 2/3 | train | Loss: 1.6514 | Jaccard: 0.7118


  0%|          | 0/22 [00:00<?, ?it/s]

Epoch 2/3 |  val  | Loss: 1.5604 | Jaccard: 0.7352


  0%|          | 0/195 [00:00<?, ?it/s]

Epoch 3/3 | train | Loss: 1.5260 | Jaccard: 0.7260


  0%|          | 0/22 [00:00<?, ?it/s]

Epoch 3/3 |  val  | Loss: 1.5634 | Jaccard: 0.7326
Fold: 7


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/195 [00:00<?, ?it/s]

Epoch 1/3 | train | Loss: 2.6380 | Jaccard: 0.6155


  0%|          | 0/22 [00:00<?, ?it/s]

Epoch 1/3 |  val  | Loss: 1.7236 | Jaccard: 0.7038


  0%|          | 0/195 [00:00<?, ?it/s]

Epoch 2/3 | train | Loss: 1.6642 | Jaccard: 0.7105


  0%|          | 0/22 [00:00<?, ?it/s]

Epoch 2/3 |  val  | Loss: 1.6411 | Jaccard: 0.7101


  0%|          | 0/195 [00:00<?, ?it/s]

Epoch 3/3 | train | Loss: 1.5411 | Jaccard: 0.7262


  0%|          | 0/22 [00:00<?, ?it/s]

Epoch 3/3 |  val  | Loss: 1.6238 | Jaccard: 0.7194
Fold: 8


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/195 [00:00<?, ?it/s]

Epoch 1/3 | train | Loss: 2.5538 | Jaccard: 0.6305


  0%|          | 0/22 [00:00<?, ?it/s]

Epoch 1/3 |  val  | Loss: 1.6807 | Jaccard: 0.7043


  0%|          | 0/195 [00:00<?, ?it/s]

Epoch 2/3 | train | Loss: 1.6471 | Jaccard: 0.7138


  0%|          | 0/22 [00:00<?, ?it/s]

Epoch 2/3 |  val  | Loss: 1.6138 | Jaccard: 0.7156


  0%|          | 0/195 [00:00<?, ?it/s]

Epoch 3/3 | train | Loss: 1.5237 | Jaccard: 0.7258


  0%|          | 0/22 [00:00<?, ?it/s]

Epoch 3/3 |  val  | Loss: 1.6178 | Jaccard: 0.7173
Fold: 9


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/195 [00:00<?, ?it/s]

Epoch 1/3 | train | Loss: 2.7689 | Jaccard: 0.6109


  0%|          | 0/22 [00:00<?, ?it/s]

Epoch 1/3 |  val  | Loss: 1.7362 | Jaccard: 0.7040


  0%|          | 0/195 [00:00<?, ?it/s]

Epoch 2/3 | train | Loss: 1.7400 | Jaccard: 0.7043


  0%|          | 0/22 [00:00<?, ?it/s]

Epoch 2/3 |  val  | Loss: 1.6275 | Jaccard: 0.7175


  0%|          | 0/195 [00:00<?, ?it/s]

Epoch 3/3 | train | Loss: 1.6047 | Jaccard: 0.7193


  0%|          | 0/22 [00:00<?, ?it/s]

Epoch 3/3 |  val  | Loss: 1.6482 | Jaccard: 0.7283
Fold: 10


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/195 [00:00<?, ?it/s]

Epoch 1/3 | train | Loss: 2.6635 | Jaccard: 0.6145


  0%|          | 0/22 [00:00<?, ?it/s]

Epoch 1/3 |  val  | Loss: 1.6290 | Jaccard: 0.7146


  0%|          | 0/195 [00:00<?, ?it/s]

Epoch 2/3 | train | Loss: 1.6782 | Jaccard: 0.7081


  0%|          | 0/22 [00:00<?, ?it/s]

Epoch 2/3 |  val  | Loss: 1.5603 | Jaccard: 0.7271


  0%|          | 0/195 [00:00<?, ?it/s]

Epoch 3/3 | train | Loss: 1.5403 | Jaccard: 0.7264


  0%|          | 0/22 [00:00<?, ?it/s]

Epoch 3/3 |  val  | Loss: 1.5444 | Jaccard: 0.7324
CPU times: user 38min 16s, sys: 8 s, total: 38min 24s
Wall time: 38min 44s


# Inference

In [10]:
%%time

# Load the test csv (the rows to predict).
test_df = pd.read_csv('../input/tweet-sentiment-extraction/test.csv')
# Force the text column to str so string methods work on every row.
test_df['text'] = test_df['text'].astype(str)
# Test DataLoader (uses get_test_loader's default batch size).
test_loader = get_test_loader(test_df)

# Collected predictions and the per-fold models.
predictions = []
models = []

# Load the weights saved for every fold and keep the models in `models`.
for fold in range(skf.n_splits):
    model = TweetModel()
    model.cuda()
    model.load_state_dict(torch.load(f'roberta_fold{fold+1}.pth'))
    model.eval()
    models.append(model)

for data in test_loader:
    # Move the input tensors to the GPU; offsets are only needed as a NumPy array.
    ids = data['ids'].cuda()
    masks = data['masks'].cuda()
    tweet = data['tweet']
    offsets = data['offsets'].numpy()

    start_logits = []
    end_logits = []
    # Run every fold's model without gradient tracking and store the softmax
    # probabilities as NumPy arrays on the CPU.
    for model in models:
        with torch.no_grad():
            output = model(ids, masks)
            start_logits.append(torch.softmax(output[0], dim=1).cpu().detach().numpy())
            end_logits.append(torch.softmax(output[1], dim=1).cpu().detach().numpy())
    # Ensemble: average the probabilities over the models (axis 0).
    start_logits = np.mean(start_logits, axis=0)
    end_logits = np.mean(end_logits, axis=0)
    for i in range(len(ids)):    
        start_pred = np.argmax(start_logits[i])
        end_pred = np.argmax(end_logits[i])
        # Decode the predicted span; an inverted span falls back to the whole tweet.
        if start_pred > end_pred:
            pred = tweet[i]
        else:
            pred = get_selected_text(tweet[i], start_pred, end_pred, offsets[i])
        # Keep the prediction, in the same order as the test rows.
        predictions.append(pred)

CPU times: user 2min 39s, sys: 6.19 s, total: 2min 45s
Wall time: 2min 46s


# Submission

In [11]:
# Load the sample submission to get the expected file layout.
sub_df = pd.read_csv('../input/tweet-sentiment-extraction/sample_submission.csv')
# Fill the answer column with the predictions.
sub_df['selected_text'] = predictions
# Post-processing for single-word predictions only: shorten repeated punctuation.
# NOTE(review): '..' is collapsed before '...', so a lone '...' has already
# become '..' by the time the third rule runs, and that rule then only fires
# on longer runs of dots -- confirm this ordering is intended.
sub_df['selected_text'] = sub_df['selected_text'].apply(lambda x: x.replace('!!!!', '!') if len(x.split())==1 else x)
sub_df['selected_text'] = sub_df['selected_text'].apply(lambda x: x.replace('..', '.') if len(x.split())==1 else x)
sub_df['selected_text'] = sub_df['selected_text'].apply(lambda x: x.replace('...', '.') if len(x.split())==1 else x)
# Write the submission csv without an extra index column.
sub_df.to_csv('submission.csv', index=False)
# Show the first rows for a quick check.
sub_df.head()

Unnamed: 0,textID,selected_text
0,f87dea47db,last session of the day
1,96d74cb729,exciting
2,eee518ae67,such a shame!
3,01082688c6,happy bday!
4,33987a8ee5,i like it!!
