In [59]:
import pandas as pd

# Read the tab-separated file (it has no header row, so column names are
# supplied here) and keep only the label and sentence columns.
data = pd.read_csv(
    'raw/in_domain_dev.tsv',
    sep='\t',
    header=None,
    names=["x", "label", "y", "sentence"],
).loc[:, ['label', 'sentence']]

In [72]:
import torch
import pandas as pd
from torch.utils.data import Dataset

class CoLADataset(Dataset):
    """Dataset over a headerless, tab-separated file of labelled sentences.

    The file is read with four positional columns named ``x``, ``label``,
    ``y`` and ``sentence``; only ``label`` and ``sentence`` are retained.

    Args:
        path: Location of the TSV file, handed directly to ``pandas.read_csv``.
        tokenizer: Callable invoked as ``tokenizer(sentence, return_tensors='pt')``
            for every item (presumably a Hugging Face tokenizer -- confirm
            against the caller).
    """

    def __init__(self, path, tokenizer):
        column_names = ["x", "label", "y", "sentence"]
        frame = pd.read_csv(path, sep='\t', header=None, names=column_names)
        # Only the label and the raw sentence text are needed downstream.
        self.data = frame[['label', 'sentence']]
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of rows read from the file."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the item at positional index ``idx``.

        Returns:
            A dict with two keys: ``"label"``, the row's label wrapped in a
            ``torch.tensor``, and ``"sentence"``, whatever the tokenizer
            returns for the row's sentence.
        """
        row = self.data.iloc[idx]
        # Tokenize on access rather than up front; the tokenizer is called
        # before the label tensor is built.
        encoded = self.tokenizer(row['sentence'], return_tensors='pt')
        return {"label": torch.tensor(row["label"]), "sentence": encoded}

# Build the dataset over the TSV file; `tokenizer` must already exist in the
# kernel (it is created in a separate cell of this notebook).
dataset = CoLADataset(
    path='raw/in_domain_dev.tsv',
    tokenizer=tokenizer,
)

In [73]:
# Display the tokenizer output stored under "sentence" for the item at index 2.
dataset[2]['sentence']

{'input_ids': tensor([[  101,  1996,  6228, 10658, 23277,  8004, 11533,  2993,  6065,  1012,
           102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [85]:
# Feed the tokenized item at index 2 to the model, requesting the hidden
# states, and display the last entry of `hidden_states`.
model(
    **dataset[2]['sentence'],
    output_hidden_states=True,
).hidden_states[-1]

tensor([[[ 0.0756, -0.0985,  0.0128,  ..., -0.0843,  0.0293,  0.6922],
         [ 0.0442,  0.0417, -0.4450,  ...,  0.2266,  0.8602, -0.0695],
         [ 0.7089,  0.3831, -0.0235,  ..., -0.2247,  0.8736,  0.1793],
         ...,
         [ 0.2031, -0.1015,  0.5362,  ...,  0.2164, -0.1809,  0.0601],
         [ 0.9231,  0.3736, -0.4517,  ...,  0.3121, -0.2383, -0.4303],
         [ 0.7308,  0.5111, -0.1045,  ...,  0.5012, -0.4107, -0.2019]]],
       grad_fn=<NativeLayerNormBackward0>)

In [66]:
from transformers import BertModel

In [67]:
# Load the 'bert-base-uncased' checkpoint into a BertModel. The warning printed
# below lists checkpoint weights (the `cls.*` entries) that BertModel does not use.
model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [41]:
# Scratch inspection: take the first child of `model.encoder`, then that
# child's first child, list the sub-modules of its attention output block and
# preview where an adapter would sit. `model` itself is not modified; only the
# local `layers` list is changed and printed.
for i in model.encoder.children():
    for j in i.children():
        attn = j.attention
        inter = j.intermediate  # bound but not used further in this cell
        output = j.output  # bound but not used further in this cell

        attn = attn.output  # rebind `attn` to the attention block's output sub-module
        layers = list(attn.children())
        # NOTE(review): this inserts the AdapterModule class object itself, not
        # an instance (the printed list shows <class 'model.AdapterModule'>) --
        # confirm whether an instantiated module was intended.
        layers.insert(1, AdapterModule)
        print(layers)
        break  # stop after the first inner module
    break  # stop after the first child of the encoder

[Linear(in_features=768, out_features=768, bias=True), <class 'model.AdapterModule'>, LayerNorm((768,), eps=1e-12, elementwise_affine=True), Dropout(p=0.1, inplace=False)]


In [39]:
from model import AdapterModule

In [42]:
from transformers import BertTokenizer

# Load the tokenizer from the same 'bert-base-uncased' checkpoint name that the
# model cell uses.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:01<00:00, 227kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 10.9kB/s]


In [63]:
# Smoke-test the tokenizer on a short string. return_tensors='pt' is passed so
# the result is PyTorch tensors, which is how CoLADataset.__getitem__ calls the
# tokenizer and what the output recorded for this cell shows.
tokenizer('This is good', return_tensors='pt')

{'input_ids': tensor([[ 101, 2023, 2003, 2204,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}