In [28]:
from dataset import MyDataset
import numpy as np
import matplotlib.pyplot as plt
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [24]:
table = pd.read_csv('data/power/power-hr-train.tsv', sep='\t')

In [27]:
table

Unnamed: 0,id,speaker,sex,text,text_en,label
0,hr10338,accc1a779eb3de6e72a1918eae210a7c,M,"Gospodine predsjedniče, uvaženi kolega zastupn...","Mr. President, distinguished counterpart Leko ...",1
1,hr10339,9406c621a816cc966509965a4c0bc023,M,"Poštovani gospodine predsjedniče, kolegice i k...","Dear Mr. President, colleagues and colleagues....",0
2,hr10340,0b6bca6455fd10bdb7490e7bddfe0ca2,M,"Štovani gospodine potpredsjedniče, kolegice i ...","Honored Mr. Vice President, colleagues and coe...",0
3,hr10341,7c65f68cbec0f780f8878ee435e08820,M,"Cijenjeni predsjedniče, cijenjene dame i gospo...","The esteemed President, the esteemed ladies an...",1
4,hr10342,accc1a779eb3de6e72a1918eae210a7c,M,Hvala lijepo gospodine predsjedniče Hrvatskoga...,"Thank you very much, Mr. President of the Croa...",1
...,...,...,...,...,...,...
10736,hr21074,4f3f32bac32e23425cef32dc5f39a468,M,Pa evo iako sa dosta dobrim dijelom ovog se sl...,"Well, here's the thing, even though with a pre...",0
10737,hr21075,9687420c53b3ae0fdf1ef0ae0f5e6a69,M,"Poštovani potpredsjedniče <PARTY>-a, poštovane...","Dear vice president of <PARTY>, respected coll...",1
10738,hr21076,0ab354e5a1a18c9585c59347c401c64f,M,Hvala lijepa g. potpredsjedniče. Poštovani g. ...,"Thank you very much, Mr. Vice President. Mr Se...",1
10739,hr21077,69cc60ac1e142a1533592406409e506d,M,"Hvala lijepo. Uvaženi predsjedniče <PARTY>-a, ...",Thank you very much. The Honorable President o...,0


In [32]:
model = AutoModel.from_pretrained('distilbert/distilbert-base-uncased-finetuned-sst-2-english')

In [33]:
model

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Li

In [34]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")



In [35]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [36]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")




In [56]:
input = tokenizer("Hello, my dog is cute", return_tensors="pt")
input['input_ids'].shape

torch.Size([1, 8])

In [53]:
from torch.utils.data import Dataset
import numpy as np
import matplotlib.pyplot as plt
import torch
from typing import List, Union
from transformers import AutoTokenizer, AutoModel

class MyDataset(Dataset):
    def __init__(self, 
                ids: List[str], 
                speakers: List[str], 
                sexes: List[str], 
                texts: List[str], 
                texts_en: List[str], 
                labels: List[bool],
                device: torch.device = torch.device('cpu'),
                model_name: str = 'distilbert/distilbert-base-uncased-finetuned-sst-2-english',
                max_length: int = 512
        ):
        assert len(ids) == len(speakers) == len(sexes) == len(texts) == len(texts_en) == len(labels)
        self.ids = ids
        self.speakers = []
        self.sexes = []
        self.texts = []
        self.texts_en = []
        self.embeddings = []
        self.labels = []
        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, max_length=max_length)

        for i in range(len(ids)):
            text = texts[i]
            inputs = self.tokenizer(text, return_tensors='pt')
            if inputs['input_ids'].shape[1] <= max_length:
                self.ids.append(ids[i])
                self.speakers.append(speakers[i])
                self.sexes.append(sexes[i])
                self.texts.append(texts[i])
                self.texts_en.append(texts_en[i])
                self.embeddings.append(inputs['input_ids'])
                self.labels.append(labels[i])
                
        print(f'Loaded {len(self.ids)}/{len(ids)} samples.')

    def __getitem__(self, index):
        return self.ids[index], self.speakers[index], self.sexes[index], self.texts[index], \
                self.texts_en[index], self.embeddings[index].to(self.device), self.labels[index]
            
    def __len__(self):
        return len(self.ids)

    def set_device(self, device: torch.device):
        '''
        Sets the device to the given device.
        '''
        self.device = device

In [54]:
table
ids = table['id'].tolist()
speakers = table['speaker'].tolist()
sexes = table['sex'].tolist()
texts = table['text'].tolist()
texts_en = table['text_en'].tolist()
labels = table['label'].tolist()

In [55]:
dataset = MyDataset(ids, speakers, sexes, texts, texts_en, labels)

Token indices sequence length is longer than the specified maximum sequence length for this model (1462 > 512). Running this sequence through the model will result in indexing errors


Loaded 14306/14306 samples.


In [57]:
dataset[3]

('hr10341',
 '064e8aa666f3d2c35a3a475103873041',
 'M',
 'Poštovani gospodine predsjedniče Hrvatskog sabora, poštovani gospodine predsjedniče hrvatske Vlade, dame i gospodo. Ja sam htio svoje pitanje uputiti premijeru gospodinu dr. Ivi Sanaderu ali pošto je on odgovorio na moje pitanje djelomično u odgovoru na pitanje uvaženog zastupnika Bagarića i ja sam isto mislio postaviti pitanje u vezi Statuta grada <PARTY>ara. Pa samo bih molio da mi da svoj ...  \n\t\t\t\t\t\tkomentar na rezultat, na rezultat izbora koji su bili iz referenduma u <PARTY>aru gdje je 99% kućanstva izrazilo želju za jednim gradom i jednom općinom. Hvala lijepa.',
 "Dear Mr. President of the Croatian Parliament, Mr. President of the Croatian Government, ladies and gentlemen. I wanted to refer my question to the Prime Minister Dr. Evey Sanader, but since he answered my question partly in response to the question of the respected representative Bagaric, I also thought to ask a question about the Statute of <PARTY>ar. S

In [None]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")