In [1]:
from torch.utils.data import Dataset
import numpy as np
import matplotlib.pyplot as plt
import torch
from typing import List, Union
from transformers import AutoTokenizer, AutoModel

class MyDataset(Dataset):
    def __init__(self, 
                ids: List[str], 
                speakers: List[str], 
                sexes: List[str], 
                texts: List[str], 
                texts_en: List[str], 
                labels: List[bool],
                device: torch.device = torch.device('cpu'),
                model_name: str = 'distilbert/distilbert-base-uncased-finetuned-sst-2-english',
                max_length: int = 512
        ):
        assert len(ids) == len(speakers) == len(sexes) == len(texts) == len(texts_en) == len(labels)
        self.ids = []
        self.speakers = []
        self.sexes = []
        self.texts = []
        self.texts_en = []
        self.embeddings = []
        self.attention_masks = []
        self.labels = []
        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        for i in range(len(ids)):
            text = texts[i]
            inputs = self.tokenizer(text, add_special_tokens=True, return_tensors='pt', padding='max_length',max_length=max_length)
            if inputs['input_ids'].shape[1] <= max_length:
                self.ids.append(ids[i])
                self.speakers.append(speakers[i])
                self.sexes.append(sexes[i])
                self.texts.append(texts[i])
                self.texts_en.append(texts_en[i])
                self.embeddings.append(inputs['input_ids'][0])
                self.attention_masks.append(inputs['attention_mask'])
                self.labels.append(torch.tensor((labels[i]), dtype=torch.long))
                
        print(f'Loaded {len(self.ids)}/{len(ids)} samples.')

    def __getitem__(self, index):
        return self.ids[index], self.speakers[index], self.sexes[index], self.texts[index], \
                self.texts_en[index], self.embeddings[index][:512].to(self.device), self.attention_masks[index][0][:512].to(self.device), self.labels[index]
            
    def __len__(self):
        return len(self.ids)

    def set_device(self, device: torch.device):
        '''
        Sets the device to the given device.
        '''
        self.device = device

In [None]:
import types
my_module = types.ModuleType('dataset', 'jerko')
my_module.__dict__.update({'MyDataset': MyDataset})
import sys
sys.modules['dataset'] = my_module
#MyDataset.__module__ = 'dataset'
dataset_train = torch.load('/kaggle/input/political-orientation-short/train_dataset_all.pt')
dataset_valid = torch.load('/kaggle/input/political-orientation-short-en/train_dataset_all.pt')

In [None]:
from torch.utils.data import Dataset, DataLoader
import numpy as np
import matplotlib.pyplot as plt
import torch
from typing import List, Union
from transformers import AutoTokenizer, AutoModel, PreTrainedModel
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, BertForSequenceClassification
import pandas as pd

from sklearn.metrics import accuracy_score, confusion_matrix

def evaluate(dataset: Dataset, model: PreTrainedModel, device: torch.device = torch.device('cpu'), plot: bool = False):
    '''
    Evaluates the model on the given dataset.
    
    Parameters:
        dataset: Dataset
            The dataset to evaluate on.
        model: PreTrainedModel
            The model to evaluate.
        device: torch.device
            The device to use.
        plot: bool
    '''
    #model.to(device)
    model.eval()
    loader = DataLoader(dataset, batch_size=16, shuffle=False)
    correct_labels = []
    model_predictions = []
    probs = []
    attentions = []
    embeddings = []
    texts = []
    with torch.no_grad():
        for batch in loader:
            id_, speaker, sex, text, text_en, embedding, attention_mask, label = batch
            texts.extend(text_en)
            embedding = embedding.to(device)
            attention_mask = attention_mask.to(device).squeeze(1)
            assert(attention_mask.nonzero().size() == embedding.nonzero().size())
            label = label.to(device)
            model_output = model(input_ids=embedding, labels=label, attention_mask=attention_mask, output_attentions=True)
            embeddings.extend(embedding.cpu())
            
            
            attention = torch.mean(model_output.attentions[-1], dim=1).squeeze()[:,0]
           
           
            attentions.extend(attention.cpu().numpy())
            logits = model_output.logits
            
            prob = torch.max(torch.softmax(logits, dim=1), dim=1)
            
            probs.extend(prob.values.cpu())
            predictions = torch.argmax(logits, dim=1)
            correct_labels.extend(label.cpu().numpy())
            model_predictions.extend(predictions.cpu().numpy())

    accuracy = accuracy_score(correct_labels, model_predictions)
    print(f'Accuracy: {accuracy}')
    print(f'Confusion matrix:\n{confusion_matrix(correct_labels, model_predictions)}')
    
    return correct_labels, model_predictions, probs, attentions, embeddings, texts




In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = torch.load('/kaggle/input/ideology_bert_en/pytorch/1/1/jerko_batica_cased_en.pt')

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:

output = evaluate(dataset_valid,model, 'cuda:0')



Accuracy: 0.9611919391296043
Confusion matrix:
[[12547   210]
 [ 1009 17645]]


In [7]:
labels, predictions, probs, attentions, embeddings, old_texts = output

In [8]:
labels = np.array(labels)
predictions = np.array(predictions)
probs = np.array(probs)
attentions = np.array(attentions)
embeddings = np.array(embeddings)
print(attentions.shape)
print(attentions.reshape(len(labels),-1).shape)

(31411, 512)
(31411, 512)


In [9]:
ind = labels == predictions
print(len(ind))
labels = labels[ind]
predictions = predictions[ind]
probs = probs[ind]
attentions = attentions[ind]
embeddings = embeddings[ind]
texts = []
for i, cond in enumerate(ind):
    if cond:
        texts.append(old_texts[i])
        

31411


In [17]:
most_left = torch.topk(torch.tensor(probs), 5000, largest=False).indices.numpy()
good = most_left[6]
most_important = torch.topk( torch.tensor(attentions[good]), 15).indices.numpy()

print(tokenizer.decode(embeddings[good][most_important]))
for ind in most_left[6:10]:
    print(texts[ind])
most_important = torch.topk( torch.tensor(attentions[most_left]), 5).indices.numpy()
print(most_important.shape)
print(most_left.shape)

counter_l = {}
for ind, att in zip(most_left, most_important):
    words = tokenizer.decode(embeddings[ind][att]).lower().split(' ')
    for word in words:
        if '[SEP]' in word:
            word = word.replace('[SEP]', '')
        if word not in counter_l:
            counter_l[word] = 1
        else:
            counter_l[word] += 1
            
print
most_popular_left = list(map(lambda x: x,sorted(counter_l.items(), key=lambda x: -x[1])))

print(most_popular_left[:100])




##gration here immigrantsam worker Mad [SEP] labour, Mi Recently will want system people
Madam President, I would like Recently, we have been able to read many stories about labour immigrants who are to be expelled because of small mistakes, which have often been made by previous employers, and things that have happened several years ago. <p> This is basically a new practice from the Migration Court, which now means that thousands of people will have their cases tried in a completely new way. It is about people who behave themselves, pay taxes and contribute to society, people who just want to live their lives here. <p> Abuse of the system should, of course, be prosecuted and countered, but this is not about abuse. These are small mistakes, and there is no way whatsoever of correcting them afterwards. This affects the worker very hard.
3. To ask the hon. Member for Perth and North Perthshire, representing the House of Commons Commission, whether the Commission plans to further restrict

In [None]:
most_right = torch.topk(torch.tensor(probs), 5000, largest=True).indices.numpy()

most_important = torch.topk( torch.tensor(attentions[most_right]), 5).indices.numpy()





counter_r = {}
for ind, att in zip(most_right, most_important):
    words = tokenizer.decode(embeddings[ind][att]).lower().split(' ')
    
    for word in words:
        if '[SEP]' in word:
            word = word.replace('[SEP]', '')
        if word not in counter_r:
            counter_r[word] = 1
        else:
            counter_r[word] += 1
            

most_popular_right = list(map(lambda x: x,sorted(counter_r.items(), key=lambda x: -x[1])))
print(most_popular_right[:100])


In [None]:
w = 'treasury'
print(counter_l[w])
print(counter_r[w])