In [1]:
import pandas as pd
from transformers import AutoTokenizer,AutoModel, BertTokenizer, BertForSequenceClassification
from datasets import load_dataset, Dataset
import nltk
from nltk.corpus import stopwords
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kapsu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
imdb = load_dataset('imdb',split='test')
imdb.set_format("torch",columns=['text','label'])

In [5]:
def remove_stop_words(example):
    wrds = example['text'].split(' ')
    flts = [w for w in wrds if w.lower() not in stop_words]
    str = ""
    
    for f in flts:
        str+= f+" "
    
    new_one = {'text':str[:-1],'label':example['label']}
    return new_one

In [12]:
imdb = imdb.train_test_split(0.6,stratify_by_column='label')['train']

In [13]:
imdb.map(remove_stop_words)

Map: 100%|██████████| 7500/7500 [00:01<00:00, 6562.85 examples/s]


Dataset({
    features: ['text', 'label'],
    num_rows: 7500
})

In [14]:
tokenizer = AutoTokenizer.from_pretrained("textattack/bert-base-uncased-yelp-polarity")
model = BertForSequenceClassification.from_pretrained("textattack/bert-base-uncased-yelp-polarity")

In [15]:
loss_fn = torch.nn.BCEWithLogitsLoss()

In [16]:
n = imdb.shape[0]
BATCH_SIZE = 8

shards = int(n / BATCH_SIZE)

loss = 0

loss_rec = {}

all_preds = []
all_acts = []

for i in range(int(shards/5)):
    
    batch = imdb.shard(shards,i,contiguous=True)
    
    X = batch['text']
    y = batch['label']
    
    with torch.no_grad():
        c1 = torch.tensor([0,1])
        c2 = torch.tensor([1,0])
    
        temp = [c1 if y_t == 1 else c2 for y_t in y]
    
        ytt = torch.stack(temp).float()
        tokenized = tokenizer(X,padding='max_length', max_length=512, truncation=True, return_tensors='pt')
    
        preds = model(**tokenized)
        
        all_preds.extend(np.argmax(preds.logits,axis = 1))
        all_acts.extend(y)
        
        smx = torch.nn.Softmax(preds.logits)
    
        t_ls = loss_fn(preds.logits,ytt)
        loss += t_ls
    
    loss_rec[i] = loss/(i+1)
    print(f'avg loss after {i}: {loss/(i+1)}')
    
print(f'net loss: {loss}')
print(f'avg loss: {loss/shards}')

avg loss after 0: 0.03904830291867256
avg loss after 1: 0.035252440720796585
avg loss after 2: 0.2532491981983185
avg loss after 3: 0.2171950489282608
avg loss after 4: 0.25441059470176697
avg loss after 5: 0.21966715157032013
avg loss after 6: 0.32000455260276794
avg loss after 7: 0.3235739469528198
avg loss after 8: 0.37583112716674805
avg loss after 9: 0.45144858956336975
avg loss after 10: 0.4319971799850464
avg loss after 11: 0.4373348653316498
avg loss after 12: 0.40742021799087524
avg loss after 13: 0.3796795904636383
avg loss after 14: 0.397330105304718
avg loss after 15: 0.49873602390289307
avg loss after 16: 0.5364951491355896
avg loss after 17: 0.5574448108673096
avg loss after 18: 0.5290433764457703
avg loss after 19: 0.5425915718078613
avg loss after 20: 0.5584104061126709
avg loss after 21: 0.550232470035553
avg loss after 22: 0.5502983927726746
avg loss after 23: 0.5306203961372375
avg loss after 24: 0.5349090099334717
avg loss after 25: 0.5355023741722107
avg loss after

In [None]:
print(len(loss_rec))

In [None]:
print(type(loss_rec[5]))

In [None]:
import json

In [17]:
def calculate_metrics(p,a):
    ps = precision_score(a,p)
    rs = recall_score(a,p)
    f1 = f1_score(a,p)
    ac = accuracy_score(a,p)
    
    return ps,rs,f1,ac

In [18]:
print(len(all_preds))

1500


In [19]:
print(len(all_acts))

1500


In [20]:
print(calculate_metrics(all_acts,all_preds))

(0.8810126582278481, 0.8721804511278195, 0.8765743073047859, 0.8693333333333333)
