In [1]:
import pandas as pd
from transformers import AutoTokenizer,AutoModel, BertTokenizer, BertModel
from datasets import load_dataset, Dataset
import nltk
from nltk.corpus import stopwords
import torch


In [2]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kapsu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
import numpy as np

In [4]:
imdb = load_dataset('imdb',split='train')
imdb = imdb.shard(8, index=1)
imdb.set_format("torch",columns=['text','label'])

In [5]:
def remove_stop_words(example):
    wrds = example['text'].split(' ')
    flts = [w for w in wrds if w.lower() not in stop_words]
    str = ""
    
    for f in flts:
        str+= f+" "
    
    new_one = {'text':str[:-1],'label':example['label']}
    return new_one

In [6]:
print(type(imdb))

<class 'datasets.arrow_dataset.Dataset'>


In [7]:
imdb.map(remove_stop_words)

Dataset({
    features: ['text', 'label'],
    num_rows: 3125
})

In [8]:
imdbb = imdb.train_test_split(test_size=0.2,stratify_by_column='label')
imdb_train = imdbb['train']
imdb_test = imdbb['test']

In [9]:
TRAIN_BATCH_SIZE = 12
TEST_BATCH_SIZE = 4

In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [11]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
n_train = imdb_train.shape[0] / TRAIN_BATCH_SIZE
tokenized_train_batches = []
tokenized_train_batches_y = []

for i in range(int(n_train)):
    btch = imdb_train['text'][TRAIN_BATCH_SIZE*i : min(TRAIN_BATCH_SIZE*(i+1), imdb_train.shape[0])]
    tp = tokenizer.batch_encode_plus(btch,max_length=512, padding='max_length', truncation=True,return_tensors='pt')
    tokenized_train_batches.append(tp)
    tokenized_train_batches_y.append(imdb_train['label'][TRAIN_BATCH_SIZE*i : min(TRAIN_BATCH_SIZE*(i+1), imdb_train.shape[0])])
    
    if i % (n_train//10) == 0:
        print(f' finished {int((i / (n_train)) * 100)}%')

print(len(tokenized_train_batches))

In [14]:
n_test = imdb_test.shape[0] / TEST_BATCH_SIZE

tokenized_test_batches = []
tokenized_test_batches_y = []

for i in range(int(n_test)):
    btch = imdb_test['text'][TEST_BATCH_SIZE*i : min(TEST_BATCH_SIZE*(i+1), imdb_test.shape[0])]
    tp = tokenizer.batch_encode_plus(btch,max_length=512, padding='max_length', truncation=True,return_tensors='pt')
    tokenized_test_batches.append(tp)
    tokenized_test_batches_y.append(imdb_test['label'][TEST_BATCH_SIZE*i : min(TEST_BATCH_SIZE*(i+1), imdb_test.shape[0])])
    
    if i % (n_test//10) == 0:
        print(f' finished {int((i / (n_test)) * 100)}%')

print(len(tokenized_test_batches))
print(len(tokenized_test_batches_y))

 finished 0%
 finished 9%
 finished 19%
 finished 28%
 finished 38%
 finished 48%
 finished 57%
 finished 67%
 finished 76%
 finished 86%
 finished 96%
156
156


In [None]:
# tokenized_text_train = tokenizer.batch_encode_plus(imdb_train['text'],max_length=512, padding='max_length', truncation=True,return_tensors='pt')
# tokenized_text_train = tokenized_text_train.to(device)

In [None]:
# text_tensor_test = tokenizer.batch_encode_plus(imdb_test['text'],max_length=512, padding='max_length', truncation=True,return_tensors='pt')
# text_tensor_test = text_tensor_test.to(device)

In [4]:
class FBert(torch.nn.Module):
    
    def __init__(self) -> None:
        super(FBert,self).__init__()
        self.l1 = BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.2)
        self.l3 = torch.nn.Linear(768,64)
        self.l4 = torch.nn.Linear(64,1)
        self.l5 = torch.nn.Sigmoid()
        
    def forward(self, input):
        
        ids = input['input_ids']
        tto = input['token_type_ids']
        attn = input['attention_mask']
        
        _, t = self.l1(ids, attention_mask = attn, token_type_ids = tto, return_dict=False)
        # print(f'a1[0] -> {t.size()}')
        a2 = self.l2(t)
        # print(f'a2 -> {a2.size()}')
        a3 = self.l3(a2)
        # print(f' a3 -> {a3.size()}')
        a4 = self.l4(a3)
        # print(f' a4 -> {a4.size()}')
        a5 = self.l5(a4)
        # print(f' a5 -> {a5.size()}')
        a6 = a5.squeeze()
        # print(f' a6 -> {a6.size()}')
        return a6
        

In [7]:
my_model = FBert()
my_model = my_model.to(device)

In [9]:
from torchviz import make_dot

In [None]:
make_dot(yhat, params=dict(list(model.named_parameters()))).render("rnn_torchviz", format="png")


In [20]:
loss_fn = torch.nn.BCELoss()
optim = torch.optim.Adam(my_model.parameters(),lr=1e-5)

In [None]:
epochs = 10

In [36]:
test_loss = {}

In [None]:
def train_one_epoch(epoch=0):
    my_model.train()
    
    count = 0
    
    for batch, batch_y in zip(tokenized_train_batches, tokenized_train_batches_y):
        batch.to(device)
        preds = my_model(batch)
        
        actual = torch.as_tensor(batch_y,device=device,dtype=torch.float)
        loss = loss_fn(preds,actual)

        optim.zero_grad()

        loss.backward()

        optim.step()
        
        if count % 20 == 0:
            print(f'onto batch: {count}')
            
        count += 1

In [38]:
def evaluate(model = my_model, epoch=0):
    
    loss_graph = []
    losses = []
    
    with torch.no_grad():
        
        my_model.eval()
        
        count = 0
        
        total_loss = 0
        
        for b, b_y in zip(tokenized_test_batches, tokenized_test_batches_y):
            b.to(device)
            test_pred = my_model(b)
            actual_test = torch.as_tensor(b_y,device=device,dtype=torch.float)
            ts_ls = loss_fn(test_pred, actual_test)
            losses.append(ts_ls)
            total_loss += ts_ls
            count +=1
            print(f'Batch {count}, Test loss: {total_loss/count}')
            loss_graph.append(total_loss/count)
            
        total_loss /= len(tokenized_test_batches)
        test_loss[epoch] = total_loss
        print(f'Epoch {epoch}, Test loss: {total_loss}')
        return total_loss, loss_graph, losses

In [16]:
def evaluate_preds(model, epoch=0):
    
    predicted = []
    actual = []
    
    with torch.no_grad():
        
        my_model.eval()
        
        for b, b_y in zip(tokenized_test_batches, tokenized_test_batches_y):
            b.to(device)
            test_pred = model(b)
            predicted.extend(test_pred.cpu().tolist())
            actual.extend(b_y.flatten().tolist())


        return predicted,actual

In [None]:
torch.cuda.empty_cache()

In [None]:
# train_one_epoch()

In [None]:
# print(test_loss)

In [None]:
# torch.save(my_model,"temp1.pth")

In [21]:
nm = torch.load("temp1.pth")
nm = nm.to(device)

In [22]:
print(tokenized_test_batches_y[1])

tensor([1, 0, 0, 0])


In [23]:
nm(tokenized_test_batches[1].to(device))

tensor([0.9392, 0.0740, 0.0633, 0.0568], device='cuda:0',
       grad_fn=<SqueezeBackward0>)

In [None]:
torch.cuda.empty_cache()

In [24]:
p,a = evaluate_preds(nm)

In [25]:
print(len(p),len(a))

624 624


In [28]:
print(type(p))
pclass = [1 if x > 0.5 else 0 for x in p]

<class 'list'>


In [30]:
set(pclass)

{0, 1}

In [31]:
def calculate_metrics(p,a):
    ps = precision_score(a,p)
    rs = recall_score(a,p)
    f1 = f1_score(a,p)
    ac = accuracy_score(a,p)
    
    return ps,rs,f1,ac

In [32]:
print(calculate_metrics(pclass,a))

(0.9201277955271565, 0.9230769230769231, 0.9216, 0.9214743589743589)


In [39]:
evaluate(model = nm)

Batch 1, Test loss: 0.7024685144424438
Batch 2, Test loss: 0.696919858455658
Batch 3, Test loss: 0.6922356486320496
Batch 4, Test loss: 0.6941789388656616
Batch 5, Test loss: 0.6957554221153259
Batch 6, Test loss: 0.691014289855957
Batch 7, Test loss: 0.6935001611709595
Batch 8, Test loss: 0.6951670050621033
Batch 9, Test loss: 0.6951091885566711
Batch 10, Test loss: 0.6964712142944336
Batch 11, Test loss: 0.6939231753349304
Batch 12, Test loss: 0.6943578124046326
Batch 13, Test loss: 0.6961660385131836
Batch 14, Test loss: 0.6964450478553772
Batch 15, Test loss: 0.6965811848640442
Batch 16, Test loss: 0.6973094344139099
Batch 17, Test loss: 0.6980265378952026
Batch 18, Test loss: 0.6984477043151855
Batch 19, Test loss: 0.6990374326705933
Batch 20, Test loss: 0.6989561915397644
Batch 21, Test loss: 0.698300838470459
Batch 22, Test loss: 0.6986575722694397
Batch 23, Test loss: 0.6996200680732727
Batch 24, Test loss: 0.699577808380127
Batch 25, Test loss: 0.699425995349884
Batch 26, Test

(tensor(0.6960, device='cuda:0'),
 [tensor(0.7025, device='cuda:0'),
  tensor(0.6969, device='cuda:0'),
  tensor(0.6922, device='cuda:0'),
  tensor(0.6942, device='cuda:0'),
  tensor(0.6958, device='cuda:0'),
  tensor(0.6910, device='cuda:0'),
  tensor(0.6935, device='cuda:0'),
  tensor(0.6952, device='cuda:0'),
  tensor(0.6951, device='cuda:0'),
  tensor(0.6965, device='cuda:0'),
  tensor(0.6939, device='cuda:0'),
  tensor(0.6944, device='cuda:0'),
  tensor(0.6962, device='cuda:0'),
  tensor(0.6964, device='cuda:0'),
  tensor(0.6966, device='cuda:0'),
  tensor(0.6973, device='cuda:0'),
  tensor(0.6980, device='cuda:0'),
  tensor(0.6984, device='cuda:0'),
  tensor(0.6990, device='cuda:0'),
  tensor(0.6990, device='cuda:0'),
  tensor(0.6983, device='cuda:0'),
  tensor(0.6987, device='cuda:0'),
  tensor(0.6996, device='cuda:0'),
  tensor(0.6996, device='cuda:0'),
  tensor(0.6994, device='cuda:0'),
  tensor(0.6995, device='cuda:0'),
  tensor(0.6987, device='cuda:0'),
  tensor(0.6988, devi