In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast

import matplotlib.pyplot as plt
import nltk
import re
import json

# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing at the first `.to(device)` call.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the labelled tables; only their 'text' and 'answer' columns are used.
df_train = pd.read_csv("df_train.csv")
df_test = pd.read_csv("df_test.csv")

test_text, test_labels = df_test['text'], df_test['answer']

# train_test_split returns (X_first, X_second, y_first, y_second), where the
# second part receives `test_size` of the rows. With the unpacking order used
# here, validation gets the 30% part and training gets the 70% part, both
# stratified by label.
val_text, train_text, val_labels, train_labels = train_test_split(
    df_train['text'],
    df_train['answer'],
    test_size=0.7,
    random_state=42,
    stratify=df_train['answer'],
)

In [None]:
# The encoder and its tokenizer are loaded from the same pretrained checkpoint.
PRETRAINED_NAME = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(PRETRAINED_NAME)
bert = AutoModel.from_pretrained(PRETRAINED_NAME)

In [None]:
max_seq_len = 50  # every sentence is padded / truncated to this many tokens


def encode_texts(texts, max_length=max_seq_len):
    """Tokenize a pandas Series of sentences into fixed-length BERT inputs.

    Args:
        texts: pandas Series of strings (converted with ``.tolist()``).
        max_length: number of tokens each sentence is padded / truncated to.

    Returns:
        The tokenizer's batch encoding; 'input_ids' and 'attention_mask'
        are read from it downstream. Token type ids are not requested.
    """
    return tokenizer.batch_encode_plus(
        texts.tolist(),
        max_length=max_length,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        padding='max_length',
        truncation=True,
        return_token_type_ids=False,
    )


tokens_train = encode_texts(train_text)
tokens_val = encode_texts(val_text)
tokens_test = encode_texts(test_text)

In [None]:
def _to_tensors(tokens, labels):
    """Return (input_ids, attention_mask, labels) as tensors for one split."""
    ids = torch.tensor(tokens['input_ids'])
    attention = torch.tensor(tokens['attention_mask'])
    targets = torch.tensor(labels.tolist())
    return ids, attention, targets


train_seq, train_mask, train_y = _to_tensors(tokens_train, train_labels)
val_seq, val_mask, val_y = _to_tensors(tokens_val, val_labels)
test_seq, test_mask, test_y = _to_tensors(tokens_test, test_labels)

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 32

# Each dataset item is an (input_ids, attention_mask, label) triple.
train_data = TensorDataset(train_seq, train_mask, train_y)
val_data = TensorDataset(val_seq, val_mask, val_y)

# Training batches are drawn in random order; validation keeps dataset order.
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)

train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, sampler=train_sampler)
val_dataloader = DataLoader(dataset=val_data, batch_size=batch_size, sampler=val_sampler)

In [None]:

# Freeze the pretrained encoder: with requires_grad switched off, no gradients
# are computed for any of BERT's own parameters during backpropagation.
for param in bert.parameters():
    param.requires_grad = False

In [None]:
class BERT_Arch(nn.Module):
    """Two-class sentence classifier: a BERT encoder followed by a small MLP head.

    The head is Linear(hidden, 512) -> ReLU -> Dropout(0.1) -> Linear(512, 2)
    -> LogSoftmax, so ``forward`` returns per-class log-probabilities.

    Args:
        bert: encoder module called as ``bert(sent_id, attention_mask=mask)``;
            the element at position 1 of its output is used as the sentence
            representation.
    """

    def __init__(self, bert):
        super(BERT_Arch, self).__init__()
        self.bert = bert
        # Use the encoder's own hidden size when it exposes one; 768 (the value
        # previously hard-coded) remains the fallback.
        hidden_size = getattr(getattr(bert, 'config', None), 'hidden_size', 768)
        self.dropout = nn.Dropout(0.1)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(hidden_size, 512)
        self.fc2 = nn.Linear(512, 2)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return log-probabilities with one row per input sequence and 2 columns."""
        outputs = self.bert(sent_id, attention_mask=mask)
        # Index by position instead of tuple-unpacking: unpacking a dict-like
        # model output iterates over its keys (strings), whereas integer
        # indexing yields the tensor for both tuple and dict-like outputs.
        cls_hs = outputs[1]
        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x

# Attach the classification head to the encoder and move everything to `device`.
model = BERT_Arch(bert)
model = model.to(device)

# `transformers.AdamW` is deprecated and no longer shipped by recent
# transformers releases, so use the PyTorch optimizer instead. eps and
# weight_decay are set explicitly to the old transformers defaults
# (1e-6 and 0.0), because torch.optim.AdamW defaults to 1e-8 and 0.01.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, eps=1e-6, weight_decay=0.0)

In [None]:
from sklearn.utils.class_weight import compute_class_weight
# Keyword arguments are required by current scikit-learn releases (the
# positional form raises TypeError there) and are accepted by older ones too.
class_wts = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels,
)
print(class_wts)

weights = torch.tensor(class_wts, dtype=torch.float)
weights = weights.to(device)
# BERT_Arch ends in LogSoftmax, so NLLLoss on its output is the (class-weighted)
# cross-entropy; hence the variable name.
cross_entropy = nn.NLLLoss(weight=weights)

In [None]:
from sklearn.metrics import f1_score
def train():
    """Run one optimisation pass over ``train_dataloader``.

    Uses the notebook-level ``model``, ``optimizer``, ``cross_entropy`` and
    ``device``.

    Returns:
        (avg_loss, total_preds): the mean of the per-batch losses, and the
        model outputs of every batch concatenated along axis 0 as a NumPy array.
    """
    model.train()
    running_loss = 0
    epoch_outputs = []
    n_batches = len(train_dataloader)

    for step, batch in enumerate(train_dataloader):
        # Progress line every 50 batches (skipping the very first one).
        if step != 0 and step % 50 == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, n_batches))

        sent_id, mask, labels = (tensor.to(device) for tensor in batch)

        model.zero_grad()
        outputs = model(sent_id, mask)
        loss = cross_entropy(outputs, labels)
        running_loss += loss.item()

        loss.backward()
        # Clip the global gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        epoch_outputs.append(outputs.detach().cpu().numpy())

    return running_loss / n_batches, np.concatenate(epoch_outputs, axis=0)

def evaluate():
    """Compute the loss and raw outputs of ``model`` over ``val_dataloader``.

    Uses the notebook-level ``model``, ``cross_entropy`` and ``device``.
    The model is switched to eval mode and no gradients are tracked.

    Returns:
        (avg_loss, total_preds): the mean of the per-batch losses, and the
        model outputs of every batch concatenated along axis 0 as a NumPy array.
    """
    model.eval()
    total_loss = 0
    total_preds = []
    n_batches = len(val_dataloader)

    for step, batch in enumerate(val_dataloader):
        # Progress line every 50 batches. The elapsed-time computation that
        # used to sit here called undefined names (format_time, t0) and raised
        # NameError as soon as the 50th batch was reached; its result was
        # never used, so it is dropped.
        if step % 50 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, n_batches))

        batch = [t.to(device) for t in batch]
        sent_id, mask, labels = batch

        with torch.no_grad():
            preds = model(sent_id, mask)
            loss = cross_entropy(preds, labels)
            total_loss = total_loss + loss.item()
            total_preds.append(preds.detach().cpu().numpy())

    avg_loss = total_loss / n_batches
    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

In [None]:
best_valid_loss = float('inf')
train_losses = []
valid_losses = []
epochs = 30

for epoch in range(epochs):
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    # Only the average losses are needed here; the returned outputs are ignored.
    train_loss = train()[0]
    valid_loss = evaluate()[0]

    # Checkpoint the weights whenever validation loss reaches a new minimum.
    is_new_best = valid_loss < best_valid_loss
    if is_new_best:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights_new.pt')

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')

In [None]:
# One x-axis position per recorded epoch, numbered from 1.
x = range(1,len(train_losses)+1)

# Draw both per-epoch loss curves on one wide figure so they can be compared.
plt.figure(figsize=(16, 5))
plt.plot(x, train_losses, 'b', label='Training loss')
plt.plot(x, valid_losses, 'r', label='Validation loss')
plt.title('Training and validation loss')
plt.grid()
plt.legend()

In [None]:
path = 'saved_weights_new.pt'
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written during a GPU run can still be loaded on a CPU-only machine.
model.load_state_dict(torch.load(path, map_location=device))

<All keys matched successfully>

In [None]:
import torch
from sklearn.metrics import accuracy_score, roc_curve, auc

# NOTE(review): no visible cell writes 'bertowy_model'; it is presumably a whole
# pickled model object saved elsewhere - confirm. torch.load unpickles it, which
# can execute arbitrary code, so only load files from a trusted source.
# map_location keeps the load working when the file was saved from a GPU run.
saved_model = torch.load('bertowy_model', map_location=device)

def evaluate_roc(probs, y_true):
    """Print AUC and accuracy for binary scores and draw the ROC curve.

    Args:
        probs: array of scores for the positive class; scores >= 0.5 are
            counted as class 1 when computing accuracy.
        y_true: ground-truth binary labels aligned with ``probs``.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, probs)
    roc_auc = auc(false_pos_rate, true_pos_rate)
    print(f'AUC: {roc_auc:.4f}')

    # Threshold the scores at 0.5 to obtain hard labels for the accuracy figure.
    hard_labels = np.where(probs >= 0.5, 1, 0)
    accuracy = accuracy_score(y_true, hard_labels)
    print(f'Accuracy: {accuracy*100:.2f}%')

    plt.title('Receiver Operating Characteristic')
    plt.plot(false_pos_rate, true_pos_rate, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    # Dashed diagonal: reference line where TPR equals FPR.
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

# Put the loaded model in eval mode so dropout is disabled and the test-set
# predictions are deterministic.
saved_model.eval()
with torch.no_grad():
    preds = saved_model(test_seq.to(device), test_mask.to(device))
    preds = preds.detach().cpu().numpy()

In [None]:
# `preds` is left untouched so this cell can be re-run; derived values get
# their own names.
y_true = test_y.numpy()
pred_labels = np.argmax(preds, axis=1)

# The ROC curve needs a continuous score, not hard 0/1 labels.
# Assumes saved_model emits per-class log-probabilities like BERT_Arch
# (LogSoftmax head), so exp() of column 1 is P(class 1) - TODO confirm.
pos_probs = np.exp(preds[:, 1])

print(classification_report(y_true, pred_labels))

evaluate_roc(pos_probs, y_true)

pd.crosstab(y_true, pred_labels)

In [None]:
import nltk
# Download NLTK's 'punkt' resource; nltk.tokenize.sent_tokenize (used in
# split_sentence_id below) relies on it.
nltk.download('punkt')
import json
import pandas as pd
import time

In [None]:
def split_sentence_id(data, id=None):
    """Split each selected record's text into sentences, keeping the record id.

    Args:
        data: indexable collection of records; each record is read through
            its 'text' and 'id' keys.
        id: iterable of positions/keys into ``data`` to process. Defaults to
            every position ``0 .. len(data) - 1`` when omitted.

    Returns:
        A flat list of ``{'text': sentence, 'id': record_id}`` dicts in input
        order. Every sentence is guaranteed to end with a full stop.
    """
    if id is None:
        id = range(len(data))

    result = []
    for i in id:
        record = data[i]
        for sentence in nltk.tokenize.sent_tokenize(record['text']):
            # endswith() is also safe for an empty string, unlike indexing [-1].
            if not sentence.endswith('.'):
                sentence = sentence + '.'
            result.append({'text': sentence, 'id': record['id']})
    return result


def load_data(file):
    """Read the JSON document at path ``file`` and return the decoded object."""
    # Explicit UTF-8 matches what save_data writes and avoids depending on the
    # platform's default text encoding.
    with open(file, encoding="utf-8") as f:
        return json.load(f)


def save_data(file, data):
    """Write ``data`` as JSON with 4-space indentation to ``file`` (UTF-8, overwriting)."""
    with open(file, mode='w', encoding="utf-8") as out_handle:
        json.dump(data, out_handle, indent=4)


def filtr_model(data, model, vectorizer=None):
    """Keep the texts that ``model`` classifies as 1 and save them to disk.

    Each text is vectorised, wrapped in a one-row DataFrame and passed to
    ``model.predict``; texts whose prediction equals ``[1]`` are kept.
    Progress, elapsed time and acceptance statistics are printed, and the
    accepted texts are written to 'filtered_data.json' via ``save_data``.

    Args:
        data: sized iterable of texts.
        model: object exposing ``predict(frame)``.
        vectorizer: object exposing ``transform([text])`` whose result has
            ``.todense()``. When omitted, the notebook-level ``vectd`` is used
            (it must already exist in that case).

    Returns:
        The list of accepted texts, in input order.
    """
    if vectorizer is None:
        # Backward-compatible fallback to the global the function used to rely on.
        vectorizer = vectd

    start_part = time.time()
    total = len(data)
    next_mark = 10  # next progress percentage to announce
    result_abstract = []

    for i, text in enumerate(data, start=1):
        if round(i * 100 / total) == next_mark:
            print("---------------- Progress: {:.0f}% ----------------".format(next_mark))
            next_mark += 10

        vec = pd.DataFrame(vectorizer.transform([text]).todense())
        result = model.predict(vec)
        if result == [1]:
            result_abstract.append(text)

    hours, rem = divmod(time.time() - start_part, 3600)
    minutes, seconds = divmod(rem, 60)

    accepted = len(result_abstract)
    # Guard the percentage against an empty input (previously ZeroDivisionError).
    accepted_pct = accepted * 100 / total if total else 0.0

    print("Czas: ","{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))
    print('\n')
    print("Wielkośc zbioru: ", total)
    print("Wynik:", accepted)
    print("Procent zaakceptowanych: {:.2f}%".format(accepted_pct))

    save_data('filtered_data.json', result_abstract)
    return result_abstract

In [None]:
abstract = load_data('last_abstract.json')
data = pd.Series(abstract)
# split_sentence_id takes the positions to process as its second argument;
# pass the Series index so that every abstract is split.
split_data = split_sentence_id(data, data.index)
len(split_data)

In [None]:
import torch, gc

# Run a garbage-collection pass, then ask PyTorch to release its cached but
# currently unused GPU memory.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# NOTE(review): abs_seq and abs_mask are not assigned at notebook level in any
# visible cell (they only exist as locals inside predict_sentence), so this
# cell presumably depends on state from an earlier interactive session and
# would raise NameError on a fresh kernel - confirm, then fix or remove.
with torch.no_grad():

  preds = model(abs_seq.to(device), abs_mask.to(device))
  preds = preds.detach().cpu().numpy()

# Reduce each output row to the index of its highest-scoring column.
preds = np.argmax(preds, axis = 1)

In [None]:
# torch.load unpickles the stored object, so only load files from a trusted
# source. map_location keeps the load working on a machine without the device
# the file was saved from.
bert_model = torch.load('bertowy_model', map_location=device)

In [None]:
def predict_sentence(model, numb, batch_size=200, max_seq_len=50):
    """Classify the sentences in the notebook-level ``split_data`` batch by batch.

    Uses the notebook-level ``tokenizer`` and ``device``.

    Args:
        model: torch module called as ``model(input_ids, attention_mask)``;
            the argmax over axis 1 of its output is taken as the prediction.
        numb: maximum number of batches to process.
        batch_size: sentences per batch (default 200).
        max_seq_len: tokens each sentence is padded / truncated to (default 50).

    Returns:
        1-D NumPy integer array with one predicted class index per processed
        sentence, aligned with ``split_data`` from position 0. Processing stops
        early when ``split_data`` is exhausted, and a shorter final batch is
        handled instead of raising IndexError.
    """
    # Inference only: make sure dropout is disabled.
    model.eval()

    batch_preds = []
    n_done = 0
    for i in range(numb):
        # Slice once per batch rather than once per sentence.
        batch = split_data[i * batch_size:(i + 1) * batch_size]
        if len(batch) == 0:
            break
        texts = [item['text'] for item in batch]

        tokens_split = tokenizer.batch_encode_plus(
            texts,
            max_length=max_seq_len,
            # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
        )
        abs_seq = torch.tensor(tokens_split['input_ids'])
        abs_mask = torch.tensor(tokens_split['attention_mask'])

        with torch.no_grad():
            preds = model(abs_seq.to(device), abs_mask.to(device))
        labels = np.argmax(preds.detach().cpu().numpy(), axis=1)

        # Collect per-batch arrays and concatenate once at the end instead of
        # re-allocating the whole result with np.append on every iteration.
        batch_preds.append(labels)
        n_done += len(labels)
        if i % 200 == 0:
            print((i + 1), ":", n_done)

    if not batch_preds:
        return np.array([], dtype=np.int64)
    return np.concatenate(batch_preds)

In [None]:
def get_acc_sentence(model, numb):
    """Return the entries of ``split_data`` that the model labels as class 1.

    Predictions come from ``predict_sentence(model, numb)`` and are matched to
    the notebook-level ``split_data`` by position. Summary counts are printed.

    Args:
        model: passed through to ``predict_sentence``.
        numb: passed through to ``predict_sentence``.

    Returns:
        List of the accepted ``split_data`` entries, in order.
    """
    data = predict_sentence(model, numb)
    total = len(data)
    final_sentence = [split_data[i] for i in range(total) if data[i] == 1]
    accepted = len(final_sentence)
    # Guard the percentage against an empty prediction set (previously
    # ZeroDivisionError).
    accepted_pct = accepted * 100 / total if total else 0.0

    print("Liczba zdań: ", total)
    print("Liczba zaakceptowanych: ", accepted)
    print("Procent zaakceptowanych: {:.2f}%".format(accepted_pct))

    return final_sentence

In [None]:
# Request at most 6000 batches, but never more whole 200-sentence batches than
# split_data actually contains, so the fixed-size slicing inside
# predict_sentence cannot run past the end of the list.
n_batches = min(6000, len(split_data) // 200)
final_sent = get_acc_sentence(bert_model, n_batches)
save_data('wyniki_z_bertowego_modelu.json', final_sent)