In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import datasets
import json
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

## Load the tokenized dataset from file
We flatten it so that each segment becomes a separate row with its corresponding label.
Then we create the Train and Evaluation subsets and free memory afterwards.

In [None]:
from datasets import load_from_disk
# Load a dataset from a hard-coded local directory and bind it to `tokenized_dataset`.
tokenized_dataset = load_from_disk(r"C:\Users\ivank\Documents\BERT_projects\Data\45000\final_validation_set_512_low")


In [None]:
# Alternative input for loading and processing the validation set.
# Rebinds `tokenized_dataset`, replacing whatever an earlier cell stored under that name.

tokenized_dataset = load_from_disk('C:\\Users\\ivank\\Documents\\BERT_projects\\validation_set')


In [None]:
# Display the first 20 entries of the `labels` column as a quick sanity check.
tokenized_dataset['labels'][:20]

In [None]:
# Convert the loaded dataset to a pandas DataFrame.
flat_dataset = tokenized_dataset.to_pandas()

In [None]:
# Display the first 10 rows of the DataFrame.
flat_dataset.head(10)

In [None]:
# Explode the three list-valued columns together so every list element becomes its own row,
# then rebuild a fresh 0..n-1 index. Rebinds `flat_dataset`, so re-running this cell explodes again.
# NOTE(review): assumes the three columns hold lists of equal length in each row — confirm.
flat_dataset = flat_dataset.explode(['input_ids', 'token_type_ids', "attention_mask"]).reset_index(drop=True)

In [None]:
# Wrap the flattened DataFrame back into a `datasets.Dataset`.
dataset = datasets.Dataset.from_pandas(flat_dataset)

In [None]:
# Path used when loading and processing the validation set:
# the whole flattened dataset is used as the evaluation set, with no split.

small_eval_dataset = dataset

In [None]:
# Hold out the last 10% of rows for evaluation. Shuffling is disabled so the row
# order of the flattened dataset is preserved (alternative: shuffle=True, seed = 42).
tokenized_dataset_tosplit = dataset.train_test_split(shuffle=False, test_size=0.1)
small_train_dataset, small_eval_dataset = (
    tokenized_dataset_tosplit['train'],
    tokenized_dataset_tosplit['test'],
)

In [None]:
# Print the first 300 evaluation labels for a visual check.
print(small_eval_dataset['labels'][:300])

In [None]:
# Persist the training split to a hard-coded local directory.
small_train_dataset.save_to_disk(r"C:\Users\ivank\Documents\BERT_projects\small_train_data")

In [None]:
# Persist the evaluation split to a hard-coded local directory.
small_eval_dataset.save_to_disk(r"C:\Users\ivank\Documents\BERT_projects\small_eval_data")

In [None]:
# Write the dataset currently bound to `small_eval_dataset` to the validation directory.
# NOTE(review): presumably meant to run after the cell that sets `small_eval_dataset = dataset` — confirm.
small_eval_dataset.save_to_disk(r"C:\Users\ivank\Documents\BERT_projects\small_validation_data")

In [None]:
# Drop the intermediate objects from the namespace now that the splits exist.
del flat_dataset, tokenized_dataset_tosplit, tokenized_dataset, dataset

In [None]:
# Display the number of `input_ids` in the first training example.
len(small_train_dataset[0]['input_ids'])

In [None]:
from datasets import load_from_disk
# Load the evaluation and training datasets from hard-coded local directories,
# rebinding `small_eval_dataset` and `small_train_dataset`.
small_eval_dataset = load_from_disk(r"C:\Users\ivank\Documents\BERT_projects\Data\45000\final_validation_data_512_low")
small_train_dataset = load_from_disk(r"C:\Users\ivank\Documents\BERT_projects\Data\45000\final_processed_data_512_low")
# Disabled alternative inputs. Note that the validation datasets below are never loaded
# while these lines stay commented out.
#small_eval_dataset = load_from_disk(r"C:\Users\ivank\Documents\BERT_projects\new_small_eval_data")
#small_train_dataset = load_from_disk(r"C:\Users\ivank\Documents\BERT_projects\new_small_train_data")
#small_validation_dataset = load_from_disk(r"C:\Users\ivank\Documents\BERT_projects\new_small_validation_data")
#large_validation_dataset = load_from_disk(r"C:\Users\ivank\Documents\BERT_projects\new_sampled_data")

## Initialize PyTorch model
We create DataLoaders without shuffling, with each batch of segments covering one original text pair. 

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)
# Release any cached GPU memory before the model is loaded.
torch.cuda.empty_cache()

In [None]:
from torch.utils.data import DataLoader

# Shuffled loaders with 15 items per batch. Both datasets are switched to torch
# output format first so the loaders yield tensors.
for _split in (small_train_dataset, small_eval_dataset):
    _split.set_format('torch')
#small_validation_dataset.set_format('torch')
#large_validation_dataset.set_format('torch')

train_dataloader = DataLoader(dataset=small_train_dataset, batch_size=15, shuffle=True)
eval_dataloader = DataLoader(dataset=small_eval_dataset, batch_size=15, shuffle=True)
#validation_dataloader = DataLoader(small_validation_dataset, shuffle=True, batch_size=30)
#large_validation_dataloader = DataLoader(large_validation_dataset, shuffle=True, batch_size=30)

In [None]:
from torch.utils.data import DataLoader

# Ordered (unshuffled) loaders with 30 items per batch.
small_train_dataset.set_format(type='torch')
small_eval_dataset.set_format(type='torch')
#small_validation_dataset.set_format('torch')
#large_validation_dataset.set_format('torch')

_ordered_30 = {"shuffle": False, "batch_size": 30}
train_dataloader = DataLoader(small_train_dataset, **_ordered_30)
eval_dataloader = DataLoader(small_eval_dataset, **_ordered_30)
#validation_dataloader = DataLoader(small_validation_dataset, shuffle=False, batch_size=30)
#large_validation_dataloader = DataLoader(large_validation_dataset, shuffle=False, batch_size=30)

In [None]:
# Load "bert-base-cased" with a two-label sequence-classification head and move it to `device`.
# The bare `.to(device)` expression is the cell's last line, so the model repr is displayed.
model4 = ppb.AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels = 2)
model4.to(device)

## Training and evaluation of the initial model

In [None]:
# Optimizer and linear learning-rate schedule for the manual training loop.
# The model loaded by this notebook is bound to `model4`; the name `model` is never
# assigned, so referencing it raised NameError on a fresh kernel.
# NOTE(review): newer transformers releases recommend torch.optim.AdamW over
# transformers.AdamW — confirm against the installed version.
optimizer = ppb.AdamW(model4.parameters(), lr=2e-5)
num_epochs = 5
# The scheduler is stepped once per batch, so the schedule spans every batch of every epoch.
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = ppb.get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)


In [None]:
from tqdm.auto import tqdm

# Manual fine-tuning loop: one optimizer step and one scheduler step per batch.
progress_bar = tqdm(range(num_training_steps))

# Train the model that was actually loaded (`model4`). The name `model` is never
# assigned in this notebook, so the previous code raised NameError.
model4.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch to the device the model lives on.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model4(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        # Clear gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()
        progress_bar.update(1)

In [None]:
from datasets import load_metric

# Score model4 on the evaluation loader, accumulating accuracy and F1 batch by batch.
metric1 = load_metric("accuracy")
metric2 = load_metric("f1")
model4.eval()
with torch.no_grad():
    for batch in eval_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model4(**batch)

        logits = outputs.logits
        # The predicted class is the index of the largest logit.
        predictions = logits.argmax(dim=-1)
        for _scorer in (metric1, metric2):
            _scorer.add_batch(predictions=predictions, references=batch["labels"])

print(metric1.compute(), metric2.compute())


In [None]:
from datasets import load_metric

# Accuracy of `model3` over `validation_dataloader`.
# NOTE(review): neither `model3` nor `validation_dataloader` is assigned by any active
# statement visible in this notebook (the validation loader lines are commented out),
# so this cell raises NameError on a fresh kernel — confirm where they should come from.
metric = load_metric("accuracy")
model3.eval()
for batch in validation_dataloader:
    # Move every tensor of the batch to the model's device.
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model3(**batch)

        logits = outputs.logits
        # The predicted class is the index of the largest logit.
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

# Last expression of the cell: displays the aggregated accuracy.
metric.compute()

In [None]:
from datasets import load_metric

# Dedicated handle for the accuracy metric. `compute_metric` previously read the shared
# global `metric`, which a later cell rebinds to F1; after that cell ran, the Trainer
# silently reported F1 through a function meant to report accuracy.
accuracy_metric = load_metric('accuracy')
metric = accuracy_metric  # legacy alias kept for code that still reads `metric`


def compute_metric(eval_pred):
    """Return the accuracy for a Trainer evaluation pass.

    Args:
        eval_pred: a ``(logits, labels)`` pair as passed by ``Trainer`` to
            ``compute_metrics``.

    Returns:
        The result of the accuracy metric's ``compute`` call for the argmax
        predictions against ``labels``.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predictions = np.argmax(logits, axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

In [None]:
from transformers import TrainingArguments, Trainer

# Hyper-parameters for the Trainer run: checkpoints are written and the evaluation
# set is scored at the end of every epoch.
_trainer_settings = dict(
    output_dir="./results4",
    learning_rate=2e-5,
    per_device_train_batch_size=15,
    per_device_eval_batch_size=15,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy="epoch",
    evaluation_strategy="epoch",
)
training_args = TrainingArguments(**_trainer_settings)

# Fine-tune model4 on the training split, reporting `compute_metric` on the eval split.
trainer = Trainer(
    model4,
    training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=ppb.AutoTokenizer.from_pretrained("bert-base-cased"),
    compute_metrics=compute_metric,
)

In [None]:
# Run training from scratch; the commented-out argument would resume from the latest checkpoint instead.
trainer.train()#resume_from_checkpoint=True)

In [None]:
#epoch 5 eval_acc 0.8759461474367408, eval_loss 1.0895
#epoch 4 eval_acc 0.8716353111432706, eval_loss 0.8703
#epoch 3 eval_acc 0.8649212822874183, eval_loss 0.8140
#epoch 2 eval_acc 0.8605315090119721, eval_loss 0.6602
#epoch 1 eval_acc 0.8528921633118449, eval_loss 0.4931

In [None]:
from datasets import load_metric

# F1 gets its own handle instead of rebinding the global `metric`. `compute_metric`
# (the function handed to the Trainer) reads that global, so rebinding it here changed
# what the accuracy function returned depending on which cells had been run.
f1_metric = load_metric('f1')


def compute_metrics(eval_pred):
    """Return the F1 score for a Trainer evaluation pass.

    Args:
        eval_pred: a ``(logits, labels)`` pair as passed by ``Trainer`` to
            ``compute_metrics``.

    Returns:
        The result of the F1 metric's ``compute`` call for the argmax
        predictions against ``labels``.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predictions = np.argmax(logits, axis=-1)
    return f1_metric.compute(predictions=predictions, references=labels)

In [None]:
# Evaluate the trainer's current model on the evaluation split and display the metrics.
trainer.evaluate(eval_dataset=small_eval_dataset)

In [None]:
# Repeat of the evaluation call; the result depends on the model state when the cell is run.
trainer.evaluate(eval_dataset=small_eval_dataset)

In [None]:
# Same evaluation call, kept as a separate cell for the run described in the note below.
trainer.evaluate(eval_dataset=small_eval_dataset)
# evaluate non-pre-trained BERT

In [None]:
# Smoke test: run the first training batch through the model and print its logits.
# Uses `model4`, the model loaded by this notebook; the name `model` is never assigned,
# so the previous code raised NameError.
# eval() matches the equivalent evaluation-batch check further down.
model4.eval()
with torch.no_grad():
    for batch in train_dataloader:
        print(batch)
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model4(**batch, output_hidden_states=True)
        break  # only the first batch is needed
print(outputs.logits)

In [None]:
# Serialize the whole `model4` object (not just a state dict) to 'model_segm_torch'
# in the current working directory.
torch.save(model4, 'model_segm_torch')

In [None]:
# Ask PyTorch to release cached GPU memory it is not currently using.
torch.cuda.empty_cache()

## Load the saved model from file

In [None]:
# Load a fine-tuned checkpoint from a hard-coded local directory and move it to `device`.
# Disabled alternative: restore a model previously pickled with torch.save.
#model2 = torch.load("C:\\Users\\ivank\\Documents\\BERT_projects\\model_segm_torch_old")
model2 = ppb.AutoModelForSequenceClassification.from_pretrained(r"C:\Users\ivank\Documents\BERT_projects\results2\checkpoint-315000")
model2.to(device)


In [None]:
# Rebind `model4` to the checkpoint-225000 weights; the embedding cells below use this model.
model4 = ppb.AutoModelForSequenceClassification.from_pretrained(r"C:\Users\ivank\Documents\BERT_projects\results2\checkpoint-225000")
model4.to(device)

In [None]:
from torch.utils.data import DataLoader

# Ordered (unshuffled) loaders with 15 items per batch, so model outputs come back
# in the same order as the dataset rows.
for _split in (small_train_dataset, small_eval_dataset):
    _split.set_format('torch')

_ordered_15 = dict(shuffle=False, batch_size=15)
train_dataloader = DataLoader(small_train_dataset, **_ordered_15)
eval_dataloader = DataLoader(small_eval_dataset, **_ordered_15)

In [None]:
# Run only the first evaluation batch through model4, keeping the hidden states so
# the following cells can inspect them, then print the logits.
model4.eval()
with torch.no_grad():
    for batch in eval_dataloader:
        print(batch)
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model4(output_hidden_states=True, **batch)
        break  # one batch is enough for inspection
print(outputs.logits)

In [None]:
# Two candidate embeddings: the element-wise sum of the last four hidden-state
# tensors, or the last hidden-state tensor alone. In both cases position 0 of each
# sequence is taken.
_last_layer = outputs.hidden_states[-1]
print(_last_layer)

v = torch.stack(outputs.hidden_states[-4:]).sum(dim=0)
print(v.shape, v[:, 0, :].shape)

g = _last_layer[:, 0, :]
print(g.shape)

In [None]:
# Element-wise sum of the last four hidden-state tensors from the most recent forward pass.
v = torch.stack(outputs.hidden_states[-4:]).sum(0)

In [None]:
# `h` is not assigned anywhere in this notebook, so a bare `del h` raises NameError
# on a fresh kernel. Remove it only if an earlier interactive session left it behind.
globals().pop("h", None)

## Obtain embeddings and save to tensor

In [None]:
# Extract an embedding and the logits for every item of the training loader.
#
# Per-batch results are collected in lists and concatenated once after the loop.
# Calling torch.cat on a growing tensor inside the loop re-copies everything
# accumulated so far on every batch, which is quadratic in the number of batches.
# (The last-layer-only variant is disabled for the training set.)
_four_chunks = []
_logit_chunks = []

print(len(train_dataloader))
model4.eval()
with torch.no_grad():
    for i, batch in enumerate(train_dataloader):
        if i % 1000 == 0:
            print(i)

        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model4(**batch, output_hidden_states=True)

        # Element-wise sum of the last four hidden-state tensors, then position 0 of
        # every sequence. [:,0,:] reads: all sequences, first token, all hidden units.
        four = torch.stack(outputs.hidden_states[-4:]).sum(0)
        cls_four = four[:,0,:]
        # clone() so the stored chunk does not keep the full `four` tensor alive.
        _four_chunks.append(cls_four.clone())

        logits = outputs.logits
        _logit_chunks.append(logits)

# An empty loader still yields an empty tensor on `device`, as before.
train_outputs_four = torch.cat(_four_chunks, 0) if _four_chunks else torch.Tensor().to(device)
train_outputs_logits = torch.cat(_logit_chunks, 0) if _logit_chunks else torch.Tensor().to(device)

In [None]:
# Move the tensors built by the training-embedding cell to host memory.
# The old name `train_outputs` is never assigned in this notebook (NameError); the
# extraction cell produces `train_outputs_four` and `train_outputs_logits`.
train_outputs_four = train_outputs_four.to('cpu')
train_outputs_logits = train_outputs_logits.to('cpu')

In [None]:
# Regroup the flat per-segment rows into fixed-size groups of 30 and write them to disk.
# NOTE(review): 45000 groups of 30 rows are hard-coded; presumably 30 segments per
# original text pair — confirm against the dataset.
#train_outputs_last = torch.reshape(train_outputs_last, (7601, 30, 768))
#torch.save(train_outputs_last, "C:\\Users\\ivank\\Documents\\BERT_projects\\512_1_train_outputs_reshaped.pt")

train_outputs_four = train_outputs_four.reshape(45000, 30, 768)
torch.save(
    train_outputs_four,
    "C:\\Users\\ivank\\Documents\\BERT_projects\\512_4_train_outputs_reshaped.pt",
)

train_outputs_logits = train_outputs_logits.reshape(45000, 30, 2)
torch.save(
    train_outputs_logits,
    "C:\\Users\\ivank\\Documents\\BERT_projects\\512_train_outputs_logits_reshaped.pt",
)

In [None]:
#train_outputs = torch.reshape(train_outputs, (45000, 30, 768))
#torch.save(train_outputs,"C:\\Users\\ivank\\Documents\\BERT_projects\\512_1_train_outputs_reshaped.pt")

#train_outputs = torch.reshape(train_outputs, (45000, 30, 2))
#torch.save(train_outputs, "C:\\Users\\ivank\\Documents\\BERT_projects\\new_train_outputs_logits_reshaped.pt")

#train_outputs = torch.reshape(train_outputs, (45000, 30, 768))
#torch.save(train_outputs, "C:\\Users\\ivank\\Documents\\BERT_projects\\Embeddings\\Untuned\\final_train_outputs_reshaped.pt")


In [None]:
# Extract both embedding variants and the logits for every item of the evaluation loader.
#
# Per-batch results are collected in lists and concatenated once after the loop.
# Calling torch.cat on a growing tensor inside the loop re-copies everything
# accumulated so far on every batch, which is quadratic in the number of batches.
_last_chunks = []
_four_chunks = []
_logit_chunks = []

print(len(eval_dataloader))
model4.eval()  # same inference mode as the training-set extraction cell
with torch.no_grad():
    for i, batch in enumerate(eval_dataloader):
        if i % 100 == 0:
            print(i)

        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model4(**batch, output_hidden_states=True)

        # Last hidden-state tensor, position 0 of every sequence.
        # [:,0,:] reads: all sequences, first token, all hidden units.
        cls = outputs.hidden_states[-1][:,0,:]
        # clone() so the stored chunk does not keep the full hidden-state tensor alive.
        _last_chunks.append(cls.clone())

        # Element-wise sum of the last four hidden-state tensors, same position.
        four = torch.stack(outputs.hidden_states[-4:]).sum(0)
        cls_four = four[:,0,:]
        _four_chunks.append(cls_four.clone())

        logits = outputs.logits
        _logit_chunks.append(logits)

# An empty loader still yields empty tensors on `device`, as before.
eval_outputs_last = torch.cat(_last_chunks, 0) if _last_chunks else torch.Tensor().to(device)
eval_outputs_four = torch.cat(_four_chunks, 0) if _four_chunks else torch.Tensor().to(device)
eval_outputs_logits = torch.cat(_logit_chunks, 0) if _logit_chunks else torch.Tensor().to(device)

In [None]:
# Regroup the flat per-segment rows into 7601 groups of 30 and write each tensor to disk.
# NOTE(review): the group count and size are hard-coded — confirm they match the evaluation set.
eval_outputs_last = eval_outputs_last.reshape(7601, 30, 768)
torch.save(
    eval_outputs_last,
    "C:\\Users\\ivank\\Documents\\BERT_projects\\512_1_eval_outputs_reshaped.pt",
)

eval_outputs_four = eval_outputs_four.reshape(7601, 30, 768)
torch.save(
    eval_outputs_four,
    "C:\\Users\\ivank\\Documents\\BERT_projects\\512_4_eval_outputs_reshaped.pt",
)

eval_outputs_logits = eval_outputs_logits.reshape(7601, 30, 2)
torch.save(
    eval_outputs_logits,
    "C:\\Users\\ivank\\Documents\\BERT_projects\\512_eval_outputs_logits_reshaped.pt",
)

In [None]:
# Extract the last-layer embedding for every item of `eval_dataloader`.
#
# Per-batch results are collected in a list and concatenated once after the loop.
# Calling torch.cat on a growing tensor inside the loop re-copies everything
# accumulated so far on every batch, which is quadratic in the number of batches.
_chunks = []

print(len(eval_dataloader))
model4.eval()  # same inference mode as the training-set extraction cell
with torch.no_grad():
    for i, batch in enumerate(eval_dataloader):
        if i % 100 == 0:
            print(i)

        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model4(**batch, output_hidden_states=True)
        # Last hidden-state tensor, position 0 of every sequence.
        # [:,0,:] reads: all sequences, first token, all hidden units.
        cls = outputs.hidden_states[-1][:,0,:]
        # clone() so the stored chunk does not keep the full hidden-state tensor alive.
        _chunks.append(cls.clone())
        # Disabled variant: collect the logits instead of the embeddings.
        #logits = outputs.logits
        #_chunks.append(logits)

# An empty loader still yields an empty tensor on `device`, as before.
eval_outputs = torch.cat(_chunks, 0) if _chunks else torch.Tensor().to(device)
        #logits = outputs.logits
        #eval_outputs = torch.cat((eval_outputs, logits), 0)

In [None]:
# Regroup the flat rows of `eval_outputs` into 7601 groups of 30 and write the result to disk.
# Disabled variant for logits:
#eval_outputs = torch.reshape(eval_outputs, (4734, 30, 2))
#torch.save(eval_outputs, "C:\\Users\\ivank\\Documents\\BERT_projects\\new_eval_outputs_logits_reshaped.pt")

eval_outputs = eval_outputs.reshape(7601, 30, 768)
torch.save(
    eval_outputs,
    "C:\\Users\\ivank\\Documents\\BERT_projects\\ffinal_eval_outputs_reshaped.pt",
)

In [None]:
# Regroup `eval_outputs` into 5261 groups of 30 and save it as the validation embeddings.
# NOTE(review): torch.reshape needs the element count to match, so this cannot succeed on the
# same tensor that was reshaped to (7601, 30, 768) in the previous cell. Presumably
# `eval_outputs` must first be recomputed over the validation set — confirm.
#valid_outputs = torch.reshape(eval_outputs, (5261, 30, 2))
#torch.save(valid_outputs, "C:\\Users\\ivank\\Documents\\BERT_projects\\new_validation_outputs_logits_reshaped.pt")

valid_outputs = torch.reshape(eval_outputs, (5261, 30, 768))
torch.save(valid_outputs, "C:\\Users\\ivank\\Documents\\BERT_projects\\new_validation_outputs_reshaped.pt")

## Reshaping and saving labels for Train and Eval

In [None]:
# Collapse the per-segment labels to one label per group of 30 rows by keeping
# the first label of each group, then write both label tensors to disk.
# NOTE(review): assumes all 30 rows of a group share the same label — confirm.
train_output_labels = torch.reshape(small_train_dataset['labels'], (45000, 30))[:, 0]
eval_output_labels = torch.reshape(small_eval_dataset['labels'], (7601, 30))[:, 0]

torch.save(
    eval_output_labels,
    "C:\\Users\\ivank\\Documents\\BERT_projects\\512_eval_output_labels.pt",
)
torch.save(
    train_output_labels,
    "C:\\Users\\ivank\\Documents\\BERT_projects\\512_train_output_labels.pt",
)

In [None]:
# Display the first 100 collapsed evaluation labels.
eval_output_labels[0:100]

## Reshaping and saving labels for Validation sets

In [None]:
# Collapse the validation labels to one per group of 30 rows (first label of each group) and save them.
# NOTE(review): `small_validation_dataset` is only loaded in a commented-out line earlier in
# the notebook, so this cell raises NameError unless that line is re-enabled.
valid_output_labels = small_validation_dataset['labels']
valid_output_labels = torch.reshape(valid_output_labels, (5261, 30))
valid_output_labels = valid_output_labels[:,0]
torch.save(valid_output_labels, "C:\\Users\\ivank\\Documents\\BERT_projects\\new_validation_output_labels.pt")

# Saves `eval_outputs` regrouped as (5261, 30, 2), i.e. it expects `eval_outputs` to hold
# two values per row at this point.
# NOTE(review): presumably logits recomputed over the validation set — confirm.
valid_outputs = torch.reshape(eval_outputs, (5261, 30, 2))
torch.save(valid_outputs, "C:\\Users\\ivank\\Documents\\BERT_projects\\valid_outputs_logits_reshaped.pt")

In [None]:
# Collapse the large validation set's labels to one per group of 30 rows and save them.
# NOTE(review): `large_validation_dataset` is only loaded in a commented-out line earlier in
# the notebook, so this cell raises NameError unless that line is re-enabled.
large_valid_output_labels = large_validation_dataset['labels']
large_valid_output_labels = torch.reshape(large_valid_output_labels, (10000, 30))
large_valid_output_labels = large_valid_output_labels[:,0]
torch.save(large_valid_output_labels, "C:\\Users\\ivank\\Documents\\BERT_projects\\new_large_validation_output_labels.pt")

# Saves `eval_outputs` regrouped as (10000, 30, 768).
# NOTE(review): presumably `eval_outputs` must first be recomputed over the large
# validation set so the element count matches — confirm.
large_valid_outputs = torch.reshape(eval_outputs, (10000, 30, 768))
torch.save(large_valid_outputs, "C:\\Users\\ivank\\Documents\\BERT_projects\\new_large_validation_outputs_reshaped.pt")

## Unsqueezed labels

In [None]:
# Save the evaluation labels as they come from the dataset, one per row, with no regrouping.
# Rebinds `eval_output_labels`, replacing the collapsed labels computed earlier.
eval_output_labels = small_eval_dataset['labels']
#eval_output_labels = eval_output_labels.float().to('cpu')
torch.save(eval_output_labels, "C:\\Users\\ivank\\Documents\\BERT_projects\\unsqueezed_eval_output_labels.pt")

In [None]:
# Save the validation labels as they come from the dataset, one per row, with no regrouping.
# NOTE(review): `small_validation_dataset` is only loaded in a commented-out line earlier in
# the notebook — confirm it is defined before running.
valid_output_labels = small_validation_dataset['labels']
torch.save(valid_output_labels, "C:\\Users\\ivank\\Documents\\BERT_projects\\unsqueezed_validation_output_labels.pt")

In [None]:
# Save the training labels as they come from the dataset, one per row, with no regrouping.
# Rebinds `train_output_labels`, replacing the collapsed labels computed earlier.
train_output_labels = small_train_dataset['labels']
#train_output_labels = train_output_labels.float().to('cpu')
torch.save(train_output_labels, "C:\\Users\\ivank\\Documents\\BERT_projects\\unsqueezed_train_output_labels.pt")