## Data loading and pipelines for different formats of prediction output

In [None]:
import json
import pandas as pd
import numpy as np
import datasets
import transformers
import torch
import torch.nn as nn
import torch.utils.data
import warnings
from torch.utils.data import DataLoader

warnings.filterwarnings('ignore')

# following function is adopted from bert4keras package (https://github.com/bojone/bert4keras)
# we do not import this package to avoid compatibility issues (it requires keras < 2.3.1, while a later version is already in use)
# if bert4keras package is already installed, this function can be loaded as follows:
# from bert4keras.snippets import text_segmentate

def text_segmentate(text, maxlen, seps='\n', strips=None):
    """Split text into short segments at punctuation marks.

    The text is split on the first character of `seps`; pieces are glued back
    together (separator included) until adding the next piece would exceed
    `maxlen`, and every glued chunk is split again with the remaining
    separators. Once the separators are exhausted a chunk is returned as is,
    so a segment can still be longer than `maxlen`.
    """
    text = text.strip().strip(strips)

    # nothing to split on, or already short enough
    if not seps or len(text) <= maxlen:
        return [text]

    sep, remaining_seps = seps[0], seps[1:]
    pieces = text.split(sep)
    last_index = len(pieces) - 1

    segments = []
    buffer = ''
    for index, piece in enumerate(pieces):
        # flush the buffer before it would grow past the limit
        if buffer and piece and len(buffer) + len(piece) > maxlen - 1:
            segments.extend(text_segmentate(buffer, maxlen, remaining_seps, strips))
            buffer = ''
        # the separator is re-attached to every piece except the last one
        buffer += piece if index == last_index else piece + sep

    if buffer:
        segments.extend(text_segmentate(buffer, maxlen, remaining_seps, strips))
    return segments


# following function is adopted from https://github.com/Pzeyang/task-for-authorship-verification
# a custom version tailored to our project will be added later

def get_data(jsonl_dataset_path):
    """
    Get data from JSONL dataset. Used in plain_pipeline and pipeline

    Every non-blank line must be a JSON object with a two-element 'pair' of
    texts and an 'id'. Each text is split into segments of roughly 510
    characters at '.', '?' and '!'. A text with fewer than 30 segments has
    its segment list repeated 30 times so that at least 30 are available.

    Returns a list of (segments_of_text1, segments_of_text2, id_as_string) tuples.
    """

    datas = []

    # JSON is UTF-8 by specification; do not depend on the platform default encoding
    with open(jsonl_dataset_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            data = json.loads(line)
            text1 = text_segmentate(data['pair'][0], maxlen=510, seps='.?!')
            text2 = text_segmentate(data['pair'][1], maxlen=510, seps='.?!')
            # repeat short segment lists so that at least 30 segments exist
            if len(text1) < 30:
                text1 = text1 * 30
            if len(text2) < 30:
                text2 = text2 * 30
            datas.append((text1, text2, str(data['id'])))

    return datas

# different data extractors for different types of input. See description to find in which pipeline each one should be used

def get_data_from_two_textfiles(text1_path, text2_path):
    """
    Read two text files, one fragment per file, and return them as a one-element
    list holding the pair of segment lists. Used in pipeline
    """

    print("Getting data from raw texts")

    with open(text1_path, 'r') as first_file, open(text2_path, 'r') as second_file:
        first_raw, second_raw = first_file.read(), second_file.read()
        first_segments = text_segmentate(first_raw, maxlen=510, seps='.?!')
        second_segments = text_segmentate(second_raw, maxlen=510, seps='.?!')

    # a fragment with fewer than 30 segments gets its segment list repeated 30 times
    while len(first_segments) < 30:
        first_segments = [sent for _ in range(30) for sent in first_segments]
    while len(second_segments) < 30:
        second_segments = [sent for _ in range(30) for sent in second_segments]

    return [(first_segments, second_segments)]

def get_data_from_single_textfile(text_path):
    """
    Read one text file that holds two fragments joined by the "$&*&*&$" separator
    and return them as a one-element list holding the pair of segment lists.
    Currently not used
    """

    print("Getting data from single raw text file")

    with open(text_path, 'r') as combined_file:
        content = combined_file.read()
        # exactly one separator is expected, otherwise the unpacking raises ValueError
        first_raw, second_raw = content.split("$&*&*&$")
        first_segments = text_segmentate(first_raw, maxlen=510, seps='.?!')
        second_segments = text_segmentate(second_raw, maxlen=510, seps='.?!')

    # a fragment with fewer than 30 segments gets its segment list repeated 30 times
    while len(first_segments) < 30:
        first_segments = [sent for _ in range(30) for sent in first_segments]
    while len(second_segments) < 30:
        second_segments = [sent for _ in range(30) for sent in second_segments]

    return [(first_segments, second_segments)]

def get_data_from_combined_texts(text_or_list):
    """
    Segment one combined text (two fragments joined by the "$&*&*&$" separator)
    or a list of such combined texts. Used in pipeline_onetext. The ONLY type
    of data processor for LIME inputs

    Returns a list with one (segments_of_fragment1, segments_of_fragment2)
    tuple per combined text.
    """

    print("Getting data from raw text")
    print(type(text_or_list), len(text_or_list))

    # treat a single string as a one-element batch
    variants = [text_or_list] if isinstance(text_or_list, str) else text_or_list

    datas = []
    for combined in variants:
        first_raw, second_raw = combined.split("$&*&*&$")
        first_segments = text_segmentate(first_raw, maxlen=510, seps='.?!')
        second_segments = text_segmentate(second_raw, maxlen=510, seps='.?!')

        # a fragment with fewer than 30 segments gets its segment list repeated 30 times
        while len(first_segments) < 30:
            first_segments = [sent for _ in range(30) for sent in first_segments]
        while len(second_segments) < 30:
            second_segments = [sent for _ in range(30) for sent in second_segments]

        datas.append((first_segments, second_segments))
    return datas


# Shared tokenizer, created once at module level so tokenize_function does not reload it on every call
tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(example):
    """
    Tokenize the first 30 segments of column '0' paired with the first 30
    segments of column '1', truncating/padding every pair to 255 tokens.
    """
    first_segments = example['0'][:30]
    second_segments = example['1'][:30]
    return tokenizer(
        first_segments,
        second_segments,
        truncation=True,
        padding='max_length',
        max_length=255,
    )

# used to create text input for pipelines and explainers
def combine_texts(index, write=False):
    """
    Join the two texts of pair number `index` in the global `orig_data` with the
    "$&*&*&$" separator and return the resulting single text.

    With write=True the combined text is also saved as "textcomb<index>.txt".
    """

    pair = orig_data['pair'][index]
    text_combined = pair[0] + "$&*&*&$" + pair[1]

    if not write:
        return text_combined

    name = "textcomb{}.txt".format(index)
    with open(name, 'w') as out_file:
        out_file.write(text_combined)
    return text_combined

## Initialize a final classifier (identical to FinalNetAvg in Final_model_PT)

In [None]:
class FinalNetAvg(nn.Module):
    """
    Final classifier: adaptive average pooling to a (1, 768) map, followed by
    a small two-layer perceptron (768 -> 16 -> num_classes).
    """

    def __init__(self, num_classes=2):
        super().__init__()
        self.avgpool = nn.AdaptiveAvgPool2d((1, 768))
        # layers are created in the same order as they appear in the Sequential
        hidden_layer = nn.Linear(768, 16)
        output_layer = nn.Linear(16, num_classes)
        self.classifier = nn.Sequential(hidden_layer, nn.ReLU(), output_layer)

    def forward(self, x):
        pooled = self.avgpool(x)
        # keep dim 0, merge everything else into one feature axis
        features = torch.flatten(pooled, 1)
        return self.classifier(features)

## Get a standard JSONL dataset with 100 pairs and get corresponding truth labels

Create a standard JSONL with a set number of pairs

In [None]:
# Draw one random pair from the full training file and save it as a one-line JSONL.
# Raw strings: the Windows paths contain "\p", which is an invalid escape sequence in a normal string literal.
df = pd.read_json(r"D:\pan20-authorship-verification-training-small\pan20-authorship-verification-training-small.jsonl", lines=True)
df = df.sample(n=1)
df.to_json(r"D:\pan20-authorship-verification-training-small\pan20-authorship-verification-training-small-one.jsonl", orient='records', lines=True)

In [None]:
# load the untokenized evaluation set from disk and convert it straight to a pandas DataFrame
from datasets import load_from_disk
df = load_from_disk("C:\\Users\\ivank\\Documents\\BERT_projects\\can_delete").to_pandas()

In [None]:
# Text pairs to explain and the truth file they are later matched against.
# Raw strings: the Windows paths contain "\p", which is an invalid escape sequence in a normal string literal.
orig_data = pd.read_json(r"D:\pan20-authorship-verification-training-small\pan20-authorship-verification-training-small-verysmall.jsonl", lines=True)
all_trues = pd.read_json(r"D:\pan20-authorship-verification-training-small\pan20-authorship-verification-training-small-truth.jsonl", lines=True)

In [None]:
# keep only the truth rows whose id occurs in orig_data (inner join on 'id')
trues = orig_data.merge(all_trues, on=['id'], how='inner')
# turn the 'same' column into integers and expose it as the label array
trues['same'] = trues['same'].astype(int)
labels = trues['same'].array

#np.equal(predictions, labels)

In [None]:
# Persist the merged pairs-plus-labels frame to CSV (without the index column)
trues.to_csv("100 examples to explain.csv", index=False)

## Get input data in custom format (otherwise use combine_text function)

In [None]:
"""
Create a pair of texts from dataset
"""

# Write the two texts of the first pair in orig_data to separate files, one text per file
with open("text3.txt", 'w') as text1, open("text4.txt", 'w') as text2:
    text1.write(orig_data['pair'][0][0])
    text2.write(orig_data['pair'][0][1])

In [None]:
"""
Combine a pair of texts from files with a separator and turn into a single text
"""

# Read both text files back and join them with "$&*&*&$", the separator the
# get_data_from_combined_* helpers split on; the result is kept in text_combined and saved to textcomb2.txt
with open("text3.txt", 'r') as text1, open("text4.txt", 'r') as text2, open("textcomb2.txt", 'w') as textcomb:
    text_combined = text1.read() + "$&*&*&$" + text2.read()
    textcomb.write(text_combined)


In [None]:
def combine_segments_from_pd(textindex, segmentindex, sep_option=0, write=False):
    """
    Join segment number `segmentindex` of columns '0' and '1' of row `textindex`
    in the global DataFrame `df` into a single text.

    sep_option 0 joins with "$&*&*&$", any other value joins with "[SEP]".
    With write=True the result is also saved as "textcomb<textindex>_<segmentindex>.txt".
    """

    first_segment = df['0'][textindex][segmentindex]
    second_segment = df['1'][textindex][segmentindex]

    if sep_option == 0:
        separator = "$&*&*&$"
    else:
        separator = "[SEP]"
    text_combined = first_segment + separator + second_segment

    if not write:
        return text_combined

    name = "textcomb{}_{}.txt".format(textindex, segmentindex)
    with open(name, 'w') as out_file:
        out_file.write(text_combined)
    return text_combined

# Pipelines

In [None]:
# raw string: "\p" in the Windows path is an invalid escape sequence in a normal string literal
data_path = r"D:\pan20-authorship-verification-training-small\pan20-authorship-verification-training-small-one.jsonl"

def plain_pipeline(data_path):
    """
    Pipeline for input from a regular JSONL dataset

    Steps: segment every text pair with get_data, tokenize the first 30
    segment pairs of each example, take the last-layer [CLS] vector of every
    segment pair from the fine-tuned checkpoint, and classify each stack of
    30 vectors with FinalNetAvg.

    Args:
        data_path: path to a JSONL file readable by get_data.

    Returns:
        numpy array with one predicted class index per text pair.
    """

    segmented_data = get_data(data_path)
    dataset = datasets.Dataset.from_pandas(pd.DataFrame(segmented_data))
    del segmented_data

    print("Tokenization...")

    # tokenize_function relies on the tokenizer that is initialized once at module level
    tokenized_dataset = dataset.map(tokenize_function)
    tokenized_dataset = tokenized_dataset.remove_columns(['0', '1', '2'])

    # each example holds a list of tokenized segment pairs; explode so that one row is one segment pair
    flat_dataset = tokenized_dataset.to_pandas()
    flat_dataset = flat_dataset.explode(['input_ids', 'token_type_ids', "attention_mask"]).reset_index(drop=True)
    dataset = datasets.Dataset.from_pandas(flat_dataset)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    torch.cuda.empty_cache()

    model_feature_extract = transformers.AutoModelForSequenceClassification.from_pretrained(r"C:\Users\ivank\Documents\BERT_projects\results2\checkpoint-225000")
    model_feature_extract.to(device)
    # make inference mode explicit, as the other pipelines in this file do
    model_feature_extract.eval()
    print("Obtaining embeddings...")

    dataset.set_format('torch')
    # batch_size=30 with shuffle=False: one batch is the 30 segment pairs of one text pair
    eval_dataloader = DataLoader(dataset, shuffle=False, batch_size=30)

    # collect per-batch results and concatenate once instead of re-copying a growing tensor every step
    cls_batches = []
    with torch.no_grad():
        for i, batch in enumerate(eval_dataloader):
            if i % 10 == 0:
                print(">{} processing batch {}/{}".format(i//10*">", i, len(eval_dataloader)))

            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model_feature_extract(**batch, output_hidden_states=True)
            # last hidden layer, first ([CLS]) token of every sequence, full hidden vector
            cls_batches.append(outputs.hidden_states[-1][:, 0, :])

    eval_outputs = torch.cat(cls_batches, 0) if cls_batches else torch.Tensor().to(device)
    eval_outputs = torch.reshape(eval_outputs, (len(eval_dataloader), 30, 768))

    print("Making predictions...")

    model_classify = FinalNetAvg()
    # map_location keeps the load working on the CPU fallback even if the weights were saved from a GPU
    model_classify.load_state_dict(torch.load(r"C:\Users\ivank\Documents\BERT_projects\Classifier\model.pth", map_location=device))
    model_classify.to(device)

    model_classify.eval()
    with torch.no_grad():
        logits = model_classify(eval_outputs)
    predictions = torch.argmax(logits, dim=-1)
    print("Done!")
    return predictions.cpu().numpy()

## Pipeline with input either from regular JSONL dataset (1 argument) or from a pair of texts (2 arguments)

In [None]:
# raw string: "\p" in the Windows path is an invalid escape sequence in a normal string literal
data_path = r"D:\pan20-authorship-verification-training-small\pan20-authorship-verification-training-small-one.jsonl"

def pipeline(data_path, data_path2=None, mode='probs'):
    """
    Pipeline with input either from regular JSONL dataset (1 argument) or from a pair of texts (2 arguments)

    Args:
        data_path: path to a JSONL dataset, or to the first text file when data_path2 is given.
        data_path2: optional path to the second text file.
        mode: 'labels' returns the predicted class index, 'probs' the softmax
            probabilities rounded to 3 decimals, any other value the raw classifier logits.

    Returns:
        list with one numpy array per text pair.

    Side effects:
        stores the flattened tokenized DataFrame in the global `datacheck`.
    """
    global datacheck

    segmented_data = get_data_from_two_textfiles(data_path, data_path2) if data_path2 else get_data(data_path)
    dataset = datasets.Dataset.from_pandas(pd.DataFrame(segmented_data))
    del segmented_data

    print("Tokenization...")

    # tokenize_function relies on the tokenizer that is initialized once at module level
    tokenized_dataset = dataset.map(tokenize_function)

    flat_dataset = tokenized_dataset.to_pandas()
    flat_dataset = flat_dataset.drop(['0', '1'], axis=1)
    if '2' in flat_dataset:  # we may or may not have this column depending on the input type
        flat_dataset = flat_dataset.drop(['2'], axis=1)
    # each example holds a list of tokenized segment pairs; explode so that one row is one segment pair
    flat_dataset = flat_dataset.explode(['input_ids', 'token_type_ids', "attention_mask"]).reset_index(drop=True)
    dataset = datasets.Dataset.from_pandas(flat_dataset)

    datacheck = flat_dataset

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    torch.cuda.empty_cache()

    model_feature_extract = transformers.AutoModelForSequenceClassification.from_pretrained(r"C:\Users\ivank\Documents\BERT_projects\results2\checkpoint-225000")
    model_feature_extract.to(device)
    print("Obtaining embeddings...")

    dataset.set_format('torch')
    # batch_size=30 with shuffle=False: one batch is the 30 segment pairs of one text pair
    eval_dataloader = DataLoader(dataset, shuffle=False, batch_size=30)

    # collect per-batch results and concatenate once instead of re-copying a growing tensor every step
    cls_batches = []
    model_feature_extract.eval()
    with torch.no_grad():
        for i, batch in enumerate(eval_dataloader):
            if i % 10 == 0:
                print(">{} processing batch {}/{}".format(i//10*">", i, len(eval_dataloader)))

            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model_feature_extract(**batch, output_hidden_states=True)
            # last hidden layer, first ([CLS]) token of every sequence, full hidden vector
            cls_batches.append(outputs.hidden_states[-1][:, 0, :])

    eval_outputs = torch.cat(cls_batches, 0) if cls_batches else torch.Tensor().to(device)
    eval_outputs = torch.reshape(eval_outputs, (len(eval_dataloader), 30, 768))

    print("Making predictions...")

    model_classify = FinalNetAvg()
    # map_location keeps the load working on the CPU fallback even if the weights were saved from a GPU
    model_classify.load_state_dict(torch.load(r"C:\Users\ivank\Documents\BERT_projects\Classifier\model.pth", map_location=device))
    model_classify.to(device)

    model_classify.eval()
    with torch.no_grad():
        logits = model_classify(eval_outputs)

    predictions = []
    for prediction in logits:
        if mode == 'labels':
            prediction = torch.argmax(prediction, dim=-1).cpu().numpy()
        elif mode == 'probs':
            # explicit dim instead of relying on the implicit dimension choice of nn.Softmax()
            prediction = torch.softmax(prediction, dim=-1).cpu().numpy()
            prediction = np.around(prediction, decimals=3)
        else:
            prediction = prediction.cpu().numpy()
        predictions.append(prediction)
    print("Done!")
    return predictions

In [None]:
# Two-argument form of pipeline: each file holds one text; mode='probs' yields softmax probabilities rounded to 3 decimals
predictions_from_pair = pipeline("text1.txt", "text2.txt", mode='probs')

Getting data from raw text
Tokenization...


1ex [00:00, 333.33ex/s]


Obtaining embeddings...
> processing batch 0/1
Making predictions...
torch.Size([1, 2])
prediction:  tensor([ 8.3187, -4.1354], device='cuda:0')
Done!


In [None]:
# One-argument form of pipeline: the path points to a JSONL dataset.
# raw string: "\p" in the Windows path is an invalid escape sequence in a normal string literal
predictions_from_jsonl = pipeline(r"D:\pan20-authorship-verification-training-small\pan20-authorship-verification-training-small-one.jsonl", mode='probs')

Tokenization...


1ex [00:00, 332.96ex/s]


Obtaining embeddings...
> processing batch 0/1
Making predictions...
Done!


## Pipeline with input from a combined text or a list of combined texts

In [None]:
def pipeline_onetext(data_path, mode='probs'):
    """
    Pipeline with input from a combined text or a list of combined texts

    Args:
        data_path: despite its name, not a path: a combined text (two fragments
            joined by "$&*&*&$") or a list of such texts, as accepted by
            get_data_from_combined_texts.
        mode: 'labels' returns the predicted class index, 'probs' the softmax
            probabilities rounded to 3 decimals, any other value the raw classifier logits.

    Returns:
        numpy array with one row (one entry for 'labels') per combined text.

    Side effects:
        stores the flattened tokenized DataFrame in the global `datacheck`
        and the (n_texts, 30, 768) embedding tensor in the global `embedding`.
    """
    global datacheck
    global embedding

    segmented_data = get_data_from_combined_texts(data_path)
    dataset = datasets.Dataset.from_pandas(pd.DataFrame(segmented_data))
    del segmented_data

    print("Tokenization...")

    # tokenize_function relies on the tokenizer that is initialized once at module level
    tokenized_dataset = dataset.map(tokenize_function)

    flat_dataset = tokenized_dataset.to_pandas()
    flat_dataset = flat_dataset.drop(['0', '1'], axis=1)
    if '2' in flat_dataset:  # we may or may not have this column depending on the input type
        flat_dataset = flat_dataset.drop(['2'], axis=1)
    # each example holds a list of tokenized segment pairs; explode so that one row is one segment pair
    flat_dataset = flat_dataset.explode(['input_ids', 'token_type_ids', "attention_mask"]).reset_index(drop=True)
    dataset = datasets.Dataset.from_pandas(flat_dataset)

    datacheck = flat_dataset

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    torch.cuda.empty_cache()

    model_feature_extract = transformers.AutoModelForSequenceClassification.from_pretrained(r"C:\Users\ivank\Documents\BERT_projects\BERT\results_45000\checkpoint-225000")
    model_feature_extract.to(device)
    print("Obtaining embeddings...")

    dataset.set_format('torch')
    # batch_size=30 with shuffle=False: one batch is the 30 segment pairs of one combined text
    eval_dataloader = DataLoader(dataset, shuffle=False, batch_size=30)

    n_batches = len(eval_dataloader)
    # progress is reported every 10 batches for short runs and every 100 for long ones
    step = 10 if n_batches < 100 else 100

    # collect per-batch results and concatenate once instead of re-copying a growing tensor every step
    cls_batches = []
    model_feature_extract.eval()
    with torch.no_grad():
        for i, batch in enumerate(eval_dataloader):
            print(batch)
            if i % step == 0:
                print(">{} processing item {}/{}".format(int((i/n_batches)*10)*">", i, n_batches))

            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model_feature_extract(**batch, output_hidden_states=True)
            # last hidden layer, first ([CLS]) token of every sequence, full hidden vector
            cls_batches.append(outputs.hidden_states[-1][:, 0, :])

    eval_outputs = torch.cat(cls_batches, 0) if cls_batches else torch.Tensor().to(device)
    eval_outputs = torch.reshape(eval_outputs, (n_batches, 30, 768))

    # Save the text embedding for future analysis
    embedding = eval_outputs

    print("Making predictions...")

    model_classify = FinalNetAvg()
    # map_location keeps the load working on the CPU fallback even if the weights were saved from a GPU
    model_classify.load_state_dict(torch.load(r"C:\Users\ivank\Documents\BERT_projects\Classifier\model.pth", map_location=device))
    model_classify.to(device)

    model_classify.eval()
    with torch.no_grad():
        logits = model_classify(eval_outputs)

    predictions = []
    for prediction in logits:
        if mode == 'labels':
            prediction = torch.argmax(prediction, dim=-1).cpu().numpy()
        elif mode == 'probs':
            # explicit dim instead of relying on the implicit dimension choice of nn.Softmax()
            prediction = torch.softmax(prediction, dim=-1).cpu().numpy()
            prediction = np.around(prediction, decimals=3)
        else:
            prediction = prediction.cpu().numpy()
        predictions.append(prediction)
    print("Done!")
    return np.array(predictions)

In [None]:
def get_data_from_combined_segments(text_or_list):
    """
    Split one combined text (two segments joined by the "$&*&*&$" separator),
    or every text of a list of such combined texts, into its two segments.
    No further segmentation is applied. Used in pipeline_onesegment.

    Returns a list with one ([segment1], [segment2]) tuple per combined text.
    """

    # treat a single string as a one-element batch
    variants = [text_or_list] if isinstance(text_or_list, str) else text_or_list

    # each combined text must contain the separator exactly once,
    # otherwise the two-name unpacking below raises ValueError
    split_variants = (variant.split("$&*&*&$") for variant in variants)
    return [([first], [second]) for first, second in split_variants]

def pipeline_onesegment(data_path, mode='probs'):
    """
    Segment-level pipeline with input from a combined text or a list of combined texts

    Every combined text is split into its two segments by
    get_data_from_combined_segments, and the output of the fine-tuned
    sequence-classification model itself is used (no FinalNetAvg on top).

    Args:
        data_path: despite its name, not a path: a combined text (two segments
            joined by "$&*&*&$") or a list of such texts.
        mode: 'labels' returns the predicted class index, 'probs' the softmax
            probabilities rounded to 3 decimals, any other value the raw logits.

    Returns:
        numpy array with one row (one entry for 'labels') per combined text.

    Side effects:
        stores the flattened tokenized DataFrame in the global `datacheck`
        and the DataLoader in the global `eval_dataloader`.
    """
    global datacheck
    global eval_dataloader

    segmented_data = get_data_from_combined_segments(data_path)
    dataset = datasets.Dataset.from_pandas(pd.DataFrame(segmented_data))
    del segmented_data

    # tokenize_function relies on the tokenizer that is initialized once at module level
    tokenized_dataset = dataset.map(tokenize_function)

    flat_dataset = tokenized_dataset.to_pandas()
    flat_dataset = flat_dataset.drop(['0', '1'], axis=1)
    if '2' in flat_dataset:  # we may or may not have this column depending on the input type
        flat_dataset = flat_dataset.drop(['2'], axis=1)
    # each example holds a list of tokenized segment pairs; explode so that one row is one segment pair
    flat_dataset = flat_dataset.explode(['input_ids', 'token_type_ids', "attention_mask"]).reset_index(drop=True)
    dataset = datasets.Dataset.from_pandas(flat_dataset)

    datacheck = flat_dataset

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    torch.cuda.empty_cache()

    model_feature_extract = transformers.AutoModelForSequenceClassification.from_pretrained(r"C:\Users\ivank\Documents\BERT_projects\BERT\results_45000\checkpoint-225000")
    model_feature_extract.to(device)

    dataset.set_format('torch')
    eval_dataloader = DataLoader(dataset, shuffle=False, batch_size=30)

    # collect per-batch logits and concatenate once instead of re-copying a growing tensor every step
    logit_batches = []
    model_feature_extract.eval()
    with torch.no_grad():
        for batch in eval_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            # only the logits are read, so hidden states are not requested
            outputs = model_feature_extract(**batch)
            logit_batches.append(outputs.logits)

    logits = torch.cat(logit_batches, 0) if logit_batches else torch.Tensor().to(device)

    predictions = []
    for prediction in logits:
        if mode == 'labels':
            prediction = torch.argmax(prediction, dim=-1).cpu().numpy()
        elif mode == 'probs':
            # explicit dim instead of relying on the implicit dimension choice of nn.Softmax()
            prediction = torch.softmax(prediction, dim=-1).cpu().numpy()
            prediction = np.around(prediction, decimals=3)
        else:
            prediction = prediction.cpu().numpy()
        predictions.append(prediction)
    return np.array(predictions)

def get_data_from_listed_segments(text_or_list):
    """
    Wrap a pair of segments, given as a sequence whose first element is a
    string, or every pair of a list of such pairs, into the
    ([segment1], [segment2]) layout. Only the first two elements of a pair
    are used. Used in pipeline_twosegments.

    Returns a list with one ([segment1], [segment2]) tuple per pair.
    """

    print("Getting data from raw text")
    print(type(text_or_list), len(text_or_list))

    # a string in first position means the argument itself is one pair
    is_single_pair = isinstance(text_or_list[0], str)
    pairs = [text_or_list] if is_single_pair else text_or_list

    return [([pair[0]], [pair[1]]) for pair in pairs]

def pipeline_twosegments(data_path, mode='probs'):
    """
    Segment-level pipeline with input from a pair of segments or a list of such pairs

    The input is wrapped by get_data_from_listed_segments, and the output of
    the fine-tuned sequence-classification model itself is used (no
    FinalNetAvg on top).

    Args:
        data_path: despite its name, not a path: a pair of segments or a list of pairs.
        mode: 'labels' returns the predicted class index, 'probs' the softmax
            probabilities rounded to 3 decimals, any other value the raw logits.

    Returns:
        numpy array with one row (one entry for 'labels') per pair.

    Side effects:
        stores the flattened tokenized DataFrame in the global `datacheck`,
        the DataLoader in the global `eval_dataloader` and the raw logits
        tensor in the global `logits`.
    """
    global datacheck
    global eval_dataloader
    global logits

    print(data_path)

    segmented_data = get_data_from_listed_segments(data_path)
    dataset = datasets.Dataset.from_pandas(pd.DataFrame(segmented_data))
    del segmented_data

    print("Tokenization...")

    # tokenize_function relies on the tokenizer that is initialized once at module level
    tokenized_dataset = dataset.map(tokenize_function)

    flat_dataset = tokenized_dataset.to_pandas()
    flat_dataset = flat_dataset.drop(['0', '1'], axis=1)
    if '2' in flat_dataset:  # we may or may not have this column depending on the input type
        flat_dataset = flat_dataset.drop(['2'], axis=1)
    # each example holds a list of tokenized segment pairs; explode so that one row is one segment pair
    flat_dataset = flat_dataset.explode(['input_ids', 'token_type_ids', "attention_mask"]).reset_index(drop=True)
    dataset = datasets.Dataset.from_pandas(flat_dataset)

    datacheck = flat_dataset

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    torch.cuda.empty_cache()

    model_feature_extract = transformers.AutoModelForSequenceClassification.from_pretrained(r"C:\Users\ivank\Documents\BERT_projects\BERT\results_45000\checkpoint-225000")
    model_feature_extract.to(device)
    print("Obtaining embeddings...")

    dataset.set_format('torch')
    eval_dataloader = DataLoader(dataset, shuffle=False, batch_size=30)

    n_batches = len(eval_dataloader)
    # progress is reported every 10 batches for short runs and every 100 for long ones
    step = 10 if n_batches < 100 else 100

    # collect per-batch logits and concatenate once instead of re-copying a growing tensor every step
    logit_batches = []
    model_feature_extract.eval()
    with torch.no_grad():
        for i, batch in enumerate(eval_dataloader):
            if i % step == 0:
                print(">{} processing item {}/{}".format(int((i/n_batches)*10)*">", i, n_batches))

            batch = {k: v.to(device) for k, v in batch.items()}
            # only the logits are read, so hidden states are not requested
            outputs = model_feature_extract(**batch)
            logit_batches.append(outputs.logits)

    # kept in a global for later inspection
    logits = torch.cat(logit_batches, 0) if logit_batches else torch.Tensor().to(device)

    predictions = []
    for prediction in logits:
        if mode == 'labels':
            prediction = torch.argmax(prediction, dim=-1).cpu().numpy()
        elif mode == 'probs':
            # explicit dim instead of relying on the implicit dimension choice of nn.Softmax()
            prediction = torch.softmax(prediction, dim=-1).cpu().numpy()
            prediction = np.around(prediction, decimals=3)
        else:
            prediction = prediction.cpu().numpy()
        predictions.append(prediction)
    print("Done!")
    return np.array(predictions)

In [None]:
# Run the two-segment pipeline on one pair of segments; a mode other than 'labels'/'probs' (here 'logits') returns the raw logits
# NOTE(review): `segments` is not defined in the visible part of the notebook - confirm where it comes from
res = pipeline_twosegments([segments[0][0][0], segments[0][1][0]], mode='logits')

In [None]:
# Same kind of call through the combined-text entry point: "[SEP]" is swapped for "$&*&*&$", the separator pipeline_onesegment's loader splits on
# NOTE(review): `segm00` is not defined in the visible part of the notebook - presumably built with combine_segments_from_pd(..., sep_option=1); confirm
res = pipeline_onesegment(segm00.replace("[SEP]", "$&*&*&$"), mode='logits')
res[0]

Tokenization...


1ex [00:00, 499.86ex/s]


Obtaining embeddings...
> processing item 0/1
Done!


array([ 4.7861366, -4.759954 ], dtype=float32)

In [None]:
# NOTE(review): this subtracts res[0] from itself, so the result is always zero; presumably meant to compare
# the outputs of the two pipeline calls above, which both assign to `res` - confirm and keep them in separate variables
np.subtract(res[0], res[0])

array([0., 0.], dtype=float32)