In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange

In [129]:
#Load the data
df = pd.read_pickle("../data/sentence_labels")
input_df = df[['Sentence']].head().copy()
label_df = df[['Labels']].head().copy()

In [175]:
# Format the input
from keras.preprocessing.sequence import pad_sequences
from transformers import AutoTokenizer
import torch

#Load swedish tokenizer
tokenizer = AutoTokenizer.from_pretrained("KB/bert-base-swedish-cased-ner")

# Use wordpiece tokenization
input_df['Tokenized'] = input_df[['Sentence']].apply(lambda x: tokenizer.tokenize(x[0]), axis=1)

# Replace words with integers, add the special [CLS] and [SEP] tokens
input_df['Integerized'] = input_df[['Tokenized']].apply(lambda x: tokenizer.encode(x[0], add_special_tokens=True), axis=1)

# Pad and truncate all sentences so they are the same length
length = 50
input_df['Input'] = input_df[['Integerized']].apply(lambda x: pad_sequences(x, maxlen=length, dtype="long", truncating="post", padding="post")[0], axis=1)

# Add attention mask. Attention is 0 for padding, else 1
input_df['Attention_Mask'] = input_df[['Input']].apply(lambda x: (x[0] != 0).astype(int), axis=1)

# The model only accepts Tensor, so convert the Padded Input
#input_df['Input_Tensor'] = input_df[['Padded']].apply(lambda x: torch.tensor(x[0]).unsqueeze(0), axis=1)

# The model only accepts Tensor, so convert the Mask
#input_df['Attention_Tensor'] = input_df[['Attention_Mask']].apply(lambda x: torch.tensor(x[0]).unsqueeze(0), axis=1)

#Sanity checks for sentences
def check_sentence(index):
    for column in input_df:
        print(column)
        print(input_df[[column]].iloc[index][0])
        print()
    
#check_sentence(0)

In [177]:
# Format the Labels

# Replace words with integers, add the special [CLS] and [SEP] tokens
label_df['Integerized'] = label_df[['Labels']].apply(lambda x: tokenizer.encode(x[0], add_special_tokens=True), axis=1)




label_df

Unnamed: 0,Labels,Integerized
0,"[O, O, O, O, O, O, O, O, LOC, O, PRS, O, O, O,...","[2, 167, 167, 167, 167, 167, 167, 167, 167, 1,..."
1,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[2, 167, 167, 167, 167, 167, 167, 167, 167, 16..."
2,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, TME...","[2, 167, 167, 167, 167, 167, 167, 167, 167, 16..."
3,"[O, O, O, O, O, O, O, O, O, PRS, O, O, O, O, O...","[2, 167, 167, 167, 167, 167, 167, 167, 167, 16..."
4,"[O, O, O, O, O, LOC, O]","[2, 167, 167, 167, 167, 167, 1, 167, 3]"


In [171]:
#Load model
from transformers import AutoModel
model = AutoModel.from_pretrained('KB/bert-base-swedish-cased')

In [172]:
# Run model on a single sentence

# Convert the lists into tensors
index = 0
#Convert to torch tensors
data_tensor = torch.tensor(input_df[['Input']].iloc[index][0]).unsqueeze(0)
mask_tensor = torch.tensor(input_df[['Attention_Mask']].iloc[index][0]).unsqueeze(0)

embeddings = model.forward(input_ids=data_tensor,
    attention_mask=mask_tensor,
    head_mask=None)

print(embeddings[0].shape)

torch.Size([1, 50, 768])


In [173]:
# Run model on ALL sentences

# Convert the list of lists into tensors
data_matrix = torch.tensor([x[0] for x in input_df[['Input']].values])
mask_matrix = torch.tensor([x[0] for x in input_df[['Attention_Mask']].values])

embeddings = model.forward(input_ids=data_matrix,
    attention_mask=mask_matrix,
    head_mask=None)
print(embeddings[0].shape)

torch.Size([5, 50, 768])


In [None]:
#Add BIO-format to labels
#Add linear classifier