In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange

In [2]:
#Load the data
df = pd.read_pickle("../data/sentence_labels")

#Work on a subset for testing purposes
input_df = df[['Sentence']].head(20000).copy()
label_df = df[['Labels']].head(20000).copy()



## Formatting the input and labels

In [3]:
# Format the input
from keras.preprocessing.sequence import pad_sequences
from transformers import AutoTokenizer
import torch

Using TensorFlow backend.


In [4]:
#Load swedish tokenizer
tokenizer = AutoTokenizer.from_pretrained("KB/bert-base-swedish-cased-ner")

In [5]:
# Use WORDPIECE tokenization
#input_df['Tokenized'] = input_df[['Sentence']].apply(lambda x: tokenizer.tokenize(x[0]), axis=1)
# Ensure no named entities have been split apart

In [6]:
# use WORD tokenization
input_df['Tokenized'] = input_df[['Sentence']].apply(lambda x: x[0], axis=1)
label_df['Tokenized'] = label_df[['Labels']].apply(lambda x: x[0], axis=1)

In [7]:
# Replace words with integers, add the special [CLS] and [SEP] tokens
input_df['Integerized'] = input_df[['Tokenized']].apply(lambda x: tokenizer.encode(x[0], add_special_tokens=True), axis=1)
label_df['Integerized'] = label_df[['Tokenized']].apply(lambda x: tokenizer.encode(x[0], add_special_tokens=True), axis=1)

In [8]:
# Pad and truncate all sentences so they are the same length
length = 50
input_df['Input'] = input_df[['Integerized']].apply(lambda x: pad_sequences(x, maxlen=length, dtype="long", truncating="post", padding="post")[0], axis=1)
label_df['Output_Class'] = label_df[['Integerized']].apply(lambda x: pad_sequences(x, maxlen=length, dtype="long", truncating="post", padding="post")[0], axis=1)

In [9]:
# Add attention mask. Attention is 0 for padding, else 1
input_df['Attention_Mask'] = input_df[['Input']].apply(lambda x: (x[0] != 0).astype(int), axis=1)

In [10]:
#Sanity checks for Labels
def check_labels(index):
    for column in label_df:
        print(column)
        print(label_df[[column]].iloc[index][0])
        print(f"Length: {len(label_df[[column]].iloc[index][0])}")
        print()
    
# check_labels(0)

In [11]:
#Sanity checks for sentences
def check_sentence(index):
    for column in input_df:
        print(column)
        print(input_df[[column]].iloc[index][0])
        print(f"Length: {len(input_df[[column]].iloc[index][0])}")

        print()
    
# check_sentence(0)

In [12]:
# To prepare a single sentence
index = 0

#Convert lists torch tensors
data_tensor = torch.tensor(input_df[['Input']].iloc[index][0]).unsqueeze(0)
mask_tensor = torch.tensor(input_df[['Attention_Mask']].iloc[index][0]).unsqueeze(0)

#This doesn't need to be a tensor since the labels will be used with the linear classifier outside ALBERT
tags_output = label_df[['Output_Class']].iloc[index][0]

In [13]:
# To prepare all sentences

# Convert the list of lists into tensors
data_matrix = torch.tensor([x[0] for x in input_df[['Input']].values])
mask_matrix = torch.tensor([x[0] for x in input_df[['Attention_Mask']].values])

#This doesn't need to be a tensor since the labels will be used with the linear classifier outside ALBERT
tags_matrix = np.array([x[0] for x in label_df[['Output_Class']].values])


## Generate Embeddings

In [14]:
#Load model
from transformers import AutoModel
model = AutoModel.from_pretrained('KB/bert-base-swedish-cased')


In [15]:
"""
This function takes a single input sentence and mask and generates an embedding for all the tokens in the sentence, using the cpu
"""
def get_embeddings_with_cpu(data_tensor, mask_tensor):
    embeddings = model.forward(input_ids=data_tensor,
        attention_mask=mask_tensor,
        head_mask=None)
    print(embeddings[0].shape)
    
    return embeddings[0]
    

In [17]:
"""
Getting the embeddings for all the data with a cpu is possible.
But it takes a lot of time.
Using a GPU is faster.
But you have to be careful so you dont go beyond the memory your card can handle.
50 input sentences takes 2803MB on my computer.
So, we split the input into batches and regenerate it into a complete embedding matrix afterwards

        # The embedding matrix is a three-dimensional tensor corresponding to Sentence, Words, Embeddings
        # embeddings[5][4][:] is thus the embedding of the fourth word in the fifth sentence

"""
def get_embeddings_with_gpu(data_matrix, mask_matrix, batch_size):
    # Load the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {torch.cuda.get_device_name(0)}")

    # Set the model to use the device
    model.cuda()
    
    num_items = data_matrix.shape[0]
    num_loops = int(num_items/batch_size) + 1
    
    start = 0
    end = batch_size
    
    data_holder = []
    
    
    
    #while end <= num_items:
    for i in trange(num_loops):
        #print(f"Working on items {start} to {end} out of {num_items}")
        
        # Split the data into batches
        data_batch = data_matrix[start:end]
        mask_batch = mask_matrix[start:end]
        
        # Move the data onto the GPU
        data_batch = data_batch.to(device)
        mask_batch = mask_batch.to(device)
        
        # Generate the embeddings for this batch
        batch_embedding = model.forward(input_ids=data_batch,
            attention_mask=mask_batch,
            head_mask=None)[0]
        #print(f"Embedding generated with shape {batch_embedding.shape}")
    
        # Make it an ordinary np array instead of a torch
        batch_embedding = np.array(batch_embedding.tolist())
        data_holder.append(batch_embedding)
        
        #Move to next batch
        start += batch_size
        end += batch_size
    
    
    # Merge the batches we've generated
    embedding_matrix = np.array(data_holder).reshape(num_items, length, -1)
    print(f"Final embedding generated with shape {embedding_matrix.shape}")

    return embedding_matrix


embedding_matrix = get_embeddings_with_gpu(data_matrix, mask_matrix, 20)

Using device: GeForce GTX 1050 Ti


100%|██████████| 1001/1001 [02:40<00:00,  6.23it/s]


ValueError: cannot reshape array of size 1001 into shape (20000,50,newaxis)

In [None]:
np.array(embedding_matrix).shape

In [None]:


#Format the labels
tags_matrix = np.array([x[0] for x in label_df[['Output_Class']].values])
print(tags_matrix.shape)

## Train the classifier on the embeddings

In [None]:
from sklearn.linear_model import LogisticRegression

embeddings = embedding_matrix.reshape(10000, -1)
entities = tags_matrix.reshape(10000,)

print(embeddings.shape)
print(entities.shape)
X = embeddings
y = entities

clf = LogisticRegression(random_state=0).fit(X, y)


# Evaluate