In [1]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoModel
from transformers import AutoTokenizer

from embedding_functions import *

# Pretrained checkpoint shared by the model and the tokenizer; keeping it in one
# place guarantees both are loaded from the same source.
MODEL_NAME = "KB/albert-base-swedish-cased-alpha"
LABEL_DICT_PATH = "../data/dummy_index_label_dict.npy"

# Load the model and tokenizer
model = AutoModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load the label dictionary. It is later indexed with a predicted class index to
# obtain a label (see remap), and its values are the label strings.
# The file is read with allow_pickle=True and unwrapped with .tolist(), so it
# presumably stores a pickled Python dict.
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
label_dictionary = np.load(LABEL_DICT_PATH, allow_pickle=True).tolist()
cf_labels = list(label_dictionary.values())

# Update the tokenizer with our labels (which do not exist in the normal vocabulary)
tokenizer.add_tokens(cf_labels)

# Inform the model that we've updated the vocab. Kept as the last expression of
# the cell so the notebook still displays the resized embedding layer.
model.resize_token_embeddings(len(tokenizer))

Embedding(50030, 128)

In [2]:
def format_single_sentence(s, max_length=100):
    """
    Encode a string as a fixed-length id tensor for the ALBERT model used in
    this project, together with the matching attention mask.

    Parameters
    ----------
    s : str
        The sentence to encode.
    max_length : int, default 100
        Length of the padded sequence. The default matches the sequence length
        the BiLSTM classifier in this notebook is built with.

    Returns
    -------
    (input_tensor, attention_tensor)
        Two tensors of shape (1, max_length): the token ids (padded with the id
        of '<pad>') and a boolean mask that is True for real tokens and False
        for padding.

    Raises
    ------
    ValueError
        If max_length is smaller than 1, or '<pad>' does not encode to exactly
        one id.
    """
    import warnings

    if max_length < 1:
        raise ValueError("max_length must be at least 1")

    pad_ids = tokenizer.encode('<pad>', add_special_tokens=False)
    if len(pad_ids) != 1:
        raise ValueError("'<pad>' must encode to exactly one token id")
    pad_id = pad_ids[0]

    # Encode our sentence
    encoding = list(tokenizer.encode(s))

    # Previously an encoding longer than the buffer silently grew the buffer
    # (list slice assignment), producing a tensor of the wrong length.
    if len(encoding) > max_length:
        warnings.warn(
            "Sentence encodes to %d tokens; truncating to %d"
            % (len(encoding), max_length)
        )
        # Keep the final id: encode() is called with its default special-token
        # handling, so the last id is presumably the closing special token.
        encoding = encoding[:max_length - 1] + encoding[-1:]

    n_real = len(encoding)
    input_ids = encoding + [pad_id] * (max_length - n_real)

    # Mask by position instead of comparing ids with a hard-coded 0, so the
    # mask stays correct whatever id the tokenizer assigns to '<pad>'.
    attention = [position < n_real for position in range(max_length)]

    # Adjust dimensionality (leading axis of size 1), make it a tensor
    input_tensor = torch.tensor([input_ids])
    attention_tensor = torch.tensor([attention])
    return input_tensor, attention_tensor

#inp, att = format_single_sentence('hi')
    
#inp, att = format_single_sentence('hi')

In [3]:
def classify_single_sentence(sentence_tensor, attention_tensor):
    """
    Run one formatted sentence through the embedding model and the classifier.

    Takes the id tensor and attention mask produced for the ALBERT model,
    computes embeddings, lets the module-level ``classifier`` predict on them
    and maps the result to labels via ``remap`` and ``label_dictionary``.

    Returns a DataFrame with the columns 'Word' (the input tokens),
    'Predicted_Label' and 'Certainity' (the highest predicted probability).
    """
    # The trailing 1 is presumably the batch size - confirm against embedding_functions.
    token_embeddings = get_embeddings_with_gpu_batch(sentence_tensor, attention_tensor, 1)

    # Class probabilities from the classifier
    class_probabilities = classifier.predict(token_embeddings)

    # Flatten the ids we fed in and turn them back into readable tokens
    flat_ids = np.array(sentence_tensor).reshape(-1,)

    # Column name -> values, in the order the columns should appear
    column_values = {
        'Word': tokenizer.convert_ids_to_tokens(flat_ids),
        # Transposed so that there is one row per token
        'Predicted_Label': remap(class_probabilities, label_dictionary).T,
        # Confidence = largest probability along the class axis
        'Certainity': class_probabilities.max(axis=2).T,
    }

    # Assemble the easy-to-read output frame column by column
    result_frame = pd.DataFrame(columns=list(column_values))
    for column_name, values in column_values.items():
        result_frame[column_name] = values

    return result_frame

In [4]:
def NER(sentence):
    """
    Shorthand for taking any sentence and testing the model/classifier on it.

    Formats the sentence with format_single_sentence, passes the resulting
    tensors to classify_single_sentence, and returns whatever that returns.
    """
    ids, mask = format_single_sentence(sentence)
    return classify_single_sentence(ids, mask)

In [5]:
from keras.layers import LSTM, Bidirectional, TimeDistributed, Dropout, Dense
from keras.models import Model, Input
def bilstm_model(seq_length=100, embedding_dim=768, lstm_units=200, n_labels=33,
                 recurrent_dropout=0.1):
    """
    Build and compile the bidirectional-LSTM token classifier.

    The model selection was very basic. Testing was performed on the same 5000
    sentences with number of units = 100, 200, 300, 400. Each subsequent
    increase in units massively increased complexity for minor gains in
    performance. 10% of the data was used for validation.

    Parameters
    ----------
    seq_length : int, default 100
        Number of positions per input sentence.
    embedding_dim : int, default 768
        Size of the vector supplied for each position.
    lstm_units : int, default 200
        Units per LSTM direction.
    n_labels : int, default 33
        Size of the softmax output at each position.
    recurrent_dropout : float, default 0.1
        Recurrent dropout passed to the LSTM layer.

    Returns
    -------
    The compiled Keras model (adam optimizer, categorical cross-entropy loss,
    accuracy metric). Its summary is printed as a side effect.
    """
    # Each datapoint is a sentence of seq_length positions, each embedding_dim wide
    inputs = Input(shape=(seq_length, embedding_dim))

    # Pass it through a bidirectional LSTM; return_sequences keeps one output per position
    hidden = Bidirectional(
        LSTM(units=lstm_units, return_sequences=True, recurrent_dropout=recurrent_dropout)
    )(inputs)

    # Apply the same softmax output layer at every position
    outputs = TimeDistributed(Dense(n_labels, activation="softmax"))(hidden)

    model = Model(inputs, outputs)

    # Compile it
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

    # summary() prints the table itself and returns None; wrapping it in print()
    # emitted an extra "None" line after the table.
    model.summary()
    return model


Using TensorFlow backend.


In [6]:
def remap(matrix, label_dictionary):
    """
    Utility function to take one-hot encoded labels (or class probabilities)
    and return the label with the highest value.

    Parameters
    ----------
    matrix : array-like with three axes
        The winning class is picked along the last axis (axis=2).
    label_dictionary : mapping (or sequence)
        Looked up with the integer class index to obtain the label.

    Returns
    -------
    pandas.DataFrame
        One row per entry along the first axis of ``matrix`` and one column per
        entry along its second axis, holding the looked-up labels.

    Raises
    ------
    KeyError
        If ``label_dictionary`` is a dict and a predicted index is missing
        from it.
    """
    # Index of the highest value along the class axis
    predicted_index = np.argmax(matrix, axis=2)
    n_rows, n_positions = predicted_index.shape

    # Map every index to its label with plain Python lookups. This avoids
    # DataFrame.applymap, which is deprecated in recent pandas releases, and
    # .tolist() hands the lookup native ints rather than numpy scalars.
    labels = [
        [label_dictionary[index] for index in row]
        for row in predicted_index.tolist()
    ]

    # Explicit index/columns keep the frame's shape even when there are no rows
    return pd.DataFrame(labels, index=range(n_rows), columns=range(n_positions))

In [7]:

# Weights file for the classifier architecture built below
weights_path = "../data/test_weights.h5"

# Build the (untrained) BiLSTM classifier, then load the saved weights into it
classifier = bilstm_model()
classifier.load_weights(weights_path)

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100, 768)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 400)          1550400   
_________________________________________________________________
time_distributed_1 (TimeDist (None, 100, 33)           13233     
Total params: 1,563,633
Trainable params: 1,563,633
Non-trainable params: 0
_________________________________________________________________
None


In [8]:
# Swedish demo sentence to run through the tagger
s = (
    "Oscuar Oscuarsson är en student på KTH sedan 2020, "
    "han studerar i Stockholm och arbetar med NER sedan några månader tillbaka"
)

# Keep only the first 30 rows of the result table
df = NER(s).head(30)

  0%|          | 0/1 [00:00<?, ?it/s]


RuntimeError: cuda runtime error (999) : unknown error at /pytorch/aten/src/THC/THCGeneral.cpp:50

In [None]:
# Render the result table as LaTeX source, leaving out the index column
latex_table = df.to_latex(index=False)
print(latex_table)