In [1]:
from transformers import AutoModel
from transformers import AutoTokenizer
from embedding_functions import *
import pandas as pd
import torch

# Load the pretrained Swedish ALBERT model and its matching tokenizer (same checkpoint name for both)

model = AutoModel.from_pretrained('KB/albert-base-swedish-cased-alpha')
tokenizer = AutoTokenizer.from_pretrained("KB/albert-base-swedish-cased-alpha")

# Load the label dictionary; its values are used as the label strings below.
# NOTE(review): `np` is not imported explicitly in this cell - presumably it is provided by
# `from embedding_functions import *`; consider adding `import numpy as np`.
# NOTE(review): allow_pickle=True unpickles the file contents, which can execute arbitrary code;
# only load files from a trusted source.
label_dictionary = np.load("../data/dummy_index_label_dict.npy", allow_pickle=True).tolist()
cf_labels = list(label_dictionary.values())

# Update the tokenizer with our labels (which do not exist in the normal vocabulary)
tokenizer.add_tokens(cf_labels)

# Inform the model that we've updated the vocab: resize its embedding table to the new tokenizer size
model.resize_token_embeddings(len(tokenizer))

Embedding(50030, 128)

In [2]:
def format_single_sentence(s):
    """
    Takes a string and formats it for use with the ALBERT model used in this project.

    The sentence is encoded with the module-level ``tokenizer`` and right-padded
    with the id(s) of '<pad>' to a fixed length of 100 tokens. Encodings longer
    than 100 tokens are truncated so that the output always has the fixed
    length; the last encoded token is kept (presumably the end-of-sequence
    marker added by ``tokenizer.encode`` - NER() looks for '[SEP]').

    Parameters
    ----------
    s : str
        The raw sentence.

    Returns
    -------
    input_tensor : torch.Tensor
        Token ids, reshaped to (1, -1).
    attention_tensor : torch.Tensor
        Boolean mask of the same shape, True for real tokens, False for padding.
    """
    max_length = 100

    pad_symbol = tokenizer.encode('<pad>', add_special_tokens=False)
    input_array = pad_symbol * max_length

    # Encode our sentence
    encoding = tokenizer.encode(s)

    # A longer encoding would make the slice assignment below *grow* the list
    # past max_length, so truncate while keeping the final token.
    if len(encoding) > max_length:
        encoding = encoding[:max_length - 1] + encoding[-1:]

    # Overwrite the leading part of the padding with the real tokens
    input_array[:len(encoding)] = encoding

    # Adjust dimensionality, make it a tensor
    input_ids = np.array(input_array)
    input_tensor = torch.tensor(input_ids.reshape(1, -1))

    # Mask by actual sequence length instead of comparing ids against 0,
    # so the mask does not depend on the pad token having id 0.
    attention = np.arange(input_ids.shape[0]) < len(encoding)
    attention_tensor = torch.tensor(attention.reshape(1, -1))
    return input_tensor, attention_tensor
    
#inp, att = format_single_sentence('hi')

In [3]:
def classify_single_sentence(sentence_tensor, attention_tensor):
    """
    Run one formatted sentence through the embedding model and the classifier.

    Embeddings are produced by get_embeddings_with_gpu_batch (third argument 1,
    presumably the batch size - TODO confirm) and fed to the module-level
    ``classifier``. The class probabilities are mapped to label strings with
    remap() and ``label_dictionary``.

    Returns a dataframe with one row per input token and the columns
    'Word', 'Predicted_Label' and 'Probability' (the highest class probability).
    """
    embeddings = get_embeddings_with_gpu_batch(sentence_tensor, attention_tensor, 1)

    # Class probabilities from the classifier
    class_probabilities = classifier.predict(embeddings)

    # Most likely label for every token, plus how confident the classifier was
    labels = remap(class_probabilities, label_dictionary)
    confidence = class_probabilities.max(axis=2)

    # Recover the token strings from the ids that were fed to the model
    token_ids = np.array(sentence_tensor).reshape(-1,)
    tokens = tokenizer.convert_ids_to_tokens(token_ids)

    # Assemble an easy-to-read table, one row per token
    table = pd.DataFrame(columns = ['Word', 'Predicted_Label', 'Probability'])
    table['Word'] = tokens
    table['Predicted_Label'] = labels.T
    table['Probability'] = confidence.T

    return table

In [4]:
def NER(sentence):
    """
    Shorthand for taking any sentence and testing the model/classifier on it.

    Formats the sentence, classifies it, and returns only the rows strictly
    between the first row and the first '[SEP]' row of the result table.
    """
    inputs, mask = format_single_sentence(sentence)
    predictions = classify_single_sentence(inputs, mask)

    # Index label of the first row whose token is '[SEP]'
    separator_rows = predictions.loc[predictions['Word'] == '[SEP]']
    first_separator = separator_rows.index[0]

    # Skip row 0 and stop before the separator
    return predictions.iloc[1:first_separator]


In [5]:
from keras.layers import LSTM, Bidirectional, TimeDistributed, Dropout, Dense
from keras.models import Model, Input
def bilstm_model(sequence_length=100, embedding_dim=768, n_labels=33, lstm_units=200):
    """
    Build and compile the bidirectional LSTM token classifier.

    The model selection was very basic. Testing was performed on the same 5000 sentences with number of units = 100, 200, 300, 400
    Each subsequent increase in units massively increased complexity for minor gains in performance.
    10% of the data was used for validation

    Parameters (the defaults reproduce the original hard-coded architecture)
    ----------
    sequence_length : int
        Number of tokens per sentence.
    embedding_dim : int
        Size of each token embedding.
    n_labels : int
        Number of output classes of the softmax layer.
    lstm_units : int
        Units per LSTM direction.

    Returns
    -------
    The compiled Keras model (adam optimizer, categorical cross-entropy loss,
    accuracy metric). A summary of the model is printed as a side effect.
    """
    # Define the input shape. Each datapoint is a sentence consisting of
    # sequence_length tokens, each token an embedding_dim-dimensional vector
    inputs = Input(shape=(sequence_length, embedding_dim))

    # Pass it through a bidirectional lstm that returns an output for every token
    recurrent = Bidirectional(LSTM(units=lstm_units, return_sequences=True, recurrent_dropout=0.1))(inputs)

    # Apply the same softmax output layer to every time step
    out = TimeDistributed(Dense(n_labels, activation="softmax"))(recurrent)

    model = Model(inputs, out)

    # Compile it
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line to the output.
    model.summary()
    return model


Using TensorFlow backend.


In [6]:
def remap(matrix, label_dictionary):
    """
    Utility function to take one-hot encoded labels (or class probabilities)
    and return the label with the highest probability.

    Parameters
    ----------
    matrix : array-like, 3-dimensional
        The argmax is taken over the last axis (axis=2).
    label_dictionary : mapping (or sequence) from class index to label
        Looked up with plain Python ints.

    Returns
    -------
    pandas.DataFrame
        Same shape as the first two axes of ``matrix``, holding the label of
        the highest-scoring class at each position.
    """
    # Index of the highest value along the last axis (the one-hot / class axis)
    predicted_index = np.argmax(matrix, axis=2)

    # Map every index to its label with a plain lookup. This replaces
    # DataFrame.applymap, which is deprecated in recent pandas releases.
    flat_labels = [label_dictionary[i] for i in predicted_index.ravel().tolist()]

    # Fill an object array element-wise so the result keeps the 2-D shape,
    # also when the input is empty.
    label_matrix = np.empty(predicted_index.size, dtype=object)
    for position, label in enumerate(flat_labels):
        label_matrix[position] = label

    return pd.DataFrame(label_matrix.reshape(predicted_index.shape))

In [7]:
def save_as_latex(df):
    """
    Takes a dataframe, converts it to a LaTeX table and saves it.

    The file name (and the LaTeX label) is built from the 'Word' entries with
    index labels 1 and 2 plus the second-to-last 'Word' entry, so ``df`` must
    contain those index labels (as the frames returned by NER() do) and have
    at least two rows.

    The table is written to ``../data/assets/<name>.tex`` with the float
    placement specifier ``[H]`` inserted after ``\\begin{table}``.

    Returns the LaTeX source that was written.
    """
    caption = 'The predicted labels and corresponding probability for each token in a sentence'

    words = df['Word']
    name = words[1] + words[2] + list(words)[-2]

    label = name + 'sentence'
    sentence_table = df.to_latex(index=False, float_format="%.3f", caption=caption, label = label)

    # Insert the [H] placement specifier right after the opening of the table
    # environment. Check the prefix instead of blindly slicing off 13
    # characters, so unexpected to_latex output is never silently corrupted.
    table_open = '\\begin{table}'
    if sentence_table.startswith(table_open):
        latex = table_open + '[H]' + sentence_table[len(table_open):]
    else:
        latex = sentence_table

    # Tokens contain non-ASCII characters (e.g. the word-start marker), so
    # write with an explicit encoding rather than the platform default.
    with open(f'../data/assets/{name}.tex', 'w', encoding='utf-8') as tf:
        tf.write(latex)
    print("saved!")
    return latex

In [8]:

# Build the (untrained) BiLSTM classifier architecture defined by bilstm_model()
classifier = bilstm_model()

# Load previously saved weights into the new model; the architecture must match the one the weights were saved from
classifier.load_weights("../data/cross_val/test_weights.h5")

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100, 768)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 400)          1550400   
_________________________________________________________________
time_distributed_1 (TimeDist (None, 100, 33)           13233     
Total params: 1,563,633
Trainable params: 1,563,633
Non-trainable params: 0
_________________________________________________________________
None


In [9]:
def latexformatter(x):
    """
    Wrap the special tokens in braces, since bare [] cannot be used in LaTeX
    table rows. '<pad>' is rendered as '{[PAD]}'. Any other value is returned
    unchanged.
    """
    replacements = (
        ('[SEP]', '{[SEP]}'),
        ('[CLS]', '{[CLS]}'),
        ('<pad>', '{[PAD]}'),
    )
    for token, escaped in replacements:
        if x == token:
            return escaped
    return x

In [10]:
# Demo: tag a sentence, keep at most the first 20 token rows, save them as a LaTeX table and display them
s = "Oscuar Oscuarsson har studerat på KTH, Stockholm, sedan några månader tillbaka"
df = NER(s).head(20)
save_as_latex(df)
df

100%|██████████| 1/1 [00:00<00:00, 20.29it/s]

Final embedding generated with shape (1, 100, 768)





saved!


Unnamed: 0,Word,Predicted_Label,Probability
1,▁Osc,B-PRS,0.777881
2,uar,I-PRS,0.80508
3,▁Osc,I-PRS,0.830014
4,uar,I-PRS,0.934756
5,sson,I-PRS,0.960764
6,▁har,B-O,0.999869
7,▁studerat,B-O,0.995578
8,▁på,B-O,0.972232
9,▁KTH,B-ORG,0.946688
10,",",B-O,0.774796


In [11]:
# Demo: the same surface form ("Ericsson") appears twice in one sentence
s = "Ericsson hade arbetat på Ericsson länge"

# Tag the sentence, save the table as LaTeX and display it
df = NER(s)
save_as_latex(df)
df

100%|██████████| 1/1 [00:00<00:00, 37.94it/s]


Final embedding generated with shape (1, 100, 768)
saved!


Unnamed: 0,Word,Predicted_Label,Probability
1,▁Ericsson,B-PRS,0.878125
2,▁hade,B-O,0.995912
3,▁arbetat,B-O,0.99993
4,▁på,B-O,0.998396
5,▁Ericsson,B-ORG,0.707539
6,▁länge,B-O,0.977098


In [12]:
# Demo: sentence mentioning "French Open" and "Frankrike", with a trailing clause
s = "De tävlade i French Open, Frankrike, och vann"
df = NER(s)
save_as_latex(df)
df

100%|██████████| 1/1 [00:00<00:00, 37.38it/s]

Final embedding generated with shape (1, 100, 768)
saved!





Unnamed: 0,Word,Predicted_Label,Probability
1,▁De,B-O,0.998481
2,▁tävlade,B-O,0.996324
3,▁i,B-O,0.98868
4,▁Fren,B-EVN,0.765089
5,ch,I-EVN,0.690283
6,▁Open,I-EVN,0.807067
7,",",B-O,0.975109
8,▁Frankrike,B-LOC,0.742382
9,",",B-O,0.847925
10,▁och,B-O,0.96684


In [13]:
# Demo: same sentence as in the previous cell but without the trailing ", och vann"
s = "De tävlade i French Open, Frankrike"

# Tag the sentence, save the table as LaTeX and display it
df = NER(s)
save_as_latex(df)
df

100%|██████████| 1/1 [00:00<00:00, 35.49it/s]

Final embedding generated with shape (1, 100, 768)





saved!


Unnamed: 0,Word,Predicted_Label,Probability
1,▁De,B-O,0.998268
2,▁tävlade,B-O,0.994389
3,▁i,B-O,0.996378
4,▁Fren,B-ORG,0.357428
5,ch,I-ORG,0.384866
6,▁Open,I-EVN,0.307438
7,",",B-O,0.71778
8,▁Frankrike,B-O,0.651576


# Test sentence from SUC

In [14]:
from data_functions import *

In [15]:
def test_sentence(index, tokenizer, classifier, label_dictionary):
    """
    Compare the classifier's predictions with the ground truth for one stored sentence.

    The sentence at position ``index`` is read from '../data/sentence_tokens.npy',
    its labels from get_label_file() and its precomputed embedding from
    get_embed_file(). Both are reshaped to (1, 100, -1) before use.

    Returns the first 25 rows of a dataframe with the columns
    'Word', 'Predicted_Label', 'True_Label' and 'Probability'.
    """
    # Stored token ids for the sentence, converted back into token strings
    token_ids = np.array(np.load('../data/sentence_tokens.npy')[index])
    words = np.array(tokenizer.convert_ids_to_tokens(token_ids))

    # Ground truth labels
    truth_one_hot = get_label_file()[index].reshape(1,100,-1)
    true_labels = remap(truth_one_hot, label_dictionary)

    # Predictions from the stored embedding matrix of the sentence
    sentence_embedding = get_embed_file()[index].reshape(1,100,-1)
    class_probabilities = classifier.predict(sentence_embedding)
    predicted = remap(class_probabilities, label_dictionary)

    # Confidence of each prediction (highest class probability)
    confidence = class_probabilities.max(axis=2)

    # Collect everything in one table, one row per token
    report = pd.DataFrame(columns = ['Word', 'Predicted_Label', 'True_Label', 'Probability'])
    report['Word'] = words
    report['Predicted_Label'] = predicted.T
    report['True_Label'] = true_labels.T
    report['Probability'] = confidence.T

    return report.head(25)


In [16]:
# Position of the stored sentence to inspect
select = 20010

# Show predicted vs. true labels for that sentence (first 25 tokens)
test_sentence(select, tokenizer, classifier, label_dictionary)


Unnamed: 0,Word,Predicted_Label,True_Label,Probability
0,[CLS],[CLS],[CLS],0.999949
1,▁Av,B-O,B-O,0.998326
2,▁återstoden,B-O,B-O,0.982834
3,▁faller,B-O,B-O,0.999754
4,▁merparten,B-O,B-O,0.988571
5,▁på,B-O,B-O,0.99771
6,▁Japan,B-LOC,B-LOC,0.996925
7,▁och,B-O,B-O,0.986228
8,▁Sydostasien,B-LOC,B-LOC,0.988899
9,▁,B-O,B-O,0.999043
