# SETUP

In [37]:
# Imports
import pandas as pd
import numpy as np
import seaborn as sns
import tensorflow as tf
import torch
from tqdm import tqdm, trange
from transformers import AutoModel
from keras.preprocessing.sequence import pad_sequences
from transformers import AutoTokenizer
import matplotlib.pyplot as plt

In [71]:
# GPU check
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


# FUNCTIONS

In [3]:
def get_embeddings_with_cpu(data_tensor, mask_tensor):
    embeddings = model.forward(input_ids=data_tensor,
        attention_mask=mask_tensor,
        head_mask=None)
    print(embeddings[0].shape)

    return embeddings[0]

In [4]:
"""
For dealing with large amounts of data, a GPU is much faster

The resulting embedding matrix is a three-dimensional tensor corresponding to                 [Sentence][Words][Embeddings]
embeddings[5][4][:] is thus the embedding of the fourth word in the fifth sentence
"""

def get_embeddings_with_gpu(data_matrix, mask_matrix):
    # Load the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    #print(f"Using device: {torch.cuda.get_device_name(0)}")

    # Set the model to use the device
    model.cuda()

    # Move the data onto the GPU
    data_matrix = data_matrix.to(device)
    mask_matrix = mask_matrix.to(device)

    # Generate embeddings
    matrix_embedding = model.forward(input_ids=data_matrix,
        attention_mask=mask_matrix,
        head_mask=None)[0]
    #print(f"Embedding generated with shape {batch_embedding.shape}")

    # Make it an ordinary np array instead of a torch
    matrix_embedding = np.array(matrix_embedding.tolist())

    return matrix_embedding

#Keep amount of samples low to not overwhelm the gpu
#get_embeddings_with_gpu(data_tensor_matrix[:10], mask_tensor_matrix[:10])

In [5]:
"""
Most people won't be able to load all the data onto the GPU at once however, so it's better   to do it in batches.
(50 input sentences take 2803MB on my computer, for example).

This method batchifies and stitches together the batches
"""
def get_embeddings_with_gpu_batch(data_matrix, mask_matrix, batch_size):
    num_items = data_matrix.shape[0]
    num_loops = int(np.ceil(num_items/batch_size))

    start = 0
    end = batch_size
    data_holder = []

    for i in trange(num_loops):
        # Split the data into batches
        data_batch = data_matrix[start:end]
        mask_batch = mask_matrix[start:end]

        #Get the embedding for the batch
        batch_embedding = get_embeddings_with_gpu(data_batch, mask_batch)

        data_holder.append(batch_embedding)

        #Move to next batch
        start += batch_size
        end += batch_size

    # Merge the batches we've generated
    embedding_matrix = np.vstack(data_holder)

    print(f"Final embedding generated with shape {embedding_matrix.shape}")

    return embedding_matrix

#embedding_matrix = get_embeddings_with_gpu_batch(data_tensor_matrix[:200], mask_tensor_matrix[:200], 50)


In [6]:
"""
The input needs to be on the form of a dataframe with a column named 'Sentence'
Where each row consists of one sentence

One can also pass an ordinary string, this method formats it for use with the model
"""
def check_input_format(input_data):
    print("Checking input...")
    if isinstance(input_data, pd.DataFrame):
        if 'Sentence' in input_data:
            print("PASS")
            return input_data
        else:
            print("FAIL")
    elif isinstance(input_data, str):
        print("Converting sentence to DataFrame Object")
        sentence_df = pd.DataFrame()
        sentence_df['Sentence'] = [input_data]
        return sentence_df
    else:
        print("FAIL")

In [11]:
def count_unique_elements_in_2d_matrix(matrix):
    #Stop it from being a tensor
    x = np.array(matrix.tolist())
    
    #Make it 1D
    x = x.reshape(x.shape[0]*x.shape[1], )
    
    #Make it into a set
    x = set(x)
    
    #Count amount of unique elements in set
    return len(x)

In [12]:
"""
Split data into train and test sets by the given ratio.
Validation sets are not needed, those we get for free with keras models
"""
def split_data(percentage_to_train_on, input_data, output_data):
    ratio = percentage_to_train_on

    split = int(np.ceil(ratio*input_data.shape[0]))

    train_x = input_data[:split]
    train_y = output_data[:split]

    test_x = input_data[split:]
    test_y = output_data[split:]

    return train_x, train_y, test_x, test_y


# LOAD DATA

In [57]:
def load_data_from_start():
    #Load the data
    df = pd.read_pickle("../data/sentence_labels")

    #Work on a subset for testing purposes
    input_df = df[['Sentence']].head(20000).copy()
    label_df = df[['Labels']].head(20000).copy()
    return input_df, label_df

#input_df, label_df = load_data_from_start()

def load_data_from_start2():
    #Load the data
    sentences = pd.read_pickle("../data/sentence_ints").head(100)
    attentions = pd.read_pickle("../data/attention_ints").head(100)
    labels = pd.read_pickle("../data/label_ints").head(100)
    label_list = np.load("../data/label_list.npy")
    return sentences, attentions, labels, label_list

sentences, attentions, labels, label_list = load_data_from_start2()

def generate_embeddings():
    data_tensor_matrix, mask_tensor_matrix,_ = format_sentences_for_BERT(input_df)
    label_matrix = format_labels_for_BERT(label_df)
    embedding_matrix = get_embeddings_with_gpu_batch(data_tensor_matrix, mask_tensor_matrix, 50)
    
    return embedding_matrix, label_matrix

#embedding_matrix, label_matrix = generate_embeddings()

def load_embeddings():
    #np.save('../data/20000_embedding', embedding_matrix)
    #np.save('../data/20000_label', label_matrix)
    embedding_matrix = np.load('../data/20000_embedding.npy')
    label_matrix = np.load('../data/20000_label.npy')
    return embedding_matrix, label_matrix

#embedding_matrix, label_matrix = load_embeddings()

In [70]:
model = AutoModel.from_pretrained('KB/albert-base-swedish-cased-alpha')
tokenizer = AutoTokenizer.from_pretrained("KB/albert-base-swedish-cased-alpha")
tokenizer.add_tokens(label_list.tolist())
model.resize_token_embeddings(len(tokenizer))

Embedding(50030, 128)

In [28]:
#get_embeddings_with_gpu_batch(sentences, attentions, 50)

In [72]:
sentences

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,394,395,396,397,398,399,400,401,402,403
0,2,136,216,544,9085,72,2955,771,3711,23095,...,0,0,0,0,0,0,0,0,0,0
1,2,136,56,26997,2398,56,1282,1142,72,2980,...,0,0,0,0,0,0,0,0,0,0
2,2,49933,8,8231,104,187,1586,11425,54,618,...,0,0,0,0,0,0,0,0,0,0
3,2,49933,8,276,56,34405,107,419,920,721,...,0,0,0,0,0,0,0,0,0,0
4,2,3377,53,79,298,33,12716,49933,1,3,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2,204,13013,2083,49933,8,588,100,35605,1229,...,0,0,0,0,0,0,0,0,0,0
96,2,49933,8,1045,2555,200,705,104,187,4266,...,0,0,0,0,0,0,0,0,0,0
97,2,335,34405,399,1087,4332,49933,8,132,803,...,0,0,0,0,0,0,0,0,0,0
98,2,276,33,1276,29039,93,640,1917,41,1917,...,0,0,0,0,0,0,0,0,0,0


In [16]:
#Train/test split it
ratio = 0.8
train_x,train_y,test_x,test_y = split_data(ratio, embedding_matrix, label_matrix)

one_hot_encoding = pd.get_dummies(train_y.reshape(-1,))

#Create a dictionary key for the one-hot-encoding indices and labels
label_dict = {}
for i in range(one_hot_encoding.shape[1]):
    integer = int(one_hot_encoding.columns[i])
    label = tokenizer.decode(integer)
    label_dict.update({i:label})

In [34]:
input_df.iloc[0].Sentence

'I sin första reaktion på Sovjetledarens varningar deklarerade Litauens president Vytautas Landsbergis att " nu avvisar Gorbatjov vår utsträckta hand med extremt skarpa och hämndlystna ord " .'

In [32]:
label_matrix[0]

array([    2, 49984, 49984, 49984, 49984, 49984, 49984, 49984, 49984,
           1, 49984,     1, 49984, 49984, 49984, 49984,     1, 49984,
       49984, 49984, 49984, 49984, 49984, 49984, 49984, 49984, 49984,
       49984,     3,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0])

In [37]:
tokenizer.decode(3)

'[SEP]'

In [19]:
# One-hot encode the training data
train_y = np.array(pd.get_dummies(train_y.reshape(-1,))).reshape(train_y.shape[0], train_y.shape[1], -1)
test_y = np.array(pd.get_dummies(test_y.reshape(-1,))).reshape(test_y.shape[0], test_y.shape[1], -1)
print(train_y.shape, test_y.shape)
print(train_x.shape, test_x.shape)

(16000, 50, 9) (4000, 50, 9)
(16000, 50, 768) (4000, 50, 768)


In [20]:
pd.get_dummies(train_y.reshape(-1,))

Unnamed: 0,0,1
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0
...,...,...
7199995,1,0
7199996,1,0
7199997,1,0
7199998,1,0


# TRAIN ALBERT

### Create classification layer

To ablate the fine-tuning approach, we apply the
feature-based approach by extracting the activations from one or more layers without fine-tuning
any parameters of BERT. These contextual embeddings are used as input to a randomly initialized two-layer 768-dimensional BiLSTM before
the classification layer.



In [21]:
# Define a simple baseline model
from keras.models import Sequential
from keras.layers import Dense

#input_dim = input_data.shape[1]
#output_dim = output_data.shape[1]

In [22]:
def linear_model():
    # create model
    model = Sequential()
    model.add(
        Dense(8, input_dim=input_dim, activation='relu'))
    model.add(
        Dense(output_dim, activation='softmax'))
    
    # Compile model
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

In [23]:
from keras.layers import LSTM, Bidirectional, TimeDistributed, Dropout
from keras.models import Model, Input
def naive_bilstm_model():   
    #Define the input shape. Each datapoint is a sentence consisting of seqlength words, each word 784dims
    input = Input(shape=(train_x.shape[1], train_x.shape[2]))
    # Pass it through a bidirectional lstm
    model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(input)
            
    # add a timedistributed layer
    out = TimeDistributed(Dense(train_y.shape[2], activation="softmax"))(model)  # softmax output layer
    
    model = Model(input, out)
    
    # Compile it
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
    print(model.summary())
    return model


In [24]:
# Train the model, setting aside 10% data for validation
model = naive_bilstm_model()
def train_model():
    history = model.fit(
        train_x,
        train_y,
        epochs=1,
        batch_size=50,
        validation_split=0.1)
    return history

history = train_model()
print('\nhistory dict:', history.history.keys())

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 50, 768)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 50, 200)           695200    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 50, 9)             1809      
Total params: 697,009
Trainable params: 697,009
Non-trainable params: 0
_________________________________________________________________
None
Train on 14400 samples, validate on 1600 samples
Epoch 1/1

history dict: dict_keys(['val_loss', 'val_accuracy', 'loss', 'accuracy'])


In [25]:
_,_, sentence_data = format_sentences_for_BERT(input_df)

Checking input...
PASS


In [30]:
i = 1
sentence_embedding = embedding_matrix[i]
sentence_embedding = sentence_embedding.reshape(1, sentence_embedding.shape[0], sentence_embedding.shape[1])
sentence_predicitons = model.predict(sentence_embedding)
sentence_predicitons = sentence_predicitons.reshape(sentence_predicitons.shape[1], sentence_predicitons.shape[2])
for prediction in range(sentence_predicitons.shape[0]):
    predicted_class = np.argmax(sentence_predicitons[prediction])
    print(predicted_class)
    
    predicted_class = tokenizer.decode(int(predicted_class))
    word = tokenizer.decode(int(sentence_data['Input'].iloc[i][prediction]))
    print(predicted_class, word)


2
[CLS] [CLS]
8
- I
8
- en
8
- ruta
8
- talar
8
- en
8
- kort
8
- rad
8
- på
8
- ryska
8
- om
8
- att
8
- de
8
- port
8
- ation
8
- har
8
- förekommit
8
- ,
8
- en
8
- siff
8
- erk
8
- omb
8
- ination
8
- förklarar
8
- graden
0
<pad> av
0
<pad> 
0
<pad> "
0
<pad> brottslighet
0
<pad> 
0
<pad> "
0
<pad> 
0
<pad> .
0
<pad> [SEP]
0
<pad> <pad>
0
<pad> <pad>
0
<pad> <pad>
0
<pad> <pad>
0
<pad> <pad>
0
<pad> <pad>
0
<pad> <pad>
0
<pad> <pad>
0
<pad> <pad>
0
<pad> <pad>
0
<pad> <pad>
0
<pad> <pad>
0
<pad> <pad>
0
<pad> <pad>
0
<pad> <pad>
0
<pad> <pad>


In [27]:
i = 0
sentence_tokens = sentence_data[['Input']].iloc[i]
for token in sentence_tokens:
    print(tokenizer.decode(token))

#Get the sentence we are testing on.
#For each token, get the prediction


[CLS] I sin första reaktion på Sovjetledarens varningar deklarerade Litauens president Vytautas Landsbergis att " nu avvisar Gorbatjov vår utsträckta hand med extremt skarpa och hämndlystna ord ".[SEP]<pad><pad><pad><pad><pad><pad>


In [28]:

    
    #Classify it
    c = base_model.predict(n)
    
    cols = ['word', 'entity', 'probability']

    pretty_output_df = pd.DataFrame(columns={'word', 'entity', 'probability'})
    
    #Start from 1 to skip the cls token
    #Print the predictions 
    
    predictions = c[0]

    #Take the index of the most likely word. This correspond to the class
    index = np.argmax(predictions)

    #Translate this index to a label
    #label = tokenizer.decode()

    #Grab the certainity
    certainity = predictions[index]
    
    pretty_output_df = pretty_output_df.append(pd.Series(['[CLS]', label_dict.get(index), certainity], index = cols), ignore_index=True)

    for i in range(len(word_list)+4):
        
        
        #Get the predictions for the first word, skip the cls token
        predictions = c[i+1]
        
        #Take the index of the most likely word. This correspond to the class
        index = np.argmax(predictions)
        
        #Translate this index to a label
        #label = tokenizer.decode()
        
        #Grab the certainity
        certainity = predictions[index]
        
        #Check which word this is
        if i <len(word_list):
            word = word_list[i]
        else:
            word = "[P A D]"
            
        pretty_output_df = pretty_output_df.append(pd.Series([word, label_dict.get(index), certainity], index = cols), ignore_index=True)   
    return pretty_output_df


NameError: name 'base_model' is not defined

In [None]:
z = pd.DataFrame(columns=["Word", "Prediction"])
z

In [None]:
def create_history_df(history):
    training_df = pd.DataFrame()
    for metric in history.history.keys():
        training_df[metric] = history.history[metric]
    return training_df

history_df = create_history_df(history)

In [None]:
sns.set()
fig, ax = plt.subplots(figsize=(15,10))
history_df.plot(ax=ax)
plt.ylabel('Metric')
plt.xlabel('Epoch')
plt.show()

# TEST ALBERT

In [None]:
print('\n# Evaluate on test data')
results = model.evaluate(test_x, test_y)
for i, metric in enumerate(model.metrics_names):
    print(f"{metric}: {results[i]}")

In [None]:
s = "En enkel mening skriven av Victor från Stockholm, i förhoppningen om att KTH skall upptäckas som en institution."
x = NER(s, model)

In [None]:
p = model.predict(np.array([test_x[1]]))
p = np.argmax(p, axis=-1)
p

In [None]:
i = 1
p = model.predict(np.array([X_test[i]]))
p = np.argmax(p, axis=-1)
print("{:14} ({:5}): {}".format("Word", "True", "Pred"))
for w,pred in zip(X_test[i],p[0]):
    print("{:14}: {}".format(words[w],tags[pred]))

# Setup

# Setup

# Setup

# Setup