In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
from data_functions import *

Using TensorFlow backend.


In [2]:
# Read the pickled sentence/label table from disk.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
df = pd.read_pickle("../data/sentence_labels")

# Keep only the first 2000 rows while experimenting; .copy() makes the two
# single-column frames independent of `df`.
input_df = df.iloc[:2000][['Sentence']].copy()
label_df = df.iloc[:2000][['Labels']].copy()

## Formatting the input and labels

In [3]:
# Convert the sentence frame into the two tensors that are later fed to the
# model as `input_ids` and `attention_mask`, and the label frame into a label matrix.
# NOTE(review): both helpers come from data_functions; their exact output
# shapes are not visible here -- confirm against that module.
data_tensor_matrix, mask_tensor_matrix = format_sentences_for_BERT(input_df)
label_matrix = format_labels_for_BERT(label_df)

Checking input...
PASS


## Generate Embeddings

In [4]:
#Load model
# `model` is read as a global by the embedding helpers defined further down.
from transformers import AutoModel
model = AutoModel.from_pretrained('KB/bert-base-swedish-cased')

In [5]:
def get_embeddings_with_cpu(data_tensor, mask_tensor):
    """
    Generate an embedding for every token of the given input, without moving
    anything to another device.

    No device transfer happens here: the inputs must already live on the same
    device as the global `model`.

    Parameters
    ----------
    data_tensor : tensor
        Passed to the model as `input_ids`.
    mask_tensor : tensor
        Passed to the model as `attention_mask`.

    Returns
    -------
    tensor
        The first element of the model output (one embedding per token).
    """
    # Local import: no explicit torch import is visible in the notebook's import cell.
    import torch

    # Inference only -- skip autograd bookkeeping so activations are not kept alive.
    with torch.no_grad():
        # Call the module itself rather than .forward() so registered hooks run.
        outputs = model(input_ids=data_tensor,
                        attention_mask=mask_tensor,
                        head_mask=None)

    return outputs[0]
#embedding_matrix = get_embeddings_with_cpu(data_matrix[:10], mask_matrix[:10])


In [6]:
def get_embeddings_with_gpu(data_matrix, mask_matrix):
    """
    Generate token embeddings for the given sentences, on the GPU when one is
    available and on the CPU otherwise.

    For dealing with large amounts of data, a GPU is much faster.

    Parameters
    ----------
    data_matrix : tensor
        Passed to the model as `input_ids`.
    mask_matrix : tensor
        Passed to the model as `attention_mask`.

    Returns
    -------
    numpy.ndarray
        float64 array indexed as [Sentence][Words][Embeddings]. Indices are
        zero-based, so `embeddings[5][4][:]` is the embedding of the fifth
        word in the sixth sentence.
    """
    # Pick the device, falling back to the CPU when CUDA is unavailable
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Move the global model to the *selected* device. An unconditional
    # model.cuda() would raise on machines without CUDA and defeat the fallback.
    model.to(device)

    # Move the data onto the same device as the model
    data_matrix = data_matrix.to(device)
    mask_matrix = mask_matrix.to(device)

    # Inference only: disabling autograd avoids holding on to activations,
    # which noticeably reduces memory use per batch.
    with torch.no_grad():
        # Call the module itself rather than .forward() so registered hooks run.
        matrix_embedding = model(input_ids=data_matrix,
                                 attention_mask=mask_matrix,
                                 head_mask=None)[0]

    # Hand back an ordinary numpy array instead of a torch tensor. The cast to
    # float64 matches the dtype the earlier `np.array(tensor.tolist())`
    # conversion produced, without going through Python lists.
    return matrix_embedding.detach().cpu().numpy().astype(np.float64)

#embedding_matrix = get_embeddings_with_gpu(data_matrix[:10], mask_matrix[:10])


In [7]:
def get_embeddings_with_gpu_batch(data_matrix, mask_matrix, batch_size):
    """
    Embed the input in batches and stitch the per-batch results together.

    Most people won't be able to load all the data onto the GPU at once, so it
    is better to do it in batches (50 input sentences take 2803MB on the
    author's computer, for example).

    Parameters
    ----------
    data_matrix : tensor
        Sliced along its first axis; each slice is forwarded to
        `get_embeddings_with_gpu` as the data argument.
    mask_matrix : tensor
        Sliced with the same bounds as `data_matrix`.
    batch_size : int
        Maximum number of rows per batch; must be at least 1.

    Returns
    -------
    numpy.ndarray
        The per-batch embeddings stacked along the first axis.

    Raises
    ------
    ValueError
        If `batch_size` is smaller than 1, the input is empty, or the data and
        mask have a different number of rows.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    num_items = data_matrix.shape[0]
    if num_items == 0:
        # np.vstack would fail on an empty list with a less helpful message
        raise ValueError("data_matrix is empty; there is nothing to embed")
    if mask_matrix.shape[0] != num_items:
        raise ValueError(
            f"data_matrix has {num_items} rows but mask_matrix has {mask_matrix.shape[0]}")

    data_holder = []

    # Step through the rows batch_size at a time; slicing clamps the last,
    # possibly shorter, batch automatically.
    for start in trange(0, num_items, batch_size):
        end = start + batch_size

        # Get the embedding for the batch
        batch_embedding = get_embeddings_with_gpu(data_matrix[start:end],
                                                  mask_matrix[start:end])
        data_holder.append(batch_embedding)

    # Merge the batches we've generated
    embedding_matrix = np.vstack(data_holder)

    print(f"Final embedding generated with shape {embedding_matrix.shape}")

    return embedding_matrix


embedding_matrix = get_embeddings_with_gpu_batch(data_tensor_matrix, mask_tensor_matrix, 50)
print(embedding_matrix.shape)

  0%|          | 0/40 [00:00<?, ?it/s]


RuntimeError: cuda runtime error (999) : unknown error at /pytorch/aten/src/THC/THCGeneral.cpp:50

In [None]:
# Turn the embeddings and labels into classifier inputs and targets.
# `label_dict` is used further down to translate a predicted class index
# back into an entity label.
# NOTE(review): helper semantics live in data_functions -- confirm there.
input_data = format_input_for_NER(embedding_matrix)
output_data, label_dict = format_output_for_NER(label_matrix)

In [None]:
# Some last checks on the data
# The second-axis sizes are read as globals by baseline_model(): the input
# width of the first layer and the number of output classes.
num_dimensions = input_data.shape[1]
num_classes = output_data.shape[1]

## Split into Train and Test

In [None]:
# NOTE(review): presumably the fraction of samples that goes to the training
# set (0.8 -> 80% train / 20% test) -- confirm against split_data in data_functions.
ratio = 0.8
train_x,train_y,test_x,test_y = split_data(ratio, input_data, output_data)


## Train the classifier on the embeddings

In [None]:
# Define a simple baseline model
from keras.models import Sequential
from keras.layers import Dense
def baseline_model():
    """
    Build and compile a small fully-connected softmax classifier.

    Reads the module-level `num_dimensions` (input width) and `num_classes`
    (number of output units).

    Returns
    -------
    The compiled Keras Sequential model.
    """
    # One hidden ReLU layer of 8 units followed by a softmax over the classes
    layers = [
        Dense(8, input_dim=num_dimensions, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ]
    network = Sequential(layers)

    # Multi-class setup: categorical cross-entropy, Adam, track accuracy
    network.compile(optimizer='adam',
                    loss='categorical_crossentropy',
                    metrics=['accuracy'])
    return network

In [None]:
# Instantiate the compiled baseline classifier
base_model = baseline_model()

In [None]:
# Train the model, setting aside 20% data for validation
# (the validation share is carved out of train_x/train_y via validation_split;
# test_x/test_y are not touched here).
history = base_model.fit(
    train_x,
    train_y,
    epochs=20,
    batch_size=10,
    validation_split=0.2)

# Show which metrics were recorded per epoch
print('\nhistory dict:', history.history.keys())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date

# Gather the per-epoch metrics recorded during fit() into one frame
training_df = pd.DataFrame({
    'Accuracy': history.history['accuracy'],
    "Validation Accuracy": history.history['val_accuracy'],
    'Loss': history.history['loss'],
    "Val Loss": history.history['val_loss'],
})

# The file name encodes today's date and the number of epochs (one row per epoch)
HISTORY_NAME = f"history_{date.today()}_epochs_{training_df.shape[0]}"

# Save the metrics frame for posterity
training_df.to_pickle('../data/' + HISTORY_NAME)

# Plot every metric against the epoch index
sns.set()
fig, ax = plt.subplots(figsize=(15,10))
sns.lineplot(data = training_df, ax = ax)
ax.set_ylabel('Metric')
ax.set_xlabel('Epoch')
plt.show()

In [None]:
# Visualize training history

In [None]:
from keras.wrappers.scikit_learn import KerasClassifier
#estimator = KerasClassifier(build_fn=baseline_model, epochs=20, batch_size=5, verbose=0)

In [None]:
from sklearn.model_selection import KFold

# Use k-fold cross-validation to reduce risk of bias (NOTE: the draft below uses n_splits=2, not 10)
#kfold = KFold(n_splits=2, shuffle=True)

In [None]:
from sklearn.model_selection import cross_val_score

#results = cross_val_score(estimator, train_x, train_y, cv=kfold, verbose=1)
#print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

## Evaluate

In [None]:
# Score the trained classifier on the held-out test split; `results` holds
# whatever base_model.evaluate returns for the compiled loss and metrics.
#print('\n# Evaluate on test data')
results = base_model.evaluate(test_x, test_y)
#print('test loss, test acc:', results)


In [None]:
# Generate predictions (probabilities -- the output of the last layer)
# on new data using `predict`
# Only the first three test rows are scored; the bare `predictions` on the
# last line makes the notebook display the array.
print('\n# Generate predictions for 3 samples')
predictions = base_model.predict(test_x[:3])
print('predictions shape:', predictions.shape)
predictions

## Test predictions

In [None]:
def NER(sentence, classifier):
    """
    Run named-entity recognition on a sentence and print one prediction per row.

    TODO: check how the CLS/SEP tokens line up with the predictions (see the
    note above the loop).

    Parameters
    ----------
    sentence : str
        The text to tag.
    classifier :
        Trained model exposing `predict(...)`; its output rows are treated as
        per-class probabilities.

    Returns
    -------
    tuple
        `(c, tp)`: the classifier output and the first value returned by
        `format_sentences_for_BERT`.
    """
    # Local import: AutoTokenizer is not imported anywhere in the visible notebook.
    from transformers import AutoTokenizer

    #Format input
    # NOTE(review): elsewhere in this notebook format_sentences_for_BERT is
    # given a DataFrame with a 'Sentence' column, here it gets a raw string --
    # confirm the helper accepts both.
    tp, tm = format_sentences_for_BERT(sentence)

    #Split on word-level to create a dict
    # NOTE(review): this is the "-ner" checkpoint while the embedding model is
    # 'KB/bert-base-swedish-cased' -- confirm the two tokenize identically.
    tokenizer = AutoTokenizer.from_pretrained("KB/bert-base-swedish-cased-ner")
    # Tokenize the argument, not the notebook-level variable `s`
    word_list = tokenizer.tokenize(sentence)

    #Encode it
    e = get_embeddings_with_gpu(tp, tm)

    #Format for NER
    n = format_input_for_NER(e)

    #Classify it (use the classifier that was passed in, not the global base_model)
    c = classifier.predict(n)

    #Print the predictions
    # The loop starts at row 0 and also prints four rows past the last word
    # piece. NOTE(review): if the encoded input starts with a [CLS] token, row
    # i is offset by one relative to word_list[i] -- confirm and adjust.
    # Never index past the rows the classifier actually returned.
    num_rows = min(len(word_list) + 4, len(c))
    for i in range(num_rows):

        #Get the predictions for this row
        predictions = c[i]

        #Take the index of the most likely class
        index = np.argmax(predictions)

        #Grab the certainty
        certainty = predictions[index]

        #Check which word this is ("-" for rows beyond the word pieces)
        if i < len(word_list):
            word = word_list[i]
        else:
            word = "-"

        #Check which entity it corresponds to
        print(f"WORD: {word}, ENTITY: {label_dict.get(index)}, PROBABILITY: {certainty}")

    return c, tp
s = "Han var inte den bästa, det var Alexander i Stockholm!!!"
e, tp= NER(s, base_model)

In [None]:
# Inspect the shape of the formatted input returned by NER()
tp.shape