# DATA PREPARATION
We need to format the suc3.0 data, which exist in an xml tree, in such a way that ALBERT can accept it.

Because of my familiarity with dataframes, I'll begin by converting the xml tree into a pandas dataframe


Check this link again 
https://www.vamvas.ch/bert-for-ner/

In [1]:
import pandas as pd
import seaborn as sns
import xml.etree.ElementTree as et 
import matplotlib.pyplot as plt
import numpy as np

### Setup

In [2]:
# The attributes at each level of the corpus

TEXT_attributes = {
 'text_blingbring': '|',
 'text_id': '|',
 'text_lix': '|',
 'text_nk': '|',
 'text_ovix': '|',
 'text_swefn': '|',
 'text_index': 0}

SENTENCE_attributes = {
 'sentence__geocontext': '|',
 'sentence_id': '|',
 'sentence_index': 0}

WORD_attributes = {
 'word_blingbring': '|',
 'word_complemgram': '|',
 'word_compwf': '|',
 'word_dephead': '|',
 'word_deprel': '|',
 'word_lemma': '|:|',
 'word_lex': '|',
 'word_msd': '|',
 'word_pos': '|',
 'word_prefix': '|',
 'word_ref': '|',
 'word_sense': '|',
 'word_suffix': '|',
 'word_swefn': '|',
 'word_ex': '|',
 'word_name': '|',
 'word_subtype': '|',
 'word_type': '|',
 'word_index': 0}

# A dict containing all attributes associated with a single word, increading higher-level ones
WORD_LEVEL_MASTER_DICT = {}
WORD_LEVEL_MASTER_DICT.update(TEXT_attributes)
WORD_LEVEL_MASTER_DICT.update(SENTENCE_attributes)
WORD_LEVEL_MASTER_DICT.update(WORD_attributes)

In [3]:
"""
Some tags/dictionaries have a conflicting namespace for the attributes
This function takes the attributes and adds a prefix to the keynames, to make sure nothing overwrites anything else
Returns the same dict but with updated key names
"""
def rename_attributes(dictionary, prefix):
    new_dict = {}
    for key in dictionary.attrib:
        value = dictionary.get(key)
        new_attrib_name = prefix + "_" + key
        new_dict.update({new_attrib_name:value})
        
    return new_dict

### Convert XML to Dataframe

In [4]:
#Create a tree
xtree = et.parse("../data/suc3.xml")

#Get its root element
suc3 = xtree.getroot()

#Get all children - the texts that we will extract words from
texts = suc3.getchildren()

text_index = 0
sentence_index = 0
word_index = 0


#Create a list of datapoints. Each datapoint will be a word with all the attributes of the word itself, the attributes of the sentence it is part of, and the attributes of the text it is part of.
datapoints = []

# Loop over the texts in the corpus...
for text in texts:
    #Assign an index based on the order it appears in the corpus
    text_index = text_index + 1
    
    # Create a dict of attributes that won't cause a namespace conflict
    text_attributes = rename_attributes(text, 'text')
    
    #Add the index as an attribute
    text_attributes.update({"text_index": text_index})
    
    #Get all the sentences that make up this text
    sentences = text.getchildren()  
    
    for sentence in sentences:
        sentence_index = sentence_index + 1

        sentence_attributes = rename_attributes(sentence, 'sentence')
        sentence_attributes.update({"sentence_index": sentence_index})
   
        words = sentence.getchildren()
        
        for word in words:
            word_index = word_index + 1
            
            
            #For words, there are multiple kinds of attributes depending on the kind of word.
            # WORD_attributes is simply a list of all these attributes joined into one.
            word_attributes = WORD_attributes.copy()
            word_attributes.update({"word_index": word_index})
            
            attributes = rename_attributes(word, 'word')
            
            if word.tag == 'ne':
                attributes.update({"word_tag": "ne"})
            else:
                attributes.update({"word_tag": "w"})
                
                #If it's not a named entity, the text is not stored as an attribute but actual text. We treat it as an attribute for coherence
                attributes.update({"word_name": word.text})
                
            word_attributes.update(attributes)
            
            #Copy the master dict and fill it in with all the information we've extracted from the three loops
            WORD_LEVEL_DICT = WORD_LEVEL_MASTER_DICT.copy()
            
            WORD_LEVEL_DICT.update(text_attributes)
            WORD_LEVEL_DICT.update(sentence_attributes)
            WORD_LEVEL_DICT.update(word_attributes)
            
            datapoints.append(WORD_LEVEL_DICT)
                
print("Done")

  


Done


In [5]:
# Having extracted all words and their associated information, let's convert it into a dataframe
df = pd.DataFrame(datapoints)

#Blingbring looks less interesting
df = df.drop(columns={'text_blingbring'})

#Replace the | with None
df = df.replace({'|': np.nan})
df.head(1)

Unnamed: 0,text_id,text_lix,text_nk,text_ovix,text_swefn,text_index,sentence__geocontext,sentence_id,sentence_index,word_blingbring,...,word_swefn,word_ex,word_name,word_subtype,word_type,word_index,word_tag,word_sentiment,word_sentimentclass,word__overlap
0,aa01c,50.84,1.58,76.88,|Abandonment:95.654|Destroying:87.097|Relation...,1,,e24e30c0-e24d3ca9,1,,...,,,I,,,1,w,,,


In [6]:
# Pickle the data
df.to_pickle('../data/suc3_dataframe')

## Extract only the parts relevant for NER with ALBERT

In [7]:
#Grab the categories the example use
data = df[['sentence_index', 'word_name', 'word_pos', 'word_type', 'word_subtype']]
data.head(5)

Unnamed: 0,sentence_index,word_name,word_pos,word_type,word_subtype
0,1,I,PP,,
1,1,sin,PS,,
2,1,första,RO,,
3,1,reaktion,NN,,
4,1,på,PP,,


## Replace the Nan type of non-entities with O

In [8]:
# The example uses O instaed of Nan, so we follow them
data[['word_type']] = data[['word_type']].replace(np.nan,'O')
data.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,sentence_index,word_name,word_pos,word_type,word_subtype
0,1,I,PP,O,
1,1,sin,PS,O,
2,1,första,RO,O,
3,1,reaktion,NN,O,
4,1,på,PP,O,


## Checklist to make the data compatible
* Get sentences
* Tokenize the sentences
* Put wrongly split tokens together.
* Transform the tokens to id:s
* Define the length L of the input sequence. The longer the sentence the more the context, but also the more processing power needed
* Pad sentences shorter than L with 0
* Crop sentences longer than L
* Tell Albert to ignore padded info (attention_mask = np.where(padded != 0, 1, 0))
* Convert to tensors

## FORMAT WORDS/TOKENS
* Extract all sentences
* Tokenize each sentence
* Convert tokens to IDs
* Ensure each token sequence has the same length (padding, trunctuating)

As a note, Albert and Bert uses a wordpiece tokenizer. This approach has many advantages, though some problems may occur.

For example, "Internet Explorer" is a single, named entity. Yet the tokenizer will split it into "Internet" and Explorer".

For example, a name like Vladmir might get separated into V##, lad##, mir.

To deal with that one can use BIO tagging.

For each token if it is the B(eginning) of a named entity tag it so, if it is an I(ntermediate) part of a named entity tag it so, if it isn't an entity (Outside), tag it so.



In [9]:
from transformers import AutoTokenizer
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [10]:
#The Bert tokenizer often splits words into muliple tokens however, the subparts which starts with ## as seen above.
#So we need an additional function to restich the split apart tokens

def tokenizer_wrapper(tokenized_text):
    corrected_output = []
    for token in tokenized_text:
        if token.startswith('##'):
            corrected_output[-1] += token[2:]
        else:
            corrected_output.append(token)
    return corrected_output
            
#tokenizer_wrapper(tokenized)

In [11]:
"""
A class for interacting on a sentence level with the pandas dataframe
"""
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["word_name"].values.tolist(),
                                                           s["word_pos"].values.tolist(),
                                                           s["word_type"].values.tolist())]
        self.grouped = self.data.groupby("sentence_index").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None
        

In [12]:
#Make use of the pre-trained tokenizer from Huggingface
tokenizer = AutoTokenizer.from_pretrained("KB/bert-base-swedish-cased-ner")

In [13]:
#Get all sentences
getter = SentenceGetter(data)
sentences = [" ".join([s[0] for s in sent]) for sent in getter.sentences]
#sentences[0]

In [14]:
#Tokenize each sentence
tokenized_sentences = []
for sentence in sentences:
    tokenized_sentences.append(tokenizer.tokenize(sentence))
#print(tokenized_sentences[0])

In [15]:
#Convert all tokens into ids
id_sentences = []
for tokenized_sentence in tokenized_sentences:
    id_sentences.append(tokenizer.convert_tokens_to_ids(tokenized_sentence))
print(id_sentences[0])


[135, 243, 578, 10540, 68, 3380, 7245, 49796, 27689, 28413, 26922, 49796, 3206, 121, 3393, 4634, 49796, 2901, 6697, 116, 48, 98, 346, 31843, 24926, 671, 4958, 237, 541, 66, 9926, 21667, 36, 16370, 15191, 42, 696, 98, 7]


In [16]:
#Set all sentences to be of the same length by padding and trucating
MAXLEN = 50
same_size_sequences = pad_sequences(id_sentences, maxlen=MAXLEN, dtype="long", truncating="post", padding="post")
print(same_size_sequences[0])

[  135   243   578 10540    68  3380  7245 49796 27689 28413 26922 49796
  3206   121  3393  4634 49796  2901  6697   116    48    98   346 31843
 24926   671  4958   237   541    66  9926 21667    36 16370 15191    42
   696    98     7     0     0     0     0     0     0     0     0     0
     0     0]


### CREATE Labels

In [17]:
# Add labels for each token in each sentence
labels = [[s[2] for s in sent] for sent in getter.sentences]

tags_vals = list(set(data["word_type"].values))
tag2idx = {t: i for i, t in enumerate(tags_vals)}

#We have a corresponding list of tags, and do the same thing, converting names to numbers
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAXLEN, value=tag2idx["O"], padding="post",
                     dtype="long", truncating="post")
#print(labels[0])
#print(tags[0])

### CREATE ATTENTION MASKS
Attention masks will tell BERT how important each token in the sentence is.
We put a value of 0 if the token is just padding so that the model ignores it.

In [18]:
#Create attention masks for each sentence

# Attention should be 0 if the token is just padding
sentence_masks = []
for sequence in same_size_sequences:
    sentence_mask = []
    for token_id in sequence:
        sentence_mask.append(float(token_id>0))
    sentence_masks.append(sentence_mask)
#print(sentence_masks[0])

## Sanity checks

In [19]:
#Sanity checks
s = sentence_to_check = 4

print("Sentence")
print(sentences[s])
print()
print("Entities")
print(labels[s])
print()
print("Padded Token ID")
print(same_size_sequences[s])
print()
print("Padded Entity ID")
print(tags[s])
print()
print("Attention Mask")
print(sentence_masks[s])
print()

Sentence
Hur är det då i Mellanöstern ?

Entities
['O', 'O', 'O', 'O', 'O', 'LOC', 'O']

Padded Token ID
[ 1504    54    82   327    31 15894   302     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0]

Padded Entity ID
[ 7  7  7  7  7 11  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7
  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7
  7  7]

Attention Mask
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]



In [20]:
#For convenience, let's pickle this for later use

cols=["Sentence", "Token_ID", "Entities", "Entity_ID", "Attention_Mask"]

data_matrix = []

for i in range(0, len(sentences)):
    a=sentences[i]
    b=same_size_sequences[i]
    c=labels[i]
    d=tags[i]
    e=sentence_masks[i]
    
    row = [a, b, c, d, e]
    data_matrix.append(row)
    

df = pd.DataFrame(data_matrix, columns=cols)

df.to_pickle('../data/suc3_formatted')
df

Unnamed: 0,Sentence,Token_ID,Entities,Entity_ID,Attention_Mask
0,I sin första reaktion på Sovjetledarens varnin...,"[135, 243, 578, 10540, 68, 3380, 7245, 49796, ...","[O, O, O, O, O, O, O, O, LOC, O, PRS, O, O, O,...","[7, 7, 7, 7, 7, 7, 7, 7, 11, 7, 12, 7, 7, 7, 7...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
1,I en ruta talar en kort rad på ryska om att de...,"[135, 59, 17275, 2548, 59, 1337, 1207, 68, 370...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
2,"- Dels har vi inget index att gå efter , vi kr...","[52, 9077, 108, 186, 1696, 9273, 48, 690, 275,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, TME...","[7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 4, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
3,"- Men en deporterad blir aldrig fri , säger Ri...","[52, 299, 59, 41611, 103, 444, 1024, 729, 19, ...","[O, O, O, O, O, O, O, O, O, PRS, O, O, O, O, O...","[7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 7, 7, 7, 7, 7,...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
4,Hur är det då i Mellanöstern ?,"[1504, 54, 82, 327, 31, 15894, 302, 0, 0, 0, 0...","[O, O, O, O, O, LOC, O]","[7, 7, 7, 7, 7, 11, 7, 7, 7, 7, 7, 7, 7, 7, 7,...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, ..."
...,...,...,...,...,...
74240,Jag var ju ändå hans favoritsysslingsvågerbarn .,"[361, 96, 499, 1532, 699, 4349, 10890, 19427, ...","[O, O, O, O, O, O, O]","[7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
74241,"Du hör ju inte ens till släkten , du är ju ing...","[631, 1009, 499, 127, 2096, 76, 17549, 19, 356...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
74242,""" Det är en mycket fin slips .","[98, 160, 54, 59, 408, 577, 26875, 7, 0, 0, 0,...","[O, O, O, O, O, O, O, O]","[7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, ..."
74243,Det är klart att vi ska ärva .,"[160, 54, 1798, 48, 186, 326, 45460, 7, 0, 0, ...","[O, O, O, O, O, O, O, O]","[7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, ..."


## Actually, it's probably easier to just deal with sentences and labels.
Figures

In [21]:
cols=["Sentence", "Labels"]

data_matrix = []

for i in range(0, len(sentences)):
    a=sentences[i]
    b=labels[i]
    
    row = [a,b]
    data_matrix.append(row)
    
df = pd.DataFrame(data_matrix, columns=cols)

df.to_pickle('../data/sentence_labels')


In [22]:
# Add labels for each token in each sentence
labels = [[s[2] for s in sent] for sent in getter.sentences]

tags_vals = list(set(data["word_type"].values))
tag2idx = {t: i for i, t in enumerate(tags_vals)}

#We have a corresponding list of tags, and do the same thing, converting names to numbers
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAXLEN, value=tag2idx["O"], padding="post",
                     dtype="long", truncating="post")
print(labels[0])
print(tags[0])

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'PRS', 'O', 'O', 'O', 'O', 'PRS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
[ 7  7  7  7  7  7  7  7 11  7 12  7  7  7  7 12  7  7  7  7  7  7  7  7
  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7  7
  7  7]


In [23]:
# Take the named entities.
# Take their indices and keep in a list UNVISITED
# For each word in the tokenized sequence

# Are we at an index where an NE should be?
# If NO, does the word end with ## (indication that it's been split up)
# if 
# If it does not end with ##, continue as per usual.
# If it does end with ##,check if it is part of a named entity.
# If it is NOT part of a NE, increase the index values for all UNVISITED since a new token has appeard before their original position
# Continue
# If it IS a part of a NE, join the parts to a single, named 

label_df = pd.DataFrame(columns={"Named_Entity", "Original_Pos", "New_Pos"})
label_df

Unnamed: 0,Original_Pos,Named_Entity,New_Pos


In [24]:
def tokenizer_wrapper(tokenized_text):
    corrected_output = []
    for token in tokenized_text:
        if token.startswith('##'):
            corrected_output[-1] += token[2:]
        else:
            corrected_output.append(token)
    return corrected_output
            
#tokenizer_wrapper(tokenized)



In [26]:
label_df

Unnamed: 0,Original_Pos,Named_Entity,New_Pos


In [None]:
UNVISITED + np.ones(len(UNVISITED))

In [None]:
sentence_list = [([s[0] for s in sent]) for sent in getter.sentences]

s = sentence_list[0]
l = labels[0]
t = tokenizer.tokenize(sentences[0])

print(s)
print()
print(l)
print()
print(t)

#TODO SovjetLedaren är inte en NE, men när den bryts upp till Sovjet och Ledaren är Sovjet en NE
ne_indices = []
nes = []
for i, label in enumerate(l):
    if label != 'O':
        ne_index = i
        ne = s[i]
        ne_indices.append(ne_index)
        nes.append(ne)

print(ne_indices)
print(nes)



UNVISITED = ne_indices
UNVISITED.reverse()
ne_index = UNVISITED[0]
for i, token in enumerate(t):
    
    #If we have not reached the named entity
    if i < ne_index:
        #Check if the current word has been split up
        if token.startswith('##'):
            #If so, increase the values of all unvisited NE:s, since the indices will have been shifted
            continue

