# Preprocessing and tokenization

In this notebook, the test data is preprocessed and tokenized. We first remove any HTML markup that may be present in the biographies. Afterwards, we perform sentence splitting and word tokenization, and add a part-of-speech tag to every token.

## Preprocessing 

In [1]:
#importing libraries 
import pandas as pd 
import nltk
from bs4 import BeautifulSoup
import spacy
from spacy.attrs import ORTH, NORM

In [2]:
# Read the test set from its CSV file into a DataFrame.
TEST_SET_PATH = "test_set_1.csv"
test_set = pd.read_csv(TEST_SET_PATH)

In [3]:
# Pair every biography with its identifier as [bio_part, bio].
# The two columns are zipped directly instead of looping with iterrows(),
# which constructs a Series for every row and is considerably slower.
biography_snippets = [
    [bio_part, bio]
    for bio_part, bio in zip(test_set["bio_part"], test_set["bio"])
]

In [5]:
# Strip the HTML markup from every biography.
# bio_parts holds [identifier, position] pairs; position is the index of the
# corresponding cleaned text in cleaned_snippets.
bio_parts = [
    [bio_part, position]
    for position, (bio_part, _raw_bio) in enumerate(biography_snippets)
]
cleaned_snippets = [
    BeautifulSoup(raw_bio, "lxml").text
    for _bio_part, raw_bio in biography_snippets
]
    

In [7]:
# Persist bio_parts with IPython's %store so it can be restored in another
# notebook/kernel via `%store -r bio_parts`.
%store bio_parts

Stored 'bio_parts' (list)


## Tokenization of biographies 

In [8]:
# Load the Dutch spaCy pipeline and register tokenizer special cases so the
# abbreviations below are kept as single tokens whose norm is the full word.
nlp = spacy.load("nl_core_news_md")
for abbreviation, full_form in (("begr.", "begraven"), ("gest.", "gestorven")):
    nlp.tokenizer.add_special_case(
        abbreviation, [{ORTH: abbreviation, NORM: full_form}]
    )
# NOTE(review): the entity links added by this component are not read in the
# cells visible here, yet it runs on every nlp(...) call (presumably querying a
# DBpedia Spotlight service) - confirm it is still needed.
nlp.add_pipe("dbpedia_spotlight")

<spacy_dbpedia_spotlight.entity_linker.EntityLinker at 0x243605a2b20>

In [9]:
# Split every cleaned biography into sentences and every sentence into tokens.
# test_snippets[k] is one biography: a list of sentences, each of which is a
# list of token strings.
test_snippets = []
for cleaned_text in cleaned_snippets:
    parsed = nlp(cleaned_text)
    sentences = [[token.text for token in sentence] for sentence in parsed.sents]
    test_snippets.append(sentences)

In [10]:
# Add a POS tag to every token.
# test_data[k] is one biography: a list of sentences, each a list of
# (token, tag) tuples.
# pos_tag_sents tags all sentences of a biography in a single call, so NLTK
# obtains its tagger once per biography instead of once per sentence.
# NOTE(review): NLTK's default tagger is English (lang="eng") while the text
# was tokenized with a Dutch spaCy model - confirm this tagset is intended.
pos_tags_per_sentence = []  # unused in the cells shown here; kept in case it is referenced elsewhere
test_data = [nltk.pos_tag_sents(snippet) for snippet in test_snippets]

In [11]:
# Persist the tokenized, POS-tagged biographies with IPython's %store so they
# can be restored in another notebook/kernel via `%store -r test_data`.
%store test_data

Stored 'test_data' (list)
