# Experiment 1

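This notebook contains the code for experiment 1: a CRF named-entity tagger is trained on the CoNLL-2002 Dutch training set (`ned.train`) using only the current word and its neighbouring words as features, and is then applied to the stored `test_data` biographies.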

## Loading libraries/packages and the training and test data 

In [1]:
import ssl
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual

import numpy as np
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
ssl._create_default_https_context = ssl._create_unverified_context
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import pos_tag
from itertools import chain

import nltk
import sklearn 
import scipy.stats
import spacy_dbpedia_spotlight


In [3]:
# Restore the `test_data` variable previously persisted with IPython's `%store` magic.
%store -r test_data

In [4]:
# Materialise the IOB-annotated sentences of the 'ned.train' file from the
# NLTK CoNLL-2002 corpus reader into a plain list.
train_sents = [sent for sent in nltk.corpus.conll2002.iob_sents('ned.train')]

## Feature engineering

Credits: https://github.com/TeamHG-Memex/sklearn-crfsuite/blob/master/docs/CoNLL2002.ipynb

In [5]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    Only surface forms are used: the token itself under ``'word'`` and, where
    they exist inside the sentence, the tokens one and two positions to the
    left (``'-1:word'``, ``'-2:word'``) and to the right (``'+1:word'``,
    ``'+2:word'``).

    ``sent`` is a sequence of items whose first element is the token string.
    """
    features = {'word': sent[i][0]}
    last_index = len(sent) - 1

    # Offsets are visited in the order -1, -2, +1, +2 so the keys are inserted
    # in the same order as a hand-written chain of ``if`` blocks would.
    for offset in (-1, -2, 1, 2):
        neighbour = i + offset
        if 0 <= neighbour <= last_index:
            features['%+d:word' % offset] = sent[neighbour][0]

    return features


def sent2features(sent):
    """Return one ``word2features`` dict per position of ``sent``, in order."""
    feature_dicts = []
    for position in range(len(sent)):
        feature_dicts.append(word2features(sent, position))
    return feature_dicts

def sent2labels(sent):
    """Return the label (third item) of every ``(token, postag, label)`` triple in ``sent``."""
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the token (first item) of every ``(token, postag, label)`` triple in ``sent``."""
    tokens = []
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens

### Training

In [6]:
# One list of feature dicts and one list of labels per training sentence.
training_features = list(map(sent2features, train_sents))
training_labels = list(map(sent2labels, train_sents))

In [7]:
import sklearn_crfsuite
from sklearn_crfsuite import CRF
from sklearn.metrics import classification_report

# CRF tagger configuration: 'l2sgd' training algorithm, at most 100
# iterations, and `all_possible_transitions` switched off.
crf = CRF(
    algorithm='l2sgd',
    max_iterations=100,
    all_possible_transitions=False,
)


In [8]:
# Train the CRF on the per-sentence feature dicts and their label sequences.
crf.fit(training_features, training_labels)


### Feature extraction for test data

In [9]:
# For every snippet in the test data, turn each of its sentences into a list
# of feature dicts (same nesting as `test_data`: snippet -> sentence -> token).
test_inputs = [
    [sent2features(s) for s in snippet]
    for snippet in test_data
]

## Model predictions

In [10]:
from sklearn.metrics import classification_report
# Predict label sequences for each biography's sentences.
pred_outputs = [crf.predict(biography) for biography in test_inputs]

## Formatting predictions

In [11]:
# Flatten snippet -> sentence -> token predictions into a single list,
# preserving the original order.
list_of_pred = [
    token
    for snippet in pred_outputs
    for sent in snippet
    for token in sent
]

In [12]:
# Build an independent, mutable copy of the test data in which every token is
# a list (snippet -> sentence -> token). A plain `pred_test_set = test_data`
# would only create a second name for the same object, so any later in-place
# change to `pred_test_set` would silently modify `test_data` as well.
pred_test_set = [
    [[list(ele) for ele in sent] for sent in snippet]
    for snippet in test_data
]

In [13]:
# Append the flattened predictions to the tokens of `pred_test_set`, walking
# it in snippet -> sentence -> token order (the order in which `list_of_pred`
# was flattened).
# NOTE: tokens are extended in place, so running this cell twice appends a
# second prediction to every token.

# Guard against a silent misalignment between tokens and predictions.
token_total = sum(len(sent) for snippet in pred_test_set for sent in snippet)
if token_total != len(list_of_pred):
    raise ValueError(
        "Number of tokens (%d) does not match number of predictions (%d)"
        % (token_total, len(list_of_pred))
    )

k = 0  # position of the next unused prediction in `list_of_pred`
for snippet in pred_test_set:
    for sent in snippet:
        for token in sent:
            token.append(list_of_pred[k])
            k += 1
          

## Visualizing predicted results 

In [14]:
def visualize_biography(text):
    """Render the entity tags of one biography inline with displaCy.

    Parameters
    ----------
    text : iterable of sentences
        Each sentence is a sequence of three-item tokens. The first item is
        used as the word and the third as the entity label; the middle item
        is ignored.

    Every token with a non-empty label becomes a one-token span. Only the
    labels listed in the display options are highlighted. Empty sentences are
    skipped, because they contain nothing to render and cannot be unzipped
    into words and labels.
    """
    import spacy
    from spacy.tokens import Doc, Span
    from spacy import displacy

    # Loading the spaCy pipeline is expensive and only its vocab is needed, so
    # load it once and reuse it across calls instead of reloading per biography.
    vocab = getattr(visualize_biography, "_vocab", None)
    if vocab is None:
        vocab = spacy.load("en_core_web_sm").vocab
        visualize_biography._vocab = vocab

    colors = {"B-PER": "#ADD8E6","I-PER": "#ADD8E6", "B-LOC": "#FFD580", "I-LOC": "#FFD580", "B-ORG":"#90EE90","I-ORG":"#90EE90", "B-MISC":"#D8BFD8","I-MISC":"#D8BFD8"}

    docs = []

    for sentence in text:
        if not sentence:
            # zip(*[]) yields nothing to unpack into (words, entities).
            continue
        without_pos = [[a, c] for [a, b, c] in sentence]
        words, entities = zip(*without_pos)
        doc = Doc(vocab, words=words)
        spans = [Span(doc, i, i + 1, label=ent_type) for i, ent_type in enumerate(entities) if ent_type]
        doc.ents = spans
        docs.append(doc)

    options = {"ents": ["B-PER", "I-PER","B-ORG","I-ORG", "B-LOC","I-LOC","B-MISC","I-MISC"], "colors": colors}
    displacy.render(docs, style="ent", options=options, jupyter=True)
  


In [15]:
def count_tag(tag,test_set):
    """Count the tokens in ``test_set`` whose third item equals ``tag``.

    ``test_set`` is nested as biography -> sentence -> token, and each token
    is indexable with the tag to compare stored at index 2.
    """
    return sum(
        1
        for bio in test_set
        for sentence in bio
        for token in sentence
        if token[2] == tag
    )

In [16]:
# Print how often each entity tag was predicted, one "<tag> <count>" line per tag.
for tag in ('B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG', 'B-MISC', 'I-MISC'):
    print(tag, count_tag(tag, pred_test_set))

B-PER 347
I-PER 316
B-LOC 261
I-LOC 17
B-ORG 10
I-ORG 12
B-MISC 55
I-MISC 50


In [17]:
# Show every biography with its predicted entities, preceded by its index.
for bio_number, bio in enumerate(pred_test_set):
    print("Bio: ", bio_number)
    visualize_biography(bio)

Bio:  0


Bio:  1


Bio:  2


Bio:  3


Bio:  4


Bio:  5


Bio:  6


Bio:  7


Bio:  8


Bio:  9


Bio:  10


Bio:  11


Bio:  12


Bio:  13


Bio:  14


Bio:  15


Bio:  16


Bio:  17


Bio:  18


Bio:  19


Bio:  20


Bio:  21


Bio:  22


Bio:  23


Bio:  24


Bio:  25


Bio:  26


Bio:  27


Bio:  28


Bio:  29


Bio:  30


Bio:  31


Bio:  32


Bio:  33


Bio:  34


Bio:  35


Bio:  36


Bio:  37


Bio:  38


Bio:  39


Bio:  40


Bio:  41


Bio:  42


Bio:  43


Bio:  44


Bio:  45


Bio:  46


Bio:  47


Bio:  48


Bio:  49
