## Importing Packages

import pandas as pd
import os
import sys
import spacy
import re
from spacy.tokens import Doc
from collections import OrderedDict
import operator
nlp = spacy.load('en_core_web_sm')

## Importing functions from file conll

In [2]:
from conll import evaluate

## Evaluate spaCy NER on CoNLL 2003 data

In [3]:
spacy2conllDict = {'PERSON':'PER', 'NORP':'MISC', 'LOC':'LOC', 'FAC':'LOC', 'GPE':'LOC', 'ORG':'ORG', 'PRODUCT':'MISC', 'EVENT':'MISC', 'WORK_OF_ART':'O', 'LAW':'O', 'LANGUAGE':'MISC', 'DATE':'O', 'TIME':'O', 'PERCENT':'O', 'MONEY':'O', 'QUANTITY':'O', 'ORDINAL':'O', 'CARDINAL':'O', '':'O'}

In [4]:
def concatEntIob(iob, ent):
    if spacy2conllDict[ent] == 'O':
        return 'O'
    else:
        return iob + '-' + spacy2conllDict[ent]


## Reading conll2003 data

In [5]:
class ReadConll2003:

    def __init__(self, path_to_text):
        self.sents = [] 
        self.txts = [] 
        self.jtxts = [] 
        lines = None
        with open(path_to_text) as f:
            lines = f.readlines()

        lines = lines[2:] 
        sent = []
        
        for line in lines:
            if line != "\n":
                l = line.replace('\n', '').split()
                sent.append(tuple(l))
            else:
                self.sents.append(sent)
                sent = []
        
        for sent in self.sents:
            tsent = [list(el) for el in sent]
            el = [s[0] for s in tsent]
            self.txts.append(el)
            self.jtxts.append(" ".join(el))

### 1a) <br> Report token-level performance (per class and total)

    accuracy of correctly recognizing all tokens that belong to named entities (i.e. tag-level accuracy)


In [6]:
def CalAcc(dataSent, spaSent):
    # creating variable
    borgAccuracy,borgCorrect, borgError = 0, 0, 0
    iorgAccuracy, iorgCorrect, iorgError = 0, 0, 0
    blocAccuracy, blocCorrect, blocError = 0, 0, 0
    ilocAccuracy, ilocCorrect, ilocError = 0, 0, 0
    bmiscAccuracy, bmiscCorrect, bmiscError = 0, 0, 0
    imiscAccuracy, imiscCorrect, imiscError = 0, 0, 0
    bperAccuracy, bperCorrect, bperError = 0, 0, 0
    iperAccuracy, iperCorrect, iperError = 0, 0, 0
    oAccuracy, oCorrect, oError = 0, 0, 0
    totAccuracy, totCorrect, totError = 0, 0, 0
    
    for cont in range(len(dataSent)):
        for i, el in enumerate(dataSent[cont]):
            if el[3] == spaSent[cont][i][1]:
                totCorrect += 1
                if el[3] == 'B-ORG':
                    borgCorrect += 1
                elif el[3] == 'I-ORG':
                    iorgCorrect += 1
                elif el[3] == 'B-MISC':
                    bmiscCorrect += 1
                elif el[3] == 'I-MISC':
                    imiscCorrect += 1
                elif el[3] == 'B-PER':
                    bperCorrect += 1
                elif el[3] == 'I-PER':
                    iperCorrect += 1
                elif el[3] == 'B-LOC':
                    blocCorrect += 1
                elif el[3] == 'I-LOC':
                    ilocCorrect += 1
                else:
                    oCorrect += 1
            else:
                totError += 1
                if el[3] == 'B-ORG':
                    borgError += 1
                elif el[3] == 'I-ORG':
                    iorgError += 1
                elif el[3] == 'B-MISC':
                    bmiscError += 1
                elif el[3] == 'I-MISC':
                    imiscError += 1
                elif el[3] == 'B-PER':
                    bperError += 1
                elif el[3] == 'I-PER':
                    iperError += 1
                elif el[3] == 'B-LOC':
                    blocError += 1
                elif el[3] == 'I-LOC':
                    ilocError += 1
                else:
                    oError += 1

    borgAccuracy = borgCorrect/(borgCorrect+borgError)
    iorgAccuracy = iorgCorrect/(iorgCorrect+iorgError)

    blocAccuracy = blocCorrect/(blocCorrect+blocError)
    ilocAccuracy = ilocCorrect/(ilocCorrect+ilocError)

    bmiscAccuracy = bmiscCorrect/(bmiscCorrect+bmiscError)
    imiscAccuracy = imiscCorrect/(imiscCorrect+imiscError)

    bperAccuracy = bperCorrect/(bperCorrect+bperError)
    iperAccuracy = iperCorrect/(iperCorrect+iperError)

    oAccuracy = oCorrect/(oCorrect+oError)

    totAccuracy = totCorrect/(totCorrect+totError)

    accuracy = {
        'B-ORG': {'Accuracy': borgAccuracy}, 'I-ORG': {'Accuracy': iorgAccuracy}, 
        'B-LOC': {'Accuracy': blocAccuracy}, 'I-LOC': {'Accuracy': ilocAccuracy}, 
        'B-MISC': {'Accuracy': bmiscAccuracy}, 'I-MISC': {'Accuracy': imiscAccuracy},
        'B-PER': {'Accuracy': bperAccuracy}, 'I-PER': {'Accuracy': bperAccuracy}, 
        'O': {'Accuracy': oAccuracy}, 'total': {'Accuracy': totAccuracy}
    }

    pd_tblAccuracy = pd.DataFrame().from_dict(accuracy, orient='index')
    pd_tblAccuracy.round(decimals=3)
    return pd_tblAccuracy

### 1b) <br> Report CoNLL chunk-level performance (per class and total)

    precision, recall, f-measure of correctly recognizing all the named entities in a chunk per class and total

In [7]:
def conllEvaluation(conll2003, docs):
    
    refs = [[(text, iob) for text, x, y, iob in sent] for sent in conll2003.sents]
    hyps = [[(text, iob) for text, iob in sent] for sent in docs]
    results = evaluate(refs, hyps)

    pd_tbl = pd.DataFrame().from_dict(results, orient='index')
    pd_tbl.round(decimals=3)
    return pd_tbl

In [8]:
def tokenRectification(sent):
    # Correction based on whitespace
    new_sent = []
    i = 0
    while i < len(sent):
        
        el = [sent[i][0], sent[i][1]]
        while sent[i][2] == '':
            i += 1
            if i == len(sent):
                break
            el[0] += sent[i][0]
        new_sent.append(tuple(el))
        i += 1
    return new_sent


In [9]:
def spacy2conll(docs):
    hyps = []
    for sent in docs:
        mod_sent = []
        for t in sent:
            mod_sent.append((t.text, concatEntIob(t.ent_iob_, t.ent_type_), t.whitespace_))

        c_sent = tokenRectification(mod_sent)
        hyps.append(c_sent)
    return hyps

### 2) <br> Grouping of Entities. Write a function to group recognized named entities using noun_chunks method of spaCy. Analyze the groups in terms of most frequent combinations (i.e. NER types that go together)

In [10]:
def groupEntities(docs):

    all_ents = []
    for doc in docs:
        doc_ents = []
        for chunk in doc.noun_chunks:
            chunk_ents = []
            for cent in chunk.ents:
                for ent in doc.ents:
                    if cent == ent:
                        chunk_ents.append(ent)
            doc_ents.append(chunk_ents)

        for ed in doc.ents:
            ex = True
            for a in doc_ents:
                for b in a:
                    if ed == b:
                        ex = False  
            if ex:
                doc_ents.append([ed])   

        ents = []
        for ed in doc_ents:
            chunk = []
            for a in ed:
                chunk.append(a.label_)
            ents.append(chunk)
        all_ents.append(ents)
    return all_ents

In [11]:
def statFreq(docs, n):
    stats = {}
    g_ents = groupEntities(docs)
    for sent in g_ents:
        for l in sent:
            if tuple(l) in stats:
                stats[tuple(l)] += 1
            else:
                stats[tuple(l)] = 1
    stats.pop(())
    sort = sorted(stats.items(),key=operator.itemgetter(1),reverse=True)[0:n]
    dict_for_table = {}
    for el in sort:
        k = ""
        for s in el[0]:
            k = k+s+"_"
        dict_for_table[k] = {'NoT': el[1]}


    pd_tbl_freq = pd.DataFrame().from_dict(dict_for_table, orient='index')
    pd_tbl_freq = pd_tbl_freq.sort_values(by=['NoT'], ascending=False)
    pd_tbl_freq.round(decimals=3)
    return pd_tbl_freq

### 3) <br>One of the possible post-processing steps is to fix segmentation errors. Write a function that extends the entity span to cover the full noun-compounds. Make use of compound dependency relation.

In [12]:
def compoundRectification(docs):
    hyps = []
    for sent in docs:
        mod_sent = []
        for t in sent:
            if t.dep_ != 'compound' or t.ent_type_ != '':
                mod_sent.append((t.text, concatEntIob(t.ent_iob_, t.ent_type_), t.whitespace_))
            else:
                if (t.head.i < t.i):
                    mod_sent.append((t.text, concatEntIob('I', t.head.ent_type_), t.whitespace_))
                else:
                    mod_sent.append((t.text, concatEntIob('B', t.head.ent_type_), t.whitespace_))
            
        c_sent = tokenRectification(mod_sent)
        hyps.append(c_sent)
    return hyps

# Evaluation

In [13]:
PATH = 'data/test.txt'

conll2003 = ReadConll2003(PATH)
docs = [nlp(sent) for sent in conll2003.jtxts]
docs_corrected = spacy2conll(docs)

### 1a) <br> Report token-level performance (per class and total)

    accuracy of correctly recognizing all tokens that belong to named entities (i.e. tag-level accuracy)

In [14]:
# First Request - Part 1
accuracy = CalAcc(conll2003.sents, docs_corrected)
print('Token level accuracy: \n', accuracy)

Token level accuracy: 
         Accuracy
B-LOC   0.704436
B-MISC  0.588319
B-ORG   0.326911
B-PER   0.603587
I-LOC   0.595331
I-MISC  0.425926
I-ORG   0.548503
I-PER   0.603587
O       0.975436
total   0.906397


### 1b) <br> Report CoNLL chunk-level performance (per class and total);

    precision, recall, f-measure of correctly recognizing all the named entities in a chunk per class and total

In [15]:
# First Request - Part 2
chunk_table = conllEvaluation(conll2003, docs_corrected)
print('Chunk level accuracy: \n', chunk_table)

Chunk level accuracy: 
               p         r         f     s
ORG    0.445386  0.284768  0.347411  1661
PER    0.688906  0.568336  0.622840  1617
MISC   0.756144  0.569801  0.649878   702
LOC    0.777554  0.693645  0.733207  1668
total  0.668253  0.522132  0.586224  5648


### 2) <br> Grouping of Entities. Write a function to group recognized named entities using noun_chunks method of spaCy. Analyze the groups in terms of most frequent combinations (i.e. NER types that go together)

In [16]:
N = 20
best_n = statFreq(docs, N)
print('Best', N, ': \n', best_n)

Best 20 : 
                    NoT
CARDINAL_         1606
GPE_              1281
DATE_             1164
PERSON_           1157
ORG_               958
NORP_              303
MONEY_             151
ORDINAL_           112
TIME_               99
PERCENT_            76
QUANTITY_           76
EVENT_              68
LOC_                50
NORP_PERSON_        45
CARDINAL_PERSON_    31
GPE_PERSON_         29
ORG_PERSON_         28
FAC_                26
PRODUCT_            25
CARDINAL_NORP_      14


### 3) <br>One of the possible post-processing steps is to fix segmentation errors. Write a function that extends the entity span to cover the full noun-compounds. Make use of compound dependency relation.

In [17]:
# Third request
docs_compound_corr = compoundRectification(docs)

new_accuracy = CalAcc(conll2003.sents, docs_compound_corr)
print('New Token level accuracy: \n', new_accuracy)

New Token level accuracy: 
         Accuracy
B-LOC   0.704436
B-MISC  0.589744
B-ORG   0.328116
B-PER   0.615337
I-LOC   0.595331
I-MISC  0.425926
I-ORG   0.548503
I-PER   0.615337
O       0.969574
total   0.902025


In [18]:
new_chunk_table = conllEvaluation(conll2003, docs_compound_corr)
print('New chunk level accuracy: \n', new_chunk_table)

New chunk level accuracy: 
               p         r         f     s
ORG    0.431569  0.284768  0.343127  1661
PER    0.597529  0.568336  0.582567  1617
MISC   0.750469  0.569801  0.647773   702
LOC    0.763193  0.693645  0.726759  1668
total  0.629725  0.522132  0.570903  5648


# <center> Thank You 