# Relationship network extraction

### References:

- https://archive.org/stream/OneHundredYearsOfSolitude_201710/One_Hundred_Years_of_Solitude_djvu.txt
- https://spacy.io/usage/spacy-101
- https://spacy.io/api/annotation#named-entities

In [1]:
import pandas as pd
import re
import spacy

In [2]:
#!python -m spacy download en_core_web_sm

In [356]:
def get_chapters(book: str) -> [str]:
    return [c.strip() for c in re.split('Chapter \d', book)][1:]

def get_pages(book):
    pages = []
    last_page_end = 0
    for p in re.finditer('\n(\d+) \\n?', book):
        pages.append((book[last_page_end:p.end()], {
        'start': p.start(), 
        'end': p.end(),
        'page': int(p.groups()[0])
        }))
        last_page_end = p.end()
    return pages


## Load book

In [4]:
book_path = '../data/one_hundred_years_of_solitude_EN.txt'

with open(book_path, 'r') as f:
    book = f.read()

In [353]:
chapters = get_chapters(book)

In [354]:
len(book), len(chapters)

(821643, 20)

In [355]:
pages = get_pages(book)

## Parsing chapters and pages

In [370]:
from spacy.tokens import Doc, Span, Token
from spacy.matcher import PhraseMatcher, Matcher

In [349]:
Token.set_extension("page", default=None, force=True)
Doc.set_extension("page", default=None, force=True)

In [422]:
nlp = spacy.load("en_core_web_sm")

In [423]:
docs = []

for doc, context in nlp.pipe(pages, as_tuples=True):
    doc._.page = context["page"]
    for t in doc:
        t._.page = context["page"]
    
    docs.append(doc)

In [426]:
docs[0]._.page, docs[1]._.page

(9, 10)

In [427]:
book_doc = nlp(book)

In [434]:
print (f'Nº chapters: {len(chapters)}')
print (f'Nº pages: {len(pages)}')
print (f'Max pages: {max(pages, key=lambda p: p[1]["page"])[1]["page"]}')
print(f'Nº sentences: {len(list(book_doc.sents))}')
print(f'Vocabulary size: {book_doc.vocab.length}')
print(f'Nº tokens: {len(book_doc)}')
print(f'Nº characters: {len(book)}')

Nº chapters: 20
Nº pages: 192
Max pages: 201
Nº sentences: 6687
Vocabulary size: 10006
Nº tokens: 171099
Nº characters: 821643


## Token identification

In [435]:
idx = 171094
book_doc[idx], book_doc[idx].orth, book_doc[idx].idx

(THE, 6398231955146299758, 821629)

In [12]:
idx = 171095
book_doc[idx], book_doc[idx].orth, book_doc[idx].idx

(END, 14349458037152488889, 821633)

In [13]:
book_doc.vocab.strings[14349458037152488889]

'END'

In [14]:
book_doc.vocab.strings['Aureliano']

3228277545756960083

In [15]:
found = 0
for i, t in enumerate(book_doc):
    if t.text == 'Aureliano':
        print(t.orth, i)
        found += 1
    if found == 2:
        break

3228277545756960083 14
3228277545756960083 2168


In [16]:
book_doc[14], book_doc[2168]

(Aureliano, Aureliano)

In [17]:
book_doc[14].i, book_doc[2168].i

(14, 2168)

### Observations

- use attribute `i` to reference a token in the document
- use attribute `orth` to get the token id in the vocabulary

## Entities

### Most mentioned characters

In [439]:
entities_lst = []

for doc in docs:
    for ent in doc.ents:
        e = {'entity': ent, 'text': ent.text, 'label': ent.label_, 'pos': ent.start }
        entities_lst.append(e)

In [440]:
entities_df = pd.DataFrame(entities_lst)

In [441]:
entities_df.shape

(6507, 4)

In [442]:
import unidecode

def process_text(text):
    text = text.strip()
    text = text.lower()
    text = unidecode.unidecode(text)
    return text

In [443]:
entities_df['text_clean'] = entities_df.text.apply(lambda t: process_text(t))

In [444]:
entities_df.label.unique()

array(['LAW', 'PERSON', 'TIME', 'ORG', 'DATE', 'ORDINAL', 'GPE',
       'CARDINAL', 'LOC', 'NORP', 'WORK_OF_ART', 'FAC', 'EVENT', 'MONEY',
       'LANGUAGE', 'PRODUCT', 'QUANTITY'], dtype=object)

In [445]:
person_df = entities_df[entities_df.label == 'PERSON'].groupby('text_clean').size().reset_index(name='total').sort_values('total', ascending=False)

In [446]:
person_df.shape[0], "characters"

(224, 'characters')

In [447]:
person_df.head(20)

Unnamed: 0,text_clean,total
212,ursula,317
27,aureliano,244
38,aureliano segundo,142
107,jose arcadio,136
32,aureliano buendia,132
178,rebeca,114
111,jose arcadio buendia,98
139,melquiades,75
6,amaranta ursula,71
114,jose arcadio segundo,69


In [448]:
locations_df = entities_df[entities_df.label == 'LOC'].groupby('text_clean').size().reset_index(name='total').sort_values('total', ascending=False)

In [449]:
locations_df.shape[0], 'locations'

(31, 'locations')

In [450]:
locations_df.head()

Unnamed: 0,text_clean,total
8,earth,21
5,caribbean,6
1,arcadio,4
4,aurelianos,4
9,europe,3


In [451]:
# GPE stands for Geopolitical entity
geopolitical_df = entities_df[entities_df.label == 'GPE'].groupby('text_clean').size().reset_index(name='total').sort_values('total', ascending=False)

In [452]:
geopolitical_df.shape[0], 'geopolitical entities'

(126, 'geopolitical entities')

In [453]:
geopolitical_df.head()

Unnamed: 0,text_clean,total
3,amaranta,178
60,macondo,128
13,aureliano,45
100,santa sofia de la piedad,38
26,buendia,28


### Entity refenrece in the text

In [454]:
[e['entity'].start for i, e in entities_df[entities_df.text == 'Melquiades'][:10].iterrows()]

[159, 260, 667, 774, 1706, 3, 506, 538, 614, 697]

In [455]:
span = entities_df[entities_df.text == 'Melquiades'].iloc[0].entity
token = span[0]

In [456]:
span.start, token.i

(159, 159)

In [457]:
book_doc[span.start], book_doc[159], book_doc[260], token

(Melquiades, Melquiades, Melquiades, Melquiades)

### POS tags in the text

In [458]:
pos_tags = [
    'ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 
    'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SPACE', 'SYM', 'VERB', 'X'
]

In [459]:
i = 0
for token in book_doc:
    if token.pos_ == 'PRON':
        print (token)
        i += 1
    if i == 10:
        break

he
him
them
it
they
they
who
himself
what
he


## Noun-chunks

In [460]:
i = 0
for nc in book_doc.noun_chunks:
    print (nc, list(nc.subtree))
    i += 1
    if i == 10:
        break

he [he]
the firing squad [the, firing, squad]
Colonel Aureliano Buendfa [Colonel, Aureliano, Buendfa]
distant afternoon [distant, afternoon, when, his, father, took, him, to, discover, ice]
his father [his, father]
him [him]
ice [ice]
that time [that, time]
Macondo [Macondo]
a village [a, village, of, 
, twenty, adobe, houses, ,, built, on, the, bank, of, a, river, of, clear, water, that, ran, along, a, bed, of, polished, 
, stones, ,, which, were, white, and, enormous, ,, like, prehistoric, eggs]


# Matching misclassified entities

In [461]:
entities_df.shape

(6507, 5)

In [462]:
entities_df[entities_df.label == 'PERSON'].shape

(2452, 5)

In [463]:
entities_df[entities_df.label == 'PERSON'].text_clean.unique().shape

(224,)

In [464]:
entities_df[entities_df.text_clean == 'jose arcadio buendia'].label.unique()

array(['PERSON', 'FAC', 'ORG'], dtype=object)

In [465]:
entities_df[entities_df.text_clean == 'ursula'].label.unique()

array(['PERSON', 'PRODUCT', 'NORP', 'LANGUAGE', 'GPE', 'ORG'],
      dtype=object)

- There are 6464 entities, 2452 `PERSON` entities
- Some ocurrences of these characters were incorrectly classified (they should always be labeled as `PERSON`)
- To prevent this situation, we'll create an heuristic to fix theses problems based on other ocurrences of these entities.

In [466]:
text_entities = entities_df.groupby(['text_clean', 'label']).size().reset_index(name='total').sort_values('total', ascending=False)

In [467]:
text_entities.shape

(1544, 3)

#### Entities mismatch examples

In [468]:
text_entities.head()

Unnamed: 0,text_clean,label,total
1499,ursula,PERSON,317
359,aureliano,PERSON,244
597,first,ORDINAL,197
278,amaranta,GPE,178
951,one,CARDINAL,171


In [469]:
text_entities[text_entities.text_clean == 'ursula'] 

Unnamed: 0,text_clean,label,total
1499,ursula,PERSON,317
1497,ursula,NORP,21
1495,ursula,GPE,8
1500,ursula,PRODUCT,3
1498,ursula,ORG,1
1496,ursula,LANGUAGE,1


In [470]:
text_entities[text_entities.text_clean == 'amaranta'] 

Unnamed: 0,text_clean,label,total
278,amaranta,GPE,178
280,amaranta,PERSON,43
279,amaranta,NORP,6


In [471]:
text_entities[text_entities.text_clean == 'jose arcadio buendia'] 

Unnamed: 0,text_clean,label,total
738,jose arcadio buendia,PERSON,98
736,jose arcadio buendia,FAC,12
737,jose arcadio buendia,ORG,5


In [472]:
max_ids = text_entities.groupby('text_clean').idxmax()

In [473]:
entities_true_label = text_entities.loc[max_ids.total] 

In [474]:
entities_true_label.shape

(1422, 3)

In [475]:
entities_true_label[entities_true_label.text_clean == 'jose arcadio buendia'] 

Unnamed: 0,text_clean,label,total
738,jose arcadio buendia,PERSON,98


In [476]:
entities_true_label[entities_true_label.text_clean == 'ursula'] 

Unnamed: 0,text_clean,label,total
1499,ursula,PERSON,317


In [477]:
entities_true_label[entities_true_label.text_clean == 'amaranta'] 

Unnamed: 0,text_clean,label,total
278,amaranta,GPE,178


In [478]:
entities_true_label.sort_values('total', ascending=False).head(10)

Unnamed: 0,text_clean,label,total
1499,ursula,PERSON,317
359,aureliano,PERSON,244
597,first,ORDINAL,197
278,amaranta,GPE,178
951,one,CARDINAL,171
375,aureliano segundo,PERSON,142
732,jose arcadio,PERSON,136
368,aureliano buendia,PERSON,132
796,macondo,GPE,128
578,fernanda,ORG,118


#### First approch
- There are 1508 unique entities considering its `text` and `label`
- Fixing entitities that have more than one label for the same text by assuming the label with highest occurance is the correct one, we get a total of 1387 unique entities
- We can see that is fixes the cases for the characters `Ursula` and `jose arcadio buendia`, but at the same time it classifies the character `Amaranta` as a `Geopolitical Entity (GPE)` and `Fernanda` as an `Organization (ORG)`

#### Second approach:
- all entities that have at least one match as PERSON for the same text will be considered a PERSON entity in all occurances

In [479]:
person_df = person_df.set_index('text_clean')

In [480]:
person_df.head()

Unnamed: 0_level_0,total
text_clean,Unnamed: 1_level_1
ursula,317
aureliano,244
aureliano segundo,142
jose arcadio,136
aureliano buendia,132


In [481]:
text_entities['label_fix'] = text_entities.apply(lambda e: 'PERSON' if e.text_clean in person_df.index else e.label, axis=1)

In [482]:
text_entities[text_entities.text_clean == 'jose arcadio buendia'] 

Unnamed: 0,text_clean,label,total,label_fix
738,jose arcadio buendia,PERSON,98,PERSON
736,jose arcadio buendia,FAC,12,PERSON
737,jose arcadio buendia,ORG,5,PERSON


In [483]:
text_entities[text_entities.text_clean == 'amaranta'] 

Unnamed: 0,text_clean,label,total,label_fix
278,amaranta,GPE,178,PERSON
280,amaranta,PERSON,43,PERSON
279,amaranta,NORP,6,PERSON


In [484]:
text_entities[text_entities.text_clean == 'ursula'] 

Unnamed: 0,text_clean,label,total,label_fix
1499,ursula,PERSON,317,PERSON
1497,ursula,NORP,21,PERSON
1495,ursula,GPE,8,PERSON
1500,ursula,PRODUCT,3,PERSON
1498,ursula,ORG,1,PERSON
1496,ursula,LANGUAGE,1,PERSON


In [485]:
text_entities[text_entities.label_fix == 'PERSON'].shape

(315, 4)

In [486]:
text_entities[text_entities.label_fix == 'PERSON'].groupby('label_fix')['total'].sum()

label_fix
PERSON    3296
Name: total, dtype: int64

### Befor vs after PERSON fix
- 223 vs 313 unique PERSON entities
- 2452 vs 3287 PERSON entities occurances

In [539]:
person_fix_df = text_entities[text_entities.label_fix == 'PERSON'].groupby(['label_fix', 'text_clean']).size().reset_index(name='total').sort_values('total', ascending=False)

In [540]:
person_fix_df.head()

Unnamed: 0,label_fix,text_clean,total
212,PERSON,ursula,6
78,PERSON,fernanda,5
139,PERSON,melquiades,5
27,PERSON,aureliano,5
56,PERSON,catarino,4


In [541]:
person_fix_df.shape, person_fix_df.text_clean.unique().shape

((224, 3), (224,))

In [542]:
entities_df.shape

(6507, 5)

In [543]:
entities_df.head()

Unnamed: 0,entity,text,label,pos,text_clean
0,"(Chapter, 1, \n\n\n, MANY, YEARS, LATER)",Chapter 1 \n\n\nMANY YEARS LATER,LAW,0,chapter 1 \n\n\nmany years later
1,"(Aureliano, Buendfa)",Aureliano Buendfa,PERSON,14,aureliano buendfa
2,(afternoon),afternoon,TIME,22,afternoon
3,"(the, bank, of, a, river)",the bank of a river,ORG,47,the bank of a river
4,"(the, \n, month, of, March)",the \nmonth of March,DATE,102,the \nmonth of march


In [544]:
entities_person_fix_df = pd.merge(entities_df, person_fix_df, on='text_clean', how="left")

In [545]:
entities_person_fix_df.label_fix = entities_person_fix_df.label_fix.fillna(entities_person_fix_df.label)

In [546]:
entities_person_fix_df = entities_person_fix_df.drop('total', axis=1)

In [547]:
entities_person_fix_df.head()

Unnamed: 0,entity,text,label,pos,text_clean,label_fix
0,"(Chapter, 1, \n\n\n, MANY, YEARS, LATER)",Chapter 1 \n\n\nMANY YEARS LATER,LAW,0,chapter 1 \n\n\nmany years later,LAW
1,"(Aureliano, Buendfa)",Aureliano Buendfa,PERSON,14,aureliano buendfa,PERSON
2,(afternoon),afternoon,TIME,22,afternoon,TIME
3,"(the, bank, of, a, river)",the bank of a river,ORG,47,the bank of a river,ORG
4,"(the, \n, month, of, March)",the \nmonth of March,DATE,102,the \nmonth of march,DATE


In [548]:
first_and_last_pos_df = entities_person_fix_df.groupby('text_clean').agg(first_pos=('pos', 'min'), last_pos=('pos', 'max'))

In [549]:
first_and_last_pos_df.shape

(1422, 2)

In [550]:
entities_person_fix_df = pd.merge(entities_person_fix_df, first_and_last_pos_df, on='text_clean', how="left")

In [551]:
entities_person_fix_pos_df.head()

Unnamed: 0,entity,text,label,pos,text_clean,label_fix,first_pos,last_pos
0,"(Chapter, 1, \n\n\n, MANY, YEARS, LATER)",Chapter 1 \n\n\nMANY YEARS LATER,LAW,0,chapter 1 \n\n\nmany years later,LAW,0,0
1,"(Aureliano, Buendfa)",Aureliano Buendfa,PERSON,14,aureliano buendfa,PERSON,14,746
2,(afternoon),afternoon,TIME,22,afternoon,TIME,22,807
3,"(the, bank, of, a, river)",the bank of a river,ORG,47,the bank of a river,ORG,47,47
4,"(the, \n, month, of, March)",the \nmonth of March,DATE,102,the \nmonth of march,DATE,102,102


In [552]:
entities_person_fix_pos_df.iloc[0].entity[0]._.page

9

In [553]:
entities_person_fix_pos_df['page'] = entities_person_fix_pos_df.entity.apply(lambda e: e[0]._.page)

In [554]:
first_and_last_page_df = entities_person_fix_pos_df.groupby('text_clean').agg(first_page=('page', 'min'), last_page=('page', 'max'))

In [555]:
first_and_last_page_df.shape

(1422, 2)

In [556]:
entities_person_fix_page_df = pd.merge(entities_person_fix_pos_df, first_and_last_page_df, on='text_clean', how="left")

In [626]:
entities_person_fix_page_df[entities_person_fix_page_df.text == 'MACONDO']

Unnamed: 0,entity,text,label,pos,text_clean,label_fix,first_pos,last_pos,page,first_page,last_page
640,(MACONDO),MACONDO,ORG,824,macondo,PERSON,8,1745,29,9,199


In [645]:
entities_info = {}
entities_by_page = {}

for i, row in entities_person_fix_page_df.iterrows():
    key = row['text_clean']
    if row['label_fix'] != 'PERSON' or row['text'][0].islower():
        continue
        
    if row['page'] in entities_by_page:
        entities_by_page[row['page']].append(key)
    else:
        entities_by_page[row['page']] = [key]
    
    if key not in entities_info:
        entities_info[key] = {
            'first_pos': row['first_pos'],
            'last_pos': row['last_pos'],
            'first_page': row['first_page'],
            'last_page': row['last_page'],
            'type': row['label_fix'],
            'pos': [row['pos']],
            'pages': {row['page']},
            'name': row['text'],
        }
    else:
        entities_info[key]['pos'].append(row['pos'])
        entities_info[key]['pages'].add(row['page'])
        
for k in entities_info.keys():
    entities_info[k]['pages'] = sorted(list(entities_info[k]['pages']))
    relations = set()
    for p in entities_info[k]['pages']:
        relations.update(set(entities_by_page[p]))
    entities_info[k]['relations'] = [{'from': k, 'to': r} for r in relations if k != r]

In [646]:
import json

In [647]:
json.dump(entities_info, open('../data/entities_info.json', 'w'))

In [648]:
entities_info['rebecca']

{'first_pos': 431,
 'last_pos': 657,
 'first_page': 49,
 'last_page': 58,
 'type': 'PERSON',
 'pos': [657, 431],
 'pages': [49, 58],
 'name': 'Rebecca',
 'relations': [{'from': 'rebecca', 'to': 'ursula'},
  {'from': 'rebecca', 'to': 'rebeca'},
  {'from': 'rebecca', 'to': 'aureliano jose'},
  {'from': 'rebecca', 'to': 'jose arcadio buendfa'},
  {'from': 'rebecca', 'to': 'pietro'},
  {'from': 'rebecca', 'to': 'crespi'},
  {'from': 'rebecca', 'to': 'amaranta'},
  {'from': 'rebecca', 'to': 'bmno crespi'},
  {'from': 'rebecca', 'to': 'nicanor'},
  {'from': 'rebecca', 'to': 'catarino'},
  {'from': 'rebecca', 'to': 'jose arcadio'},
  {'from': 'rebecca', 'to': 'aureliano'},
  {'from': 'rebecca', 'to': 'arcadio'},
  {'from': 'rebecca', 'to': 'pietro crespi'}]}