Tokens are grouped in enumerated sentences in order to perform sentence alignment. For each book, sentences are saved in a separated file. Moreover, the code assigns a number (_loc_ent_id_) to each LOC entity (B-LOC and I-LOC) in the NH. In the case the loc_ent_id is associated with a ToposText ID, a query is made to get also the TrismegistosPlace ID.

In [1]:
import pandas as pd
import re
from tqdm import tqdm

In [2]:
path = "data/intermediate/flairToposText_NH.csv"

In [3]:
## open the file containing flairNER annotation in the NH
flairNER_NH = pd.read_csv(path, delimiter = ',')

In [4]:
flairNER_NH.head(60)

Unnamed: 0.1,Unnamed: 0,reference,index,token,start_pos,flair_ner,topostext_id
0,0,0.1,0,Pliny,0,O,-
1,1,0.1,1,the,6,O,-
2,2,0.1,2,Elder,10,O,-
3,3,0.1,3,",",15,O,-
4,4,0.1,4,The,17,O,-
5,5,0.1,5,Natural,21,O,-
6,6,0.1,6,History,29,O,-
7,7,0.1,7,(,37,O,-
8,8,0.1,8,Books,38,O,-
9,9,0.1,9,1-11,44,O,-


In [5]:
book_column = [] ## extract the book reference
pattern = r'(\d+)(?:\.(\d))?\.'

for i, reference in enumerate(flairNER_NH['reference']):
    try  : 
        match = re.search(pattern,reference)
        book = int(match[1])
    except : book = 11 
    book_column.append(book)
        
flairNER_NH['book'] = book_column

In [6]:
flairNER_NH['book'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37], dtype=int64)

In [7]:
## exclude book 0 which contains a preface to the NH
flairNER_NH = flairNER_NH.loc[~(flairNER_NH ['book'] == 0)]
flairNER_NH.reset_index(inplace=True)
flairNER_NH = flairNER_NH.drop('level_0', axis=1)

In [8]:
flairNER_NH[flairNER_NH.isnull().any(axis=1)]
## we observed that 'None' tokens were interpreted as None elements
flairNER_NH['token'] = flairNER_NH['token'].fillna('None')
#flairNER_NH.token.fillna('None', inplace=True) 

In [9]:
punctuations = ['.', '!', '?'] ## set the punctuation marks

Starting from sentence number = 0, each token is assigned to the sentence number inside the book. If the token is a full stop, an exclamation mark or a question mark, the following token has sentence number = sentence number + 1.

In [10]:
sentence_column = []

for i1 in range(1,38): ## for each book of the NH
        
    filter_book = flairNER_NH[flairNER_NH['book'] == i1] ## select all the tokens in the book
    filter_book = filter_book.reset_index()
    
    count_sentence = 0 ## set the sentence number to 0
    
    for i2,token in enumerate(filter_book['token']):
        sentence_column.append(count_sentence) ## assign the token to the sentence number     
        if token in punctuations: ## if the token is a punctuation sign increment the sentence number
            count_sentence = count_sentence + 1

In [11]:
flairNER_NH['sentence'] = sentence_column

In [12]:
flairNER_NH.to_csv(r"data/intermediate/NH_eng_groupedsentences.csv", index=False)

In [13]:
## test
## filter all the rows containing sentence = 0 in book 1
filter_book1 = flairNER_NH[flairNER_NH['book'] == 1]
print_sent_0 = filter_book1[filter_book1['sentence'] == 0]

## concatenate the tokens in print_sent_0
concatenate_string = ' '.join(print_sent_0['token'].astype(str))
concatenate_string

'PREFACE IN THE FORM OF A LETTER : PLINIUS SECUNDUS TO HIS DEAR VESPASIAN , GREETING MOST Gracious Highness ( let this title , a supremely true one , be yours , while that of \' Most Eminent \' grows to old age with your sire ) — I have resolved to recount to you , in a somewhat presumptuous letter , The offspring of my latest travail , my volumes of Natural History ( a novel task for the native Muses of your Roman citizens ) — For \' twas e\'er your way , To deem my trifles something worth — to give a passing touch of polish to my " opposite number " — you recognize even this service slang — Catullus ( for he , as you know , by interchanging the first syllables made himself a trifle harsher than he wished to be considered by his \' darling Veraniuses and Fabulluses \') and at the same time that my present sauciness may effect what in the case of another impudent letter of mine lately you complained of as not coming off — that it may result in something getting done , and everyone may 

In [14]:
## write files containing the sentences
for i in range(1,38): ## for each book of the NH
        
    filter_book = flairNER_NH[flairNER_NH['book'] == i] ## select all the tokens in the book
    name='data/intermediate/books/engbook'+str(i)
    with open(name, "w", encoding='utf-8') as file: ## create a file for the book
        for i in range(0, filter_book['sentence'].max()+1): ## for each sentence in the book
            filter_rows = filter_book[filter_book['sentence'] == i] ## select all the tokens in the sentence
            concatenate_sentence = ' '.join(filter_rows['token'].astype(str)) ## concatenate the tokens in a sentence
            file.write(f"{concatenate_sentence}\n") ## write the sentence in the file

In [15]:
flairNER_NH['loc_ent_id'] = '-' ## create a new column for the id of the LOC entity

In [16]:
count_loc = -1
for i,loc in enumerate(flairNER_NH['flair_ner']):
    if 'LOC' in loc: ## for each LOC entity
        if loc == 'B-LOC': ## if it is a B-LOC entity
            count_loc = count_loc+1 ## increment + 1
            flairNER_NH.loc[i,'loc_ent_id'] = count_loc ## assign the LOC id
        else : flairNER_NH.loc[i, 'loc_ent_id'] = count_loc ## assign the LOC id

In [17]:
loc_ent_ids = [element for element in flairNER_NH['loc_ent_id'].unique() if isinstance(element, int)]

In [None]:
loc_ent_column = []
topostext_id = []

for n in tqdm(range(min(loc_ent_ids), max(loc_ent_ids)+1)):
    loc_ent_column.append(n)
    #print(n)
    filter_loc_entity = flairNER_NH[flairNER_NH['loc_ent_id'] == n] ## select all the tokens in the entity
    filter_loc_entity = filter_loc_entity.reset_index()
    for i,token in enumerate(filter_loc_entity['token']):
        
        flag = False
        
        if filter_loc_entity['topostext_id'][i] != '-':
            
            topostext_id.append(filter_loc_entity['topostext_id'][i])
            #print(n, filter_loc_entity['topostext_id'][i])
            flag = True
            break
            
    if flag == False:
        topostext_id.append('-')
        #print(n, filter_loc_entity['topostext_id'][i])

 45%|██████████████████████████████████▋                                          | 6285/13964 [05:26<06:14, 20.52it/s]

In [None]:
loc_ent_id = pd.DataFrame(
{'loc_ent_id': loc_ent_column,
'topostext_id': topostext_id
})

In [None]:
loc_ent_id.to_csv('loc_ent_ids.csv', index=False)

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
#TODO move to a later phase. 
## import the trismegistosplace id 
topostext_id_column=[]
trismegistos_id_column=[]

requestamount = len(list(enumerate(loc_ent_id.topostext_id.unique())))
for i, topostext_id in tqdm(enumerate(loc_ent_id.topostext_id.unique()), total = requestamount): ## for each topostext_id
    #print(i)
    if topostext_id !='-': ## excluding the case of missing topostext_id
        
        topostext_id_column.append(topostext_id)
        temporary_list_TM=[]
        
        ## navigate the ToposText page of the place name
        topostext_page_name = 'https://topostext.org/place/'+str(topostext_id)
        topostext_page=requests.get(topostext_page_name)
        soup=BeautifulSoup(topostext_page.content)
        
        for a_tag in soup.find_all('a'):
            if "Trismegistos" in a_tag.contents[0]: ## if the TM id is present in the page
                trismegistos_id=a_tag.contents[0]
                temporary_list_TM.append(trismegistos_id)
    
        trismegistos_id_column.append(temporary_list_TM)

In [None]:
topostext_trismegistos = pd.DataFrame(
{'topostext_id': topostext_id_column,
'trismegistos_id': trismegistos_id_column
})

In [None]:
topostext_trismegistos

In [None]:
trismegistos_id_column = []
pattern = r': (.*)$'

for i, trismegistos_id in enumerate(topostext_trismegistos['trismegistos_id']):
    if len(trismegistos_id) == 1:
        for TM_id in trismegistos_id:
            match = re.search(pattern, TM_id)
            TM_id = match.group(1)
            print(TM_id)
    else: TM_id = '-'
        
    trismegistos_id_column.append(TM_id)

In [None]:
topostext_trismegistos['trismegistos_id'] = trismegistos_id_column

In [None]:
topostext_trismegistos

In [None]:
#topostext_trismegistos = topostext_trismegistos.drop('topostext_trismegistos', axis=1)

In [None]:
topostext_trismegistos.to_csv('data/intermediate/topostext_trismegistos_ids.csv', index=False)