In [1]:
import pandas as pd
from camel_tools.disambig.mle import MLEDisambiguator
from camel_tools.tokenizers.morphological import MorphologicalTokenizer
from camel_tools.tokenizers.word import simple_word_tokenize
from tqdm import tqdm
import re
import string

In [2]:
def flatten(l):
    """Flatten one level of nesting.

    Args:
        l: an iterable whose elements are themselves iterable.

    Returns:
        A new list with the items of every element of ``l``, in order.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [4]:
#Load the 'Logical_Breaks' sheet of the sentence alignment workbook
lines = pd.read_excel(
    'Omar_bin_Seid_alignment.xlsx',
    sheet_name='Logical_Breaks',
)

In [5]:
#punctuation characters: string.punctuation extended with the extra mark '∵'
punct_list = string.punctuation + '∵'

#empty table with the core columns of the Perseus token system
df = pd.DataFrame(
    columns=['value', 'punctuation', 'space_after', 'position', 've_ref', 'idx']
)

#CAMeL Tools disambiguator and morphological tokenizer initialization
#Reference: CAMeL Tools: An Open Source Python Toolkit for Arabic Natural
#Language Processing (Obeid et al., LREC 2020)
#https://aclanthology.org/2020.lrec-1.868
mle_msa = MLEDisambiguator.pretrained('calima-msa-r13')
msa_atb_tokenizer = MorphologicalTokenizer(
    disambiguator=mle_msa,
    scheme='atbtok',
)

#part trackers: chunk counter, sentence counter, running token index
chunk, sent, idx = 0, 0, 0

#Build the token table: one row per (sub)token of every Arabic line.
#assumes section and page breaks marked in excel file and skips those markers

#Rows are collected in a plain list and converted to a DataFrame once at the
#end. DataFrame.append was removed in pandas 2.0, and appending row by row
#copies the whole frame on every call.
records = []

#Set of single punctuation characters. Membership in a set (unlike `in` on the
#punct_list string, which is a substring test) is never true for an empty
#token or for a multi-character run such as '()'.
punct_chars = set(punct_list)

for line in lines['Arabic']:
    if re.search('PAGE', line) is not None or line == 'SECTION BREAK':
        #break marker: open a new chunk and restart sentence numbering
        chunk += 1
        sent = 1
        continue

    #tokenize Arabic line
    simp_tok = simple_word_tokenize(line)
    morph_tok = msa_atb_tokenizer.tokenize(simp_tok)

    #subtoken handling, separating attached subtokens from CAMeL Tools tokens.
    #Each entry is (text, joined_to_next). An explicit flag is used instead of
    #appending a '*' marker so that a literal '*' in the text is left intact.
    tokens = []
    for token in morph_tok:
        #split on both '+_' and '_+' markers; a token without markers comes
        #back unchanged as a one-element list
        sub_tokens = flatten([part.split('_+') for part in token.split('+_')])
        last = len(sub_tokens) - 1
        for i, sub_token in enumerate(sub_tokens):
            #every subtoken except the last is written without a space after it
            tokens.append((sub_token, i != last))

    for position, (token, joined) in enumerate(tokens, start=1):
        #mark punctuation tokens in table
        if token in punct_chars:
            punct = 'y'
            #no space between the preceding token and this punctuation mark;
            #guarded so a leading punctuation token does not touch a
            #non-existent previous row
            if records:
                records[-1]['space_after'] = 'n'
        else:
            punct = ''

        #leading subtoken in joint token
        space = 'n' if joined else ''

        #add to table
        records.append({
            'value': token,
            'punctuation': punct,
            'space_after': space,
            'position': position,
            've_ref': str(chunk) + '.' + str(sent) + '.t' + str(position),
            'idx': idx,
        })
        idx += 1
    sent += 1

df = pd.DataFrame(
    records,
    columns=['value', 'punctuation', 'space_after', 'position', 've_ref', 'idx'],
)

#save token table to csv
df.to_csv('bin_said_tokens_ar.csv', index=False)