In [None]:
#!pip install hanlp
#!pip install simalign

In [226]:
import os, csv
# public
import hanlp
import pandas as pd
from tqdm import tqdm
from simalign import SentenceAligner
# from google.cloud import translate_v2 as translate

In [289]:
src_lan = 'en'
# target language
tgt_lan = 'ES'
language = "Spanish"

RESOURCE = 'res'
RESULTS = os.path.join(RESOURCE, 'results')
TRANS_FILE = os.path.join(RESULTS, 'sentences-'+language+'-Translations.tsv')
TOKENS_FILE = os.path.join(RESULTS, 'tokens-'+language+'.tsv')
ALIGN_SENT_FILE = os.path.join(RESULTS, 'sentences_aligned-'+language+'.tsv')
ALIGN_TOKEN_FILE = os.path.join(RESULTS, 'tokens_aligned-'+language+'.tsv')

DATA = os.path.join(RESOURCE, 'data')
KEY = os.path.join(RESOURCE, 'key')
RAW_SENSE_TSV = os.path.join(DATA, 'process_s15.tsv')
RAW_TOKENS_TSV =  os.path.join(DATA, 'tokens-English.tsv')
RAW_SENTENCE_TSV = os.path.join(DATA, 'sentences-English.tsv')
JSON_GOOGLE_APPLICATION_CREDENTIALS = os.path.join(KEY, 'tonal-works-420505-eda807c7cc52.json')
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = JSON_GOOGLE_APPLICATION_CREDENTIALS


# DATA

In [290]:
# raw
raw_sense_df = pd.read_csv(RAW_SENSE_TSV, delimiter='\t', header=None, quoting=csv.QUOTE_NONE)
raw_sense_df.head()

Unnamed: 0,0,1,2,3,4
0,d001.s001.t001,This,,X,
1,d001.s001.t002,document,document,N,bn:00028015n
2,d001.s001.t003,is,be,V,
3,d001.s001.t004,a,,X,
4,d001.s001.t005,summary,summary,N,bn:00075142n


In [291]:
sentence_df = pd.read_csv(RAW_SENTENCE_TSV, delimiter='\t', header=None, quoting=csv.QUOTE_NONE)
sentence_df.head()
sents = sentence_df[1].tolist()
sents

['English',
 'This document is a summary of the European Public Assessment Report (EPAR).',
 'It explains how the Committee for Medicinal Products for Human Use (CHMP) assessed the studies performed, to reach their recommendations on how to use the medicine.',
 'If you need more information about your medical condition or your treatment, read the Package Leaflet (also part of the EPAR) or contact your doctor or pharmacist.',
 'If you want more information on the basis of the CHMP recommendations, read the Scientific Discussion (also part of the EPAR).',
 'What is Alimta?',
 'Alimta is a powder that is made up into a solution for infusion (drip into a vein).',
 'It contains the active substance pemetrexed.',
 'What is Alimta used for?',
 'Alimta is used to treat two types of cancer: pleural mesothelioma (cancer of the lining of the lungs that is usually caused by exposure to asbestos).',
 'Alimta is used together with cisplatin (another anticancer medicine) when the cancer is unresectab

In [292]:
sids = sentence_df[0].tolist()
# sids = [i.replace('d', 'i') for i in sids]
# sids[:4]

In [293]:
trans_df = pd.read_csv(TRANS_FILE, delimiter='\t', quoting=csv.QUOTE_NONE)
trans_df.head()

Unnamed: 0,ID,English,Spanish Translation
0,es001.s001,This document is a summary of the European Pub...,Este documento es un resumen del Informe Públi...
1,es001.s002,It explains how the Committee for Medicinal Pr...,Explica cómo el Comité de Medicamentos de Uso ...
2,es001.s003,If you need more information about your medica...,Si necesita más información sobre su condición...
3,es001.s004,If you want more information on the basis of t...,Si desea obtener más información sobre la base...
4,es001.s005,What is Alimta ?,¿Qué es Alimta?


In [294]:
tkns_df = pd.read_csv(TOKENS_FILE, delimiter='\t', quoting=csv.QUOTE_NONE)
tkns_df.head()

Unnamed: 0,Token ID,Token,Lemma,POS
0,es001.s001.t001,Este,este,DET
1,es001.s001.t002,documento,documento,NOUN
2,es001.s001.t003,es,ser,AUX
3,es001.s001.t004,un,uno,DET
4,es001.s001.t005,resumen,resumen,NOUN


In [295]:
tgt_tkns = tkns_df['Token'].tolist()
len(tgt_tkns)

2787

# ALIGNMENT

In [296]:
device = 'cpu'
layer = 8
# "bert": "bert-base-multilingual-cased",
# "xlmr": "xlm-roberta-base"
model = 'xlmr'
# mwmf (Match), inter (ArgMax), itermax (IterMax)
mode = 'itermax'
matching_methods = 'mai'
token_type='bpe'
aligner = SentenceAligner(
    model=model
    , token_type=token_type
    , matching_methods=matching_methods
    , layer=layer
    , device=device
)

2024-07-04 19:57:11,010 - simalign.simalign - INFO - Initialized the EmbeddingLoader with model: xlm-roberta-base


In [297]:
# english token sequences
idxes = raw_sense_df[0].tolist()
tokens = raw_sense_df[1].tolist()

en_sents = []
pre_sid = None
s = []
for i, t in zip(idxes, tokens):
    sid = i.split('.')[1]
    if sid != pre_sid:
        if s:
            en_sents.append(s)
        pre_sid = sid
        s = [t]
    else:
        s.append(t)
en_sents.append(s)
len(en_sents)

138

In [298]:
print(en_sents[92])

['The', 'Foundation', 'has', 'recently', 'been', 'involved', 'in', 'organising', 'a', 'conference', 'which', 'dealt', 'with', 'this', 'critical', 'challenge', '.']


In [299]:
# Italian token sequences
idxes = tkns_df['Token ID'].tolist()
tokens = tkns_df['Token'].tolist()

tgt_sents = []
pre_sid = None
s = []
for i, t in zip(idxes, tokens):
    sid = i.split('.')[1]
    if sid != pre_sid:
        if s:
            tgt_sents.append(s)
        pre_sid = sid
        s = [t]
    else:
        s.append(t)
tgt_sents.append(s)
len(tgt_sents)

138

In [300]:
print(tgt_sents[92])

['La', 'Fundación', 'participó', 'recientemente', 'en', 'la', 'organización', 'de', 'una', 'conferencia', 'que', 'abordó', 'este', 'desafío', 'crítico', '.']


In [301]:
# do the alignment
aligns = []
for en_s, tgt_s in zip(tqdm(en_sents), tgt_sents):
    alignments = aligner.get_word_aligns(en_s, tgt_s)
    align = ''
    for w1, w2 in alignments[mode]:
        align += f'{w1}-{w2} '
    align = align.strip()
    aligns.append(align)

100%|██████████| 138/138 [00:49<00:00,  2.79it/s]


In [302]:
aligns[0]

'0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-8 8-7 9-10 10-6 11-11 12-12 13-13 14-14'

In [303]:
# output
trans_df['English-'+language+' Alignment'] = aligns
trans_df.head()

Unnamed: 0,ID,English,Spanish Translation,English-Spanish Alignment
0,es001.s001,This document is a summary of the European Pub...,Este documento es un resumen del Informe Públi...,0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-8 8-7 9-10 10-6 ...
1,es001.s002,It explains how the Committee for Medicinal Pr...,Explica cómo el Comité de Medicamentos de Uso ...,1-0 2-1 3-2 4-3 5-4 6-5 6-8 7-5 8-6 9-8 10-7 1...
2,es001.s003,If you need more information about your medica...,Si necesita más información sobre su condición...,0-0 1-12 2-1 3-2 4-3 5-4 6-5 7-7 8-6 9-8 10-9 ...
3,es001.s004,If you want more information on the basis of t...,Si desea obtener más información sobre la base...,0-0 2-1 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-12 11-1...
4,es001.s005,What is Alimta ?,¿Qué es Alimta?,0-1 1-2 2-3 3-4


In [304]:
trans_df.to_csv(ALIGN_SENT_FILE, sep='\t', index=False)

# ALIGN TOKENS IN TOKEN FILE

In [305]:
tkns_df.head()


Unnamed: 0,Token ID,Token,Lemma,POS
0,es001.s001.t001,Este,este,DET
1,es001.s001.t002,documento,documento,NOUN
2,es001.s001.t003,es,ser,AUX
3,es001.s001.t004,un,uno,DET
4,es001.s001.t005,resumen,resumen,NOUN


In [306]:
aligned_tks_df = pd.read_csv(RAW_TOKENS_TSV, delimiter='\t', quoting=csv.QUOTE_NONE)
aligned_tks_df.head()

Unnamed: 0,Token ID,Token,Lemma,POS,Sense
0,d001.s001.t001,This,,X,
1,d001.s001.t002,document,document,N,bn:00028015n
2,d001.s001.t003,is,be,V,
3,d001.s001.t004,a,,X,
4,d001.s001.t005,summary,summary,N,bn:00075142n


In [307]:
aligned_tks_df['Token ID '+tgt_lan] = None
aligned_tks_df['Token '+tgt_lan] = None
aligned_tks_df['Lemma '+tgt_lan] = None
aligned_tks_df['POS '+tgt_lan] = None
aligned_tks_df['Sense '+tgt_lan] = None

In [308]:
aligned_tks_df.head()

Unnamed: 0,Token ID,Token,Lemma,POS,Sense,Token ID ES,Token ES,Lemma ES,POS ES,Sense ES
0,d001.s001.t001,This,,X,,,,,,
1,d001.s001.t002,document,document,N,bn:00028015n,,,,,
2,d001.s001.t003,is,be,V,,,,,,
3,d001.s001.t004,a,,X,,,,,,
4,d001.s001.t005,summary,summary,N,bn:00075142n,,,,,


In [309]:
trans_df.head()

Unnamed: 0,ID,English,Spanish Translation,English-Spanish Alignment
0,es001.s001,This document is a summary of the European Pub...,Este documento es un resumen del Informe Públi...,0-0 1-1 2-2 3-3 4-4 5-5 6-6 7-8 8-7 9-10 10-6 ...
1,es001.s002,It explains how the Committee for Medicinal Pr...,Explica cómo el Comité de Medicamentos de Uso ...,1-0 2-1 3-2 4-3 5-4 6-5 6-8 7-5 8-6 9-8 10-7 1...
2,es001.s003,If you need more information about your medica...,Si necesita más información sobre su condición...,0-0 1-12 2-1 3-2 4-3 5-4 6-5 7-7 8-6 9-8 10-9 ...
3,es001.s004,If you want more information on the basis of t...,Si desea obtener más información sobre la base...,0-0 2-1 3-3 4-4 5-5 6-6 7-7 8-8 9-9 10-12 11-1...
4,es001.s005,What is Alimta ?,¿Qué es Alimta?,0-1 1-2 2-3 3-4


In [310]:
def process_nan(s):
  if type(s)!=type(''):
    return ''
  return s

In [311]:
for i, row in trans_df.iterrows():
    sent_i = row['ID']
    # print(sent_i)
    tgt_sent_i = sent_i.replace('d', "es")
    sent_i = sent_i.replace("es","d")
    for alignment in row['English-'+language+' Alignment'].split():
        w1, w2 = alignment.split('-')
        en_id = f'{sent_i}.t{"%03d" % (int(w1)+1)}'
        tgt_id = f'{tgt_sent_i}.t{"%03d" % (int(w2)+1)}'
        row_en = aligned_tks_df[aligned_tks_df['Token ID'] == en_id]
        row_tgt = tkns_df[tkns_df['Token ID'] == tgt_id]
        # print(aligned_tks_df['Token ID'] )
        # print(row_tgt)
        # print(row_en.iloc[0]['Token ID '+tgt_lan])
        if row_en.iloc[0]['Token ID '+tgt_lan] is not None:
            aligned_tks_df.loc[row_en.index, 'Token ID '+tgt_lan] += '➕' + row_tgt.iloc[0]['Token ID']
            aligned_tks_df.loc[row_en.index, 'Token '+tgt_lan] += '➕' + row_tgt.iloc[0]['Token']
            if aligned_tks_df.loc[row_en.index, 'POS '+tgt_lan].item() != process_nan(row_tgt.iloc[0]['POS']):
                aligned_tks_df.loc[row_en.index, 'POS '+tgt_lan] += '➕' + process_nan(row_tgt.iloc[0]['POS'])
            if aligned_tks_df.loc[row_en.index, 'Sense '+tgt_lan].item() != process_nan(row_en.iloc[0]['Sense']):
                aligned_tks_df.loc[row_en.index, 'Sense '+tgt_lan] += '➕' + process_nan(row_en.iloc[0]['Sense'])
        else:
            print(row_en.index.values)
            aligned_tks_df.loc[row_en.index.values[0], 'Token ID '+tgt_lan] = row_tgt.iloc[0]['Token ID']
            # print(aligned_tks_df.loc[row_en.index, 'Token ID '+tgt_lan],row_tgt.iloc[0]['Token ID'],'\n\n')
    
            aligned_tks_df.loc[row_en.index.values[0], 'Token '+tgt_lan] = row_tgt.iloc[0]['Token']
            aligned_tks_df.loc[row_en.index.values[0], 'Lemma '+tgt_lan] = row_tgt.iloc[0]['Lemma']
            aligned_tks_df.loc[row_en.index.values[0], 'POS '+tgt_lan] = row_tgt.iloc[0]['POS']
            aligned_tks_df.loc[row_en.index.values[0], 'Sense '+tgt_lan] = row_en.iloc[0]['Sense']

[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
[10]
[11]
[12]
[13]
[14]
[16]
[17]
[18]
[19]
[20]
[21]
[22]
[23]
[24]
[25]
[26]
[27]
[28]
[29]
[30]
[31]
[32]
[34]
[35]
[36]
[37]
[38]
[39]
[40]
[41]
[42]
[43]
[44]
[45]
[46]
[47]
[48]
[49]
[50]
[51]
[52]
[53]
[54]
[55]
[56]
[57]
[58]
[59]
[60]
[62]
[63]
[64]
[66]
[67]
[68]
[69]
[70]
[71]
[72]
[73]
[74]
[75]
[76]
[78]
[79]
[80]
[81]
[82]
[83]
[84]
[85]
[86]
[87]
[88]
[89]
[90]
[91]
[92]
[93]
[94]
[95]
[96]
[98]
[99]
[100]
[101]
[102]
[103]
[104]
[105]
[106]
[107]
[108]
[109]
[110]
[111]
[112]
[113]
[114]
[115]
[116]
[117]
[118]
[119]
[120]
[121]
[122]
[123]
[124]
[125]
[126]
[127]
[128]
[129]
[130]
[131]
[132]
[133]
[134]
[135]
[136]
[137]
[138]
[139]
[140]
[141]
[142]
[143]
[144]
[145]
[146]
[147]
[148]
[149]
[150]
[151]
[153]
[154]
[155]
[156]
[157]
[158]
[159]
[160]
[161]
[162]
[163]
[164]
[165]
[166]
[167]
[168]
[169]
[170]
[171]
[172]
[173]
[174]
[175]
[176]
[177]
[178]
[179]
[180]
[181]
[182]
[183]
[184]
[185]
[186]
[187]
[188]
[189]
[190]


In [312]:
# aligned_tks_df[]
aligned_tks_df.head(20)

Unnamed: 0,Token ID,Token,Lemma,POS,Sense,Token ID ES,Token ES,Lemma ES,POS ES,Sense ES
0,d001.s001.t001,This,,X,,es001.s001.t001,Este,este,DET,
1,d001.s001.t002,document,document,N,bn:00028015n,es001.s001.t002,documento,documento,NOUN,bn:00028015n
2,d001.s001.t003,is,be,V,,es001.s001.t003,es,ser,AUX,
3,d001.s001.t004,a,,X,,es001.s001.t004,un,uno,DET,
4,d001.s001.t005,summary,summary,N,bn:00075142n,es001.s001.t005,resumen,resumen,NOUN,bn:00075142n
5,d001.s001.t006,of,,X,,es001.s001.t006,del,del,ADP,
6,d001.s001.t007,the,,X,,es001.s001.t007,Informe,Informe,PROPN,
7,d001.s001.t008,European,european,J,bn:00102440a,es001.s001.t009,Europeo,Europeo,PROPN,bn:00102440a
8,d001.s001.t009,Public,public,J,bn:00109211a,es001.s001.t008,Público,Público,PROPN,bn:00109211a
9,d001.s001.t010,Assessment,assessment,N,bn:00006502n,es001.s001.t011,Evaluación,Evaluación,PROPN,bn:00006502n


In [313]:
aligned_tks_df.to_csv(ALIGN_TOKEN_FILE, sep='\t', index=False)