In [None]:
!pip install transformers
import transformers
from transformers import AutoModel, AutoTokenizer

import torch
import pandas as pd

import numpy as np
from numpy.linalg import norm

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

import seaborn

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m34.1 MB/s[0m eta [36m0:00:0

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
class SimilarityAlign(object):
  """Word aligner driven by the similarity of contextual subword embeddings.

  Both sentences are embedded with a Hugging Face encoder, a cosine-similarity
  matrix is built between their subword vectors, and only the pairs that are
  each other's best match (row argmax and column argmax agree) are kept.
  Subword pairs are finally projected back to whitespace-token indices.
  """

  def __init__(self, model: str="bert-base-multilingual-cased", device: str="cuda", layer: int=8, heatmap=False):
    """Load the tokenizer and encoder.

    Args:
      model: name or path handed to `AutoTokenizer` / `AutoModel.from_pretrained`.
      device: device the encoder is moved to. A CUDA device is replaced by
        "cpu" (with a warning) when CUDA is not available.
      layer: index into the encoder's `hidden_states` tuple used as embeddings.
      heatmap: when truthy, `align_sentences` also draws a seaborn heatmap of
        the word-level alignment.
    """
    # Fall back to CPU instead of failing inside `.to(device)` on a GPU-less machine.
    if str(device).startswith("cuda") and not torch.cuda.is_available():
      import warnings
      warnings.warn("CUDA is not available; SimilarityAlign is falling back to CPU.")
      device = "cpu"

    self.model = model
    self.device = device
    self.layer = layer
    self.heatmap = heatmap

    self.tokenizer = AutoTokenizer.from_pretrained(self.model)
    self.emb_model = AutoModel.from_pretrained(self.model, output_hidden_states=True)
    self.emb_model.eval()
    self.emb_model.to(self.device)

  def process_input(self, src, trg):
    """Embed the two sentences as one padded batch.

    Args:
      src: source sentence (string).
      trg: target sentence (string).

    Returns:
      Tensor taken from `hidden_states[self.layer]` with the first and the
      last sequence position removed; row 0 belongs to `src`, row 1 to `trg`.
    """
    # NOTE(review): each sentence is passed as a single pre-split "word".
    # This assumes the tokenizer yields the same subwords as the per-word
    # tokenisation done in `align_sentences` - confirm for the model in use.
    sent_batch = [[src], [trg]]
    with torch.no_grad():
      inputs = self.tokenizer(sent_batch, is_split_into_words=True, padding=True, truncation=True, return_tensors="pt")
      hidden = self.emb_model(**inputs.to(self.device))["hidden_states"]
      # Drop the first and last positions (presumably the special tokens of
      # the longest sequence); `align_sentences` trims whatever is left over.
      outputs = hidden[self.layer][:, 1:-1, :]
    return outputs

  def cal_similarity(self, sim_input1, sim_input2):
    """Cosine similarity of two vectors; 0.0 when either has zero norm."""
    denom = norm(sim_input1) * norm(sim_input2)
    if denom == 0:
      # Avoid the NaN (and RuntimeWarning) a 0/0 division would produce.
      return 0.0
    return np.dot(sim_input1, sim_input2) / denom

  def sim_matrix(self, sim_input1, sim_input2):
    """Pairwise cosine similarities between the rows of two 2-D arrays.

    Returns:
      Array of shape (len(sim_input1), len(sim_input2)). Rows with zero norm
      get similarity 0.0 against everything.
    """
    left = np.asarray(sim_input1, dtype=np.float64)
    right = np.asarray(sim_input2, dtype=np.float64)
    if left.size == 0 or right.size == 0:
      return np.zeros((len(sim_input1), len(sim_input2)))

    left_norm = norm(left, axis=1, keepdims=True)
    right_norm = norm(right, axis=1, keepdims=True)
    # Zero-norm rows have an all-zero numerator, so dividing by 1 yields 0.
    left_norm[left_norm == 0] = 1.0
    right_norm[right_norm == 0] = 1.0
    # Normalise once per row, then a single matrix product replaces the
    # per-pair Python loop (which recomputed both norms for every cell).
    return (left / left_norm) @ (right / right_norm).T

  def argMax(self, numpy_matrix):
    """Return the mutual-argmax cells of a similarity matrix.

    A cell (row, col) is returned when `col` is the argmax of its row and
    `row` is the argmax of its column.

    Returns:
      List of (row, col) tuples ordered by column index; empty when the
      matrix has no rows or no columns.
    """
    numpy_matrix = np.asarray(numpy_matrix)
    # np.argmax raises on empty input, e.g. when a sentence has no subwords.
    if numpy_matrix.ndim != 2 or numpy_matrix.size == 0:
      return []
    best_col_per_row = np.argmax(numpy_matrix, axis=1)
    best_row_per_col = np.argmax(numpy_matrix, axis=0)
    return [
      (int(row), col)
      for col, row in enumerate(best_row_per_col)
      if best_col_per_row[row] == col
    ]

  def align_sentences(self, src, trg):
    """Align the whitespace-separated tokens of `src` and `trg`.

    Args:
      src: source sentence, tokens separated by whitespace.
      trg: target sentence, tokens separated by whitespace.

    Returns:
      List of unique (src_word_index, trg_word_index) tuples sorted by source
      index. Empty when either sentence contains no tokens.
    """
    src_sent = src.split()
    trg_sent = trg.split()
    if not src_sent or not trg_sent:
      # Nothing to align; also avoids a pointless forward pass.
      return []

    src_tokens = [self.tokenizer.tokenize(word) for word in src_sent]
    trg_tokens = [self.tokenizer.tokenize(word) for word in trg_sent]

    # For every subword position, the index of the word it came from.
    id_sub_src = [i for i, wlist in enumerate(src_tokens) for _ in wlist]
    id_sub_trg = [i for i, wlist in enumerate(trg_tokens) for _ in wlist]

    outputs = self.process_input(src, trg)
    # Keep only as many positions as each sentence has subwords; this cuts
    # off the remaining special/padding positions.
    input1 = outputs[0, :len(id_sub_src)].cpu().detach().numpy()
    input2 = outputs[1, :len(id_sub_trg)].cpu().detach().numpy()

    data_np = self.sim_matrix(input1, input2)

    align_list = []
    seen = set()
    for sub_src, sub_trg in self.argMax(data_np):
      pair = (int(id_sub_src[sub_src]), int(id_sub_trg[sub_trg]))
      # Several subword pairs can collapse onto the same word pair.
      if pair not in seen:
        seen.add(pair)
        align_list.append(pair)

    if self.heatmap:
      argMax_word_mat = np.zeros((len(src_sent), len(trg_sent)))
      for pair in align_list:
        argMax_word_mat[pair] = 1
      data_pd = pd.DataFrame(argMax_word_mat, columns=trg_sent, index=src_sent)
      seaborn.heatmap(data_pd, cmap="crest", linewidth=.5)
    # Stable sort: pairs sharing a source index keep their discovery order.
    align_list.sort(key=lambda x: x[0])
    return align_list

In [None]:
# Build the aligner with its defaults: "bert-base-multilingual-cased",
# hidden-state layer 8, device "cuda", heatmap disabled.
model = SimilarityAlign()

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

COMPARE TO EN-DEU F1 SCORES

In [None]:
!pip install gdown
import gdown



In [None]:
# path = "https://drive.google.com/file/d/1CUYNb1DY8Onl8HTY0nVsBiKsaTmKdOWJ/view?usp=drive_link"
# file_name = "gold.txt"
# gdown.download(path, file_name, quiet=False,fuzzy=True)
# path = "https://drive.google.com/file/d/1WFLlfSpcIT92_Nev7OUPc2hzf6a6T0M4/view?usp=drive_link"
# file_name = "en.txt"
# gdown.download(path, file_name, quiet=False,fuzzy=True)
# path = "https://drive.google.com/file/d/1ZtIN0RnZKMd2G4zIRNjMr5xC3uu-286m/view?usp=drive_link"
# file_name = "de.txt"
# gdown.download(path, file_name, quiet=False,fuzzy=True)

In [None]:
# gold_file = open("/content/gold.txt", "r")
# p_list = []
# s_list = []
# all_count = 0
# s_count = 0
# current = None

# for line in gold_file:
#   line = line.strip()
#   if "SENT" in line:
#     line = line.split()
#     p_list.append([])
#     s_list.append([])
#     current = int(line[-1])
#   elif line != '':
#     all_count += 1
#     line = line.split()
#     p_list[current].append((int(line[2]), int(line[1])))
#     if line[0] == "S":
#       s_count += 1
#       s_list[current].append((int(line[2]), int(line[1])))
# gold_file.close()

In [None]:
# en = open("/content/en.txt", "r", encoding = "ISO-8859-1")
# de = open("/content/de.txt", "r", encoding = "ISO-8859-1")
# result =[]
# for en_line, de_line in zip(en, de):
#   result.append(model.align_sentences(en_line.strip(), de_line.strip()))
# en.close()
# de.close()

In [None]:
# result.pop(508)

In [None]:
# def calc_f1(result, p_list, s_list, all_count, s_count):
# p_correct = 0.
# s_correct = 0.
# total = 0.

# for i in range(len(result)):
#   p_correct += len(set(p_list[i]) & set(result[i]))
#   s_correct += len(set(s_list[i]) & set(result[i]))
#   total += len(set(result[i]))

# y_prec = round(p_correct / max(total, 1.), 3)
# y_rec = round(s_correct / max(s_count, 1.), 3)
# y_f1 = round(2. * y_prec * y_rec / max((y_prec + y_rec), 0.01), 3)

# y_f1

APPLY TO WIKIPEDIA ENGLISH-HAUSA

In [None]:
# Download the English-Hausa Wikipedia dataset from Google Drive and store it
# locally as wiki.tsv. The URL is a Drive "view" share link, hence fuzzy=True.
path = "https://drive.google.com/file/d/1zIpIIXL4boUosc1WRyM-ACwfMy5CDbSe/view?usp=sharing"
file_name = "wiki.tsv"
gdown.download(path, file_name, quiet=False,fuzzy=True)

Downloading...
From: https://drive.google.com/uc?id=1zIpIIXL4boUosc1WRyM-ACwfMy5CDbSe
To: /content/wiki.tsv
100%|██████████| 37.0M/37.0M [00:00<00:00, 94.4MB/s]


'wiki.tsv'

In [None]:
import pandas as pd

# Load the tab-separated dataset into `df`, the frame every later cell extends.
df = pd.read_csv('wiki.tsv',sep = '\t')
# Bare expression: renders the frame as the cell output.
df

Unnamed: 0,id,sourceLanguage,targetLanguage,mt,source.content,target.content,mt.engine,mt.content,mt_to_pe,source_to_mt,source_to_pe,source.content.tok,target.content.tok,mt.content.tok,ter,ter_for_pe
0,746864/2,en,ha,,Wards In Rano The 10 wards in Rano local gover...,Unguwani a Rano guda 10 ne da ke karamar hukum...,Google,Wards A Rano Wards 10 da ke karamar hukumar Ra...,0.967532,0.955128,0.987179,Wards In Rano The 10 wards in Rano local gover...,Unguwani a Rano guda 10 ne da ke karamar hukum...,Wards A Rano Wards 10 da ke karamar hukumar Ra...,S S C S C D C C C C C C C C C C C C C C C C C ...,S S C S C I C C C C C C C C C C C C C C C C C ...
1,746872/1,en,ha,,Hagar and Ishmael in the Desert by François-Jo...,Hajara da Isma'ilu a cikin jeji .Hotan Françoi...,Google,Hagar da Isma'ilu a cikin jeji ta François-Jos...,0.906667,0.992647,0.900000,Hagar and Ishmael in the Desert by François-Jo...,Hajara da Isma 'ilu a cikin jeji .Hotan Franço...,Hagar da Isma 'ilu a cikin jeji ta François-Jo...,S C C C C C C S C C C C C S C C D C C C C C C ...,S C C C C C C S C C C C C S C C I S C C C C C ...
2,746872/14,en,ha,,There are early records of concubines allegedl...,Akwai bayanan jita-jita na wasu ƙwaraƙwaran da...,Google,Akwai bayanan farko na wasu ƙwaraƙwaran da aka...,0.958333,0.976285,0.935606,There are early records of concubines allegedl...,Akwai bayanan jita-jita na wasu ƙwaraƙwaran da...,Akwai bayanan farko na wasu ƙwaraƙwaran da aka...,C C S C C C C C C C C C C C C C D C C S C S S ...,C C S C C C C C C C C C C C C C I S C C C S S ...
3,746872/27,en,ha,,Hong Kong officially abolished the Great Qing ...,"A shekarar 1971, Hong Kong ya soke babban tsar...",Google,"A shekarar 1971, Hong Kong ya soke babban tsar...",0.954248,0.876712,0.836601,Hong Kong officially abolished the Great Qing ...,"A shekarar 1971 , Hong Kong ya soke babban tsa...","A shekarar 1971 , Hong Kong ya soke babban tsa...",C C C C C C C C C C C C C C C C C C C C C C C ...,C C C C C C C C C C C C C C C C C C C C C C C ...
4,746872/42,en,ha,,"Among the Israelites, men commonly acknowledge...","A cikin Isra’ilawa, maza sun yarda da ƙwaraƙwa...",Google,"A cikin Isra’ilawa, maza sun saba yarda da ƙwa...",0.961240,0.934783,0.898551,"Among the Israelites , men commonly acknowledg...","A cikin Isra ’ ilawa , maza sun yarda da ƙwara...","A cikin Isra ’ ilawa , maza sun saba yarda da ...",C C C C C C C C I C C C C C C C C C C C C C C ...,C C C C C C C C D C C C C C C C C C C C C C C ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26235,2140040/17,en,ha,,"Abdoulaye was born in 1952, in the village of ...",An haifi Abdoulaye a shekarar 1952 a kauyen Mé...,Google,An haifi Abdoulaye a shekarar 1952 a kauyen Mé...,0.993865,0.981928,0.975904,"Abdoulaye was born in 1952 , in the village of...",An haifi Abdoulaye a shekarar 1952 a kauyen Mé...,An haifi Abdoulaye a shekarar 1952 a kauyen Mé...,C C C C C C C C C C C C C C C C C C C C C C C ...,C C C C C C C C C C C C C C C C C C C C C C C ...
26236,2140040/19,en,ha,,Abdoulaye has been married several times and h...,Abdoulaye ya yi aure sau da yawa kuma yana da ...,Google,Abdoulaye ya yi aure sau da yawa kuma yana da ...,0.870968,0.983871,0.885246,Abdoulaye has been married several times and h...,Abdoulaye ya yi aure sau da yawa kuma yana da ...,Abdoulaye ya yi aure sau da yawa kuma yana da ...,C C C C C C C C C C I C C C,C C C C C C C C C C C D C
26237,2140040/5,en,ha,,Abdoulaye's companies have created employment ...,Kamfanonin Abdoulaye sun samar da ayyukan yi d...,Google,Kamfanonin Abdoulaye sun samar da ayyukan yi d...,0.980000,0.915888,0.934579,Abdoulaye 's companies have created employment...,Kamfanonin Abdoulaye sun samar da ayyukan yi d...,Kamfanonin Abdoulaye sun samar da ayyukan yi d...,C C C C C C C C C C C S C C C C C C C C C C C ...,C C C C C C C C C C C S C C C C C C C C C C C ...
26238,2140040/7,en,ha,,Abdoulaye is also a principal shareholder in I...,Abdoulaye kuma babban mai hannun jari ne a kam...,Google,Abdoulaye kuma babban mai hannun jari ne a kam...,0.995968,0.942966,0.939163,Abdoulaye is also a principal shareholder in I...,Abdoulaye kuma babban mai hannun jari ne a kam...,Abdoulaye kuma babban mai hannun jari ne a kam...,C C C C C C C C C C C C C C C C C C C C C C C ...,C C C C C C C C C C C C C C C C C C C C C C C ...


In [None]:
# Re-bind `model` to an aligner backed by the "Davlan/afro-xlmr-base" checkpoint
# (all other constructor arguments keep their defaults). This replaces the
# aligner instance created in the earlier cell.
model = SimilarityAlign(model = "Davlan/afro-xlmr-base")

Downloading (…)okenizer_config.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def find_inserted_pe(pe):
  """Return the positions of "I" tags in a whitespace-separated tag string.

  Args:
    pe: string of tags separated by whitespace (e.g. "S C I C").

  Returns:
    List of 0-based positions whose tag is exactly "I", in ascending order.
  """
  return [position for position, tag in enumerate(pe.split()) if tag == "I"]

In [None]:
# For every row, record the positions tagged "I" in the `ter_for_pe` tag string.
df["insert_pe_idx"] = df.apply(lambda row: find_inserted_pe(row["ter_for_pe"]), axis = 1)
# Preview the first rows with the new column.
df.head()

Unnamed: 0,id,sourceLanguage,targetLanguage,mt,source.content,target.content,mt.engine,mt.content,mt_to_pe,source_to_mt,source_to_pe,source.content.tok,target.content.tok,mt.content.tok,ter,ter_for_pe,insert_pe_idx
0,746864/2,en,ha,,Wards In Rano The 10 wards in Rano local gover...,Unguwani a Rano guda 10 ne da ke karamar hukum...,Google,Wards A Rano Wards 10 da ke karamar hukumar Ra...,0.967532,0.955128,0.987179,Wards In Rano The 10 wards in Rano local gover...,Unguwani a Rano guda 10 ne da ke karamar hukum...,Wards A Rano Wards 10 da ke karamar hukumar Ra...,S S C S C D C C C C C C C C C C C C C C C C C ...,S S C S C I C C C C C C C C C C C C C C C C C ...,[5]
1,746872/1,en,ha,,Hagar and Ishmael in the Desert by François-Jo...,Hajara da Isma'ilu a cikin jeji .Hotan Françoi...,Google,Hagar da Isma'ilu a cikin jeji ta François-Jos...,0.906667,0.992647,0.9,Hagar and Ishmael in the Desert by François-Jo...,Hajara da Isma 'ilu a cikin jeji .Hotan Franço...,Hagar da Isma 'ilu a cikin jeji ta François-Jo...,S C C C C C C S C C C C C S C C D C C C C C C ...,S C C C C C C S C C C C C S C C I S C C C C C ...,"[16, 27, 30]"
2,746872/14,en,ha,,There are early records of concubines allegedl...,Akwai bayanan jita-jita na wasu ƙwaraƙwaran da...,Google,Akwai bayanan farko na wasu ƙwaraƙwaran da aka...,0.958333,0.976285,0.935606,There are early records of concubines allegedl...,Akwai bayanan jita-jita na wasu ƙwaraƙwaran da...,Akwai bayanan farko na wasu ƙwaraƙwaran da aka...,C C S C C C C C C C C C C C C C D C C S C S S ...,C C S C C C C C C C C C C C C C I S C C C S S ...,"[16, 38]"
3,746872/27,en,ha,,Hong Kong officially abolished the Great Qing ...,"A shekarar 1971, Hong Kong ya soke babban tsar...",Google,"A shekarar 1971, Hong Kong ya soke babban tsar...",0.954248,0.876712,0.836601,Hong Kong officially abolished the Great Qing ...,"A shekarar 1971 , Hong Kong ya soke babban tsa...","A shekarar 1971 , Hong Kong ya soke babban tsa...",C C C C C C C C C C C C C C C C C C C C C C C ...,C C C C C C C C C C C C C C C C C C C C C C C ...,"[48, 49, 66, 67]"
4,746872/42,en,ha,,"Among the Israelites, men commonly acknowledge...","A cikin Isra’ilawa, maza sun yarda da ƙwaraƙwa...",Google,"A cikin Isra’ilawa, maza sun saba yarda da ƙwa...",0.96124,0.934783,0.898551,"Among the Israelites , men commonly acknowledg...","A cikin Isra ’ ilawa , maza sun yarda da ƙwara...","A cikin Isra ’ ilawa , maza sun saba yarda da ...",C C C C C C C C I C C C C C C C C C C C C C C ...,C C C C C C C C D C C C C C C C C C C C C C C ...,[]


In [None]:
# Align the tokenised source with the tokenised target of every row. Each cell
# of the new column holds a list of (source_index, target_index) tuples.
# Every row triggers its own encoder forward pass, so this cell is the slow one.
df["align_src_pe"] = df.apply(lambda row: model.align_sentences(row["source.content.tok"], row["target.content.tok"]), axis = 1)
# Preview the first rows with the new column.
df.head()

Unnamed: 0,id,sourceLanguage,targetLanguage,mt,source.content,target.content,mt.engine,mt.content,mt_to_pe,source_to_mt,source_to_pe,source.content.tok,target.content.tok,mt.content.tok,ter,ter_for_pe,insert_pe_idx,align_src_pe
0,746864/2,en,ha,,Wards In Rano The 10 wards in Rano local gover...,Unguwani a Rano guda 10 ne da ke karamar hukum...,Google,Wards A Rano Wards 10 da ke karamar hukumar Ra...,0.967532,0.955128,0.987179,Wards In Rano The 10 wards in Rano local gover...,Unguwani a Rano guda 10 ne da ke karamar hukum...,Wards A Rano Wards 10 da ke karamar hukumar Ra...,S S C S C D C C C C C C C C C C C C C C C C C ...,S S C S C I C C C C C C C C C C C C C C C C C ...,[5],"[(0, 0), (1, 1), (2, 2), (4, 4), (7, 10), (8, ..."
1,746872/1,en,ha,,Hagar and Ishmael in the Desert by François-Jo...,Hajara da Isma'ilu a cikin jeji .Hotan Françoi...,Google,Hagar da Isma'ilu a cikin jeji ta François-Jos...,0.906667,0.992647,0.9,Hagar and Ishmael in the Desert by François-Jo...,Hajara da Isma 'ilu a cikin jeji .Hotan Franço...,Hagar da Isma 'ilu a cikin jeji ta François-Jo...,S C C C C C C S C C C C C S C C D C C C C C C ...,S C C C C C C S C C C C C S C C I S C C C C C ...,"[16, 27, 30]","[(0, 0), (1, 1), (2, 2), (2, 3), (3, 4), (7, 8..."
2,746872/14,en,ha,,There are early records of concubines allegedl...,Akwai bayanan jita-jita na wasu ƙwaraƙwaran da...,Google,Akwai bayanan farko na wasu ƙwaraƙwaran da aka...,0.958333,0.976285,0.935606,There are early records of concubines allegedl...,Akwai bayanan jita-jita na wasu ƙwaraƙwaran da...,Akwai bayanan farko na wasu ƙwaraƙwaran da aka...,C C S C C C C C C C C C C C C C D C C S C S S ...,C C S C C C C C C C C C C C C C I S C C C S S ...,"[16, 38]","[(0, 0), (1, 0), (3, 1), (4, 3), (5, 4), (5, 5..."
3,746872/27,en,ha,,Hong Kong officially abolished the Great Qing ...,"A shekarar 1971, Hong Kong ya soke babban tsar...",Google,"A shekarar 1971, Hong Kong ya soke babban tsar...",0.954248,0.876712,0.836601,Hong Kong officially abolished the Great Qing ...,"A shekarar 1971 , Hong Kong ya soke babban tsa...","A shekarar 1971 , Hong Kong ya soke babban tsa...",C C C C C C C C C C C C C C C C C C C C C C C ...,C C C C C C C C C C C C C C C C C C C C C C C ...,"[48, 49, 66, 67]","[(0, 4), (1, 5), (3, 6), (3, 7), (6, 12), (8, ..."
4,746872/42,en,ha,,"Among the Israelites, men commonly acknowledge...","A cikin Isra’ilawa, maza sun yarda da ƙwaraƙwa...",Google,"A cikin Isra’ilawa, maza sun saba yarda da ƙwa...",0.96124,0.934783,0.898551,"Among the Israelites , men commonly acknowledg...","A cikin Isra ’ ilawa , maza sun yarda da ƙwara...","A cikin Isra ’ ilawa , maza sun saba yarda da ...",C C C C C C C C I C C C C C C C C C C C C C C ...,C C C C C C C C D C C C C C C C C C C C C C C ...,[],"[(0, 0), (0, 1), (2, 2), (3, 5), (4, 6), (6, 8..."


In [None]:
# Checkpoint the frame (including the alignment column) as a tab-separated file.
# NOTE(review): the row index is written too (no index=False); this is
# presumably where the extra "Unnamed: 0" column seen after reloading comes
# from - confirm before changing, downstream readers may rely on the layout.
df.to_csv("save.tsv", sep="\t")

In [None]:
# Download a second Drive file. It is stored under the same local name,
# wiki.tsv, so it overwrites the file fetched by the earlier download cell.
# Judging by the preview below, this file already contains the
# `insert_pe_idx` and `align_src_pe` columns.
path = "https://drive.google.com/file/d/1_r8mqydHQav1GpSj_3tw8Er38yE3JZZI/view?usp=sharing"
file_name = "wiki.tsv"
gdown.download(path, file_name, quiet=False,fuzzy=True)

Downloading...
From: https://drive.google.com/uc?id=1_r8mqydHQav1GpSj_3tw8Er38yE3JZZI
To: /content/wiki.tsv
100%|██████████| 43.9M/43.9M [00:00<00:00, 60.0MB/s]


'wiki.tsv'

In [None]:
import pandas as pd
# Load the re-downloaded file into a separate frame named `wiki`.
# NOTE(review): the later cells visible in this notebook keep operating on
# `df`, not on `wiki`. Columns holding lists presumably come back from the TSV
# as plain strings - verify before feeding `wiki` to the helper functions.
wiki = pd.read_csv('wiki.tsv',sep = '\t')

In [None]:
wiki.head()

Unnamed: 0.1,Unnamed: 0,id,sourceLanguage,targetLanguage,mt,source.content,target.content,mt.engine,mt.content,mt_to_pe,source_to_mt,source_to_pe,source.content.tok,target.content.tok,mt.content.tok,ter,ter_for_pe,insert_pe_idx,align_src_pe
0,0,746864/2,en,ha,,Wards In Rano The 10 wards in Rano local gover...,Unguwani a Rano guda 10 ne da ke karamar hukum...,Google,Wards A Rano Wards 10 da ke karamar hukumar Ra...,0.967532,0.955128,0.987179,Wards In Rano The 10 wards in Rano local gover...,Unguwani a Rano guda 10 ne da ke karamar hukum...,Wards A Rano Wards 10 da ke karamar hukumar Ra...,S S C S C D C C C C C C C C C C C C C C C C C ...,S S C S C I C C C C C C C C C C C C C C C C C ...,[5],"[(0, 0), (1, 1), (2, 2), (4, 4), (7, 10), (8, ..."
1,1,746872/1,en,ha,,Hagar and Ishmael in the Desert by François-Jo...,Hajara da Isma'ilu a cikin jeji .Hotan Françoi...,Google,Hagar da Isma'ilu a cikin jeji ta François-Jos...,0.906667,0.992647,0.9,Hagar and Ishmael in the Desert by François-Jo...,Hajara da Isma 'ilu a cikin jeji .Hotan Franço...,Hagar da Isma 'ilu a cikin jeji ta François-Jo...,S C C C C C C S C C C C C S C C D C C C C C C ...,S C C C C C C S C C C C C S C C I S C C C C C ...,"[16, 27, 30]","[(0, 0), (1, 1), (2, 2), (2, 3), (3, 4), (7, 8..."
2,2,746872/14,en,ha,,There are early records of concubines allegedl...,Akwai bayanan jita-jita na wasu ƙwaraƙwaran da...,Google,Akwai bayanan farko na wasu ƙwaraƙwaran da aka...,0.958333,0.976285,0.935606,There are early records of concubines allegedl...,Akwai bayanan jita-jita na wasu ƙwaraƙwaran da...,Akwai bayanan farko na wasu ƙwaraƙwaran da aka...,C C S C C C C C C C C C C C C C D C C S C S S ...,C C S C C C C C C C C C C C C C I S C C C S S ...,"[16, 38]","[(0, 0), (1, 0), (3, 1), (4, 3), (5, 4), (5, 5..."
3,3,746872/27,en,ha,,Hong Kong officially abolished the Great Qing ...,"A shekarar 1971, Hong Kong ya soke babban tsar...",Google,"A shekarar 1971, Hong Kong ya soke babban tsar...",0.954248,0.876712,0.836601,Hong Kong officially abolished the Great Qing ...,"A shekarar 1971 , Hong Kong ya soke babban tsa...","A shekarar 1971 , Hong Kong ya soke babban tsa...",C C C C C C C C C C C C C C C C C C C C C C C ...,C C C C C C C C C C C C C C C C C C C C C C C ...,"[48, 49, 66, 67]","[(0, 4), (1, 5), (3, 6), (3, 7), (6, 12), (8, ..."
4,4,746872/42,en,ha,,"Among the Israelites, men commonly acknowledge...","A cikin Isra’ilawa, maza sun yarda da ƙwaraƙwa...",Google,"A cikin Isra’ilawa, maza sun saba yarda da ƙwa...",0.96124,0.934783,0.898551,"Among the Israelites , men commonly acknowledg...","A cikin Isra ’ ilawa , maza sun yarda da ƙwara...","A cikin Isra ’ ilawa , maza sun saba yarda da ...",C C C C C C C C I C C C C C C C C C C C C C C ...,C C C C C C C C D C C C C C C C C C C C C C C ...,[],"[(0, 0), (0, 1), (2, 2), (3, 5), (4, 6), (6, 8..."


In [None]:
def find_aligned_inserted_src(insert_pe_idx, align_src_pe):
  """Collect the source indices aligned to the given target positions.

  Args:
    insert_pe_idx: sequence of target-side positions to look up.
    align_src_pe: sequence of alignment pairs, read as
      (source_index, target_index).

  Returns:
    List of source indices, grouped in the order of `insert_pe_idx` and,
    within one position, in the order the pairs appear in `align_src_pe`.
    Empty when either argument is empty.
  """
  if len(insert_pe_idx) == 0 or len(align_src_pe) == 0:
    return []
  return [
    pair[0]
    for target_pos in insert_pe_idx
    for pair in align_src_pe
    if pair[1] == target_pos
  ]

In [None]:
# For every row, look up which source indices are aligned to the target
# positions stored in `insert_pe_idx`, using the pairs in `align_src_pe`.
df["src_aligned_idx"] = df.apply(lambda row: find_aligned_inserted_src(row["insert_pe_idx"], row["align_src_pe"]), axis = 1)

In [None]:
df

Unnamed: 0,id,sourceLanguage,targetLanguage,mt,source.content,target.content,mt.engine,mt.content,mt_to_pe,source_to_mt,source_to_pe,source.content.tok,target.content.tok,mt.content.tok,ter,ter_for_pe,insert_pe_idx,align_src_pe,src_aligned_idx
0,746864/2,en,ha,,Wards In Rano The 10 wards in Rano local gover...,Unguwani a Rano guda 10 ne da ke karamar hukum...,Google,Wards A Rano Wards 10 da ke karamar hukumar Ra...,0.967532,0.955128,0.987179,Wards In Rano The 10 wards in Rano local gover...,Unguwani a Rano guda 10 ne da ke karamar hukum...,Wards A Rano Wards 10 da ke karamar hukumar Ra...,S S C S C D C C C C C C C C C C C C C C C C C ...,S S C S C I C C C C C C C C C C C C C C C C C ...,[5],"[(0, 0), (1, 1), (2, 2), (4, 4), (7, 10), (8, ...",[]
1,746872/1,en,ha,,Hagar and Ishmael in the Desert by François-Jo...,Hajara da Isma'ilu a cikin jeji .Hotan Françoi...,Google,Hagar da Isma'ilu a cikin jeji ta François-Jos...,0.906667,0.992647,0.900000,Hagar and Ishmael in the Desert by François-Jo...,Hajara da Isma 'ilu a cikin jeji .Hotan Franço...,Hagar da Isma 'ilu a cikin jeji ta François-Jo...,S C C C C C C S C C C C C S C C D C C C C C C ...,S C C C C C C S C C C C C S C C I S C C C C C ...,"[16, 27, 30]","[(0, 0), (1, 1), (2, 2), (2, 3), (3, 4), (7, 8...","[17, 23, 22]"
2,746872/14,en,ha,,There are early records of concubines allegedl...,Akwai bayanan jita-jita na wasu ƙwaraƙwaran da...,Google,Akwai bayanan farko na wasu ƙwaraƙwaran da aka...,0.958333,0.976285,0.935606,There are early records of concubines allegedl...,Akwai bayanan jita-jita na wasu ƙwaraƙwaran da...,Akwai bayanan farko na wasu ƙwaraƙwaran da aka...,C C S C C C C C C C C C C C C C D C C S C S S ...,C C S C C C C C C C C C C C C C I S C C C S S ...,"[16, 38]","[(0, 0), (1, 0), (3, 1), (4, 3), (5, 4), (5, 5...","[12, 35]"
3,746872/27,en,ha,,Hong Kong officially abolished the Great Qing ...,"A shekarar 1971, Hong Kong ya soke babban tsar...",Google,"A shekarar 1971, Hong Kong ya soke babban tsar...",0.954248,0.876712,0.836601,Hong Kong officially abolished the Great Qing ...,"A shekarar 1971 , Hong Kong ya soke babban tsa...","A shekarar 1971 , Hong Kong ya soke babban tsa...",C C C C C C C C C C C C C C C C C C C C C C C ...,C C C C C C C C C C C C C C C C C C C C C C C ...,"[48, 49, 66, 67]","[(0, 4), (1, 5), (3, 6), (3, 7), (6, 12), (8, ...",[47]
4,746872/42,en,ha,,"Among the Israelites, men commonly acknowledge...","A cikin Isra’ilawa, maza sun yarda da ƙwaraƙwa...",Google,"A cikin Isra’ilawa, maza sun saba yarda da ƙwa...",0.961240,0.934783,0.898551,"Among the Israelites , men commonly acknowledg...","A cikin Isra ’ ilawa , maza sun yarda da ƙwara...","A cikin Isra ’ ilawa , maza sun saba yarda da ...",C C C C C C C C I C C C C C C C C C C C C C C ...,C C C C C C C C D C C C C C C C C C C C C C C ...,[],"[(0, 0), (0, 1), (2, 2), (3, 5), (4, 6), (6, 8...",[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26235,2140040/17,en,ha,,"Abdoulaye was born in 1952, in the village of ...",An haifi Abdoulaye a shekarar 1952 a kauyen Mé...,Google,An haifi Abdoulaye a shekarar 1952 a kauyen Mé...,0.993865,0.981928,0.975904,"Abdoulaye was born in 1952 , in the village of...",An haifi Abdoulaye a shekarar 1952 a kauyen Mé...,An haifi Abdoulaye a shekarar 1952 a kauyen Mé...,C C C C C C C C C C C C C C C C C C C C C C C ...,C C C C C C C C C C C C C C C C C C C C C C C ...,[],"[(0, 2), (1, 1), (2, 1), (3, 3), (4, 5), (6, 6...",[]
26236,2140040/19,en,ha,,Abdoulaye has been married several times and h...,Abdoulaye ya yi aure sau da yawa kuma yana da ...,Google,Abdoulaye ya yi aure sau da yawa kuma yana da ...,0.870968,0.983871,0.885246,Abdoulaye has been married several times and h...,Abdoulaye ya yi aure sau da yawa kuma yana da ...,Abdoulaye ya yi aure sau da yawa kuma yana da ...,C C C C C C C C C C I C C C,C C C C C C C C C C C D C,[],"[(0, 0), (1, 1), (2, 2), (3, 3), (5, 4), (6, 7...",[]
26237,2140040/5,en,ha,,Abdoulaye's companies have created employment ...,Kamfanonin Abdoulaye sun samar da ayyukan yi d...,Google,Kamfanonin Abdoulaye sun samar da ayyukan yi d...,0.980000,0.915888,0.934579,Abdoulaye 's companies have created employment...,Kamfanonin Abdoulaye sun samar da ayyukan yi d...,Kamfanonin Abdoulaye sun samar da ayyukan yi d...,C C C C C C C C C C C S C C C C C C C C C C C ...,C C C C C C C C C C C S C C C C C C C C C C C ...,[],"[(0, 1), (1, 13), (2, 0), (3, 2), (4, 3), (5, ...",[]
26238,2140040/7,en,ha,,Abdoulaye is also a principal shareholder in I...,Abdoulaye kuma babban mai hannun jari ne a kam...,Google,Abdoulaye kuma babban mai hannun jari ne a kam...,0.995968,0.942966,0.939163,Abdoulaye is also a principal shareholder in I...,Abdoulaye kuma babban mai hannun jari ne a kam...,Abdoulaye kuma babban mai hannun jari ne a kam...,C C C C C C C C C C C C C C C C C C C C C C C ...,C C C C C C C C C C C C C C C C C C C C C C C ...,[],"[(0, 0), (2, 1), (4, 2), (5, 5), (6, 7), (8, 8...",[]


In [None]:
# Checkpoint again, now including `src_aligned_idx`; this overwrites the
# save.tsv written by the earlier checkpoint cell.
df.to_csv("save.tsv", sep="\t")

In [None]:
def gen_labels(row):
  """Build a space-separated 0/1 label string for one row's source tokens.

  Args:
    row: mapping with "source.content.tok" (whitespace-separated tokens) and
      "src_aligned_idx" (collection of token positions to mark).

  Returns:
    One label per source token joined by single spaces: '1' when the token's
    position is in `row["src_aligned_idx"]`, otherwise '0'.
  """
  token_count = len(row["source.content.tok"].split())
  return ' '.join(
    '1' if position in row["src_aligned_idx"] else '0'
    for position in range(token_count)
  )

In [None]:
# Turn the aligned source indices of every row into a 0/1 label string with
# one label per source token.
df["labels"] = df.apply(lambda row: gen_labels(row), axis = 1)
# Bare expression: renders the frame as the cell output.
df

Unnamed: 0,id,sourceLanguage,targetLanguage,mt,source.content,target.content,mt.engine,mt.content,mt_to_pe,source_to_mt,source_to_pe,source.content.tok,target.content.tok,mt.content.tok,ter,ter_for_pe,insert_pe_idx,align_src_pe,src_aligned_idx,labels
0,746864/2,en,ha,,Wards In Rano The 10 wards in Rano local gover...,Unguwani a Rano guda 10 ne da ke karamar hukum...,Google,Wards A Rano Wards 10 da ke karamar hukumar Ra...,0.967532,0.955128,0.987179,Wards In Rano The 10 wards in Rano local gover...,Unguwani a Rano guda 10 ne da ke karamar hukum...,Wards A Rano Wards 10 da ke karamar hukumar Ra...,S S C S C D C C C C C C C C C C C C C C C C C ...,S S C S C I C C C C C C C C C C C C C C C C C ...,[5],"[(0, 0), (1, 1), (2, 2), (4, 4), (7, 10), (8, ...",[],0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
1,746872/1,en,ha,,Hagar and Ishmael in the Desert by François-Jo...,Hajara da Isma'ilu a cikin jeji .Hotan Françoi...,Google,Hagar da Isma'ilu a cikin jeji ta François-Jos...,0.906667,0.992647,0.900000,Hagar and Ishmael in the Desert by François-Jo...,Hajara da Isma 'ilu a cikin jeji .Hotan Franço...,Hagar da Isma 'ilu a cikin jeji ta François-Jo...,S C C C C C C S C C C C C S C C D C C C C C C ...,S C C C C C C S C C C C C S C C I S C C C C C ...,"[16, 27, 30]","[(0, 0), (1, 1), (2, 2), (2, 3), (3, 4), (7, 8...","[17, 23, 22]",0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0
2,746872/14,en,ha,,There are early records of concubines allegedl...,Akwai bayanan jita-jita na wasu ƙwaraƙwaran da...,Google,Akwai bayanan farko na wasu ƙwaraƙwaran da aka...,0.958333,0.976285,0.935606,There are early records of concubines allegedl...,Akwai bayanan jita-jita na wasu ƙwaraƙwaran da...,Akwai bayanan farko na wasu ƙwaraƙwaran da aka...,C C S C C C C C C C C C C C C C D C C S C S S ...,C C S C C C C C C C C C C C C C I S C C C S S ...,"[16, 38]","[(0, 0), (1, 0), (3, 1), (4, 3), (5, 4), (5, 5...","[12, 35]",0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 ...
3,746872/27,en,ha,,Hong Kong officially abolished the Great Qing ...,"A shekarar 1971, Hong Kong ya soke babban tsar...",Google,"A shekarar 1971, Hong Kong ya soke babban tsar...",0.954248,0.876712,0.836601,Hong Kong officially abolished the Great Qing ...,"A shekarar 1971 , Hong Kong ya soke babban tsa...","A shekarar 1971 , Hong Kong ya soke babban tsa...",C C C C C C C C C C C C C C C C C C C C C C C ...,C C C C C C C C C C C C C C C C C C C C C C C ...,"[48, 49, 66, 67]","[(0, 4), (1, 5), (3, 6), (3, 7), (6, 12), (8, ...",[47],0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
4,746872/42,en,ha,,"Among the Israelites, men commonly acknowledge...","A cikin Isra’ilawa, maza sun yarda da ƙwaraƙwa...",Google,"A cikin Isra’ilawa, maza sun saba yarda da ƙwa...",0.961240,0.934783,0.898551,"Among the Israelites , men commonly acknowledg...","A cikin Isra ’ ilawa , maza sun yarda da ƙwara...","A cikin Isra ’ ilawa , maza sun saba yarda da ...",C C C C C C C C I C C C C C C C C C C C C C C ...,C C C C C C C C D C C C C C C C C C C C C C C ...,[],"[(0, 0), (0, 1), (2, 2), (3, 5), (4, 6), (6, 8...",[],0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26235,2140040/17,en,ha,,"Abdoulaye was born in 1952, in the village of ...",An haifi Abdoulaye a shekarar 1952 a kauyen Mé...,Google,An haifi Abdoulaye a shekarar 1952 a kauyen Mé...,0.993865,0.981928,0.975904,"Abdoulaye was born in 1952 , in the village of...",An haifi Abdoulaye a shekarar 1952 a kauyen Mé...,An haifi Abdoulaye a shekarar 1952 a kauyen Mé...,C C C C C C C C C C C C C C C C C C C C C C C ...,C C C C C C C C C C C C C C C C C C C C C C C ...,[],"[(0, 2), (1, 1), (2, 1), (3, 3), (4, 5), (6, 6...",[],0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
26236,2140040/19,en,ha,,Abdoulaye has been married several times and h...,Abdoulaye ya yi aure sau da yawa kuma yana da ...,Google,Abdoulaye ya yi aure sau da yawa kuma yana da ...,0.870968,0.983871,0.885246,Abdoulaye has been married several times and h...,Abdoulaye ya yi aure sau da yawa kuma yana da ...,Abdoulaye ya yi aure sau da yawa kuma yana da ...,C C C C C C C C C C I C C C,C C C C C C C C C C C D C,[],"[(0, 0), (1, 1), (2, 2), (3, 3), (5, 4), (6, 7...",[],0 0 0 0 0 0 0 0 0 0 0
26237,2140040/5,en,ha,,Abdoulaye's companies have created employment ...,Kamfanonin Abdoulaye sun samar da ayyukan yi d...,Google,Kamfanonin Abdoulaye sun samar da ayyukan yi d...,0.980000,0.915888,0.934579,Abdoulaye 's companies have created employment...,Kamfanonin Abdoulaye sun samar da ayyukan yi d...,Kamfanonin Abdoulaye sun samar da ayyukan yi d...,C C C C C C C C C C C S C C C C C C C C C C C ...,C C C C C C C C C C C S C C C C C C C C C C C ...,[],"[(0, 1), (1, 13), (2, 0), (3, 2), (4, 3), (5, ...",[],0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
26238,2140040/7,en,ha,,Abdoulaye is also a principal shareholder in I...,Abdoulaye kuma babban mai hannun jari ne a kam...,Google,Abdoulaye kuma babban mai hannun jari ne a kam...,0.995968,0.942966,0.939163,Abdoulaye is also a principal shareholder in I...,Abdoulaye kuma babban mai hannun jari ne a kam...,Abdoulaye kuma babban mai hannun jari ne a kam...,C C C C C C C C C C C C C C C C C C C C C C C ...,C C C C C C C C C C C C C C C C C C C C C C C ...,[],"[(0, 0), (2, 1), (4, 2), (5, 5), (6, 7), (8, 8...",[],0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...


In [None]:
# Write the final frame, including the `labels` column, as a tab-separated file.
# The row index is written as well because index=False is not passed.
df.to_csv("label_aligned_src.tsv", sep="\t")