## Analysis: Where are the unknowns pointing?

### Data downloads and setup

In [3]:
import yaml
import os
import argparse
import sys
from collections import Counter
from collections import defaultdict

In [4]:
sys.path.insert(0, "../build")

In [5]:
import pybergamot
from pybergamot import Service, Response, ResponseOptions

In [6]:
def build_config(BERGAMOT_ARCHIVE):   
    config = {
       "models": [os.path.join(BERGAMOT_ARCHIVE, "model.intgemm.alphas.bin")],
       "shortlist": [os.path.join(BERGAMOT_ARCHIVE, "lex.s2t.bin"), True, 50, 50],
       "vocabs": [
           os.path.join(BERGAMOT_ARCHIVE, "vocab.deen.spm"),
           os.path.join(BERGAMOT_ARCHIVE, "vocab.deen.spm"),
           ],
       "ssplit-prefix-file": os.path.join(BERGAMOT_ARCHIVE, "nonbreaking_prefix.en"),
       "max-length-break": 128,
       "mini-batch-words": 1024,
       "workspace": 128,
       "skip-cost": True,
       "cpu-threads": 40,
       "quiet": True,
       "quiet-translation": True,
       "gemm-precision": "int8shiftAlphaAll",
       "alignment": True,
       "allow-unk": True,
       "log": "unk-analyis.log",
       "log-level": "debug"
    }
    return config

In [7]:
def build_service(bergamot_path):
    config = build_config(bergamot_path)
    configStr = yaml.dump(config, sort_keys=False)
    service = Service(configStr)
    return service

In [8]:
ENDE_BUNDLE = '../bergamot-translator-tests/models/deen/ende.student.tiny.for.regression.tests/'

## Analysis: Unknowns on MTNT

In [48]:
class Analyzer:
    def __init__(self, service, options):
        self.service = service
        self.opts = options
        
    def unk_mappings(self, content):
        mappings = defaultdict(list)
        response = self.service.translate(content, self.opts)
        for sentenceIdx, alignment in enumerate(response.alignments):
            for point in alignment:
                sbr = response.source.word(sentenceIdx, point.src)
                tbr = response.target.word(sentenceIdx, point.tgt)
                if response.source.isUnknown(sentenceIdx, point.src):
                    mappings[sbr].append(tbr)

        return mappings   
    
    def df(self, unk_mappings):
        global_counter = Counter()
        for k in unk_mappings:
            global_counter += Counter(global_d[k])

        dfdict = {}
        dfdict["names"] = list(global_counter.keys())
        dfdict["occurences"] = list(global_counter.values())
        df = pd.DataFrame(dfdict)
        sorted_df = df.sort_values(by=['occurences'], ascending=False).head(50)
        return sorted_df
        
    def ipydisplay(self, df):
        from IPython.display import display, HTML
        return display(HTML(df.to_html(index=False)))


In [14]:
def mtnt_parallel_en(lang='fr'):
    import pandas as pd
    data = pd.read_csv('MTNT/train/train.en-{}.tsv'.format(lang), sep='\t', 
                       error_bad_lines=False, names=['No', 'src', 'tgt'])
    size = data.size
    source_lines = data['src'].values
    return '\n'.join(source_lines)

def mtnt_monolingual():
    source_lines = []
    with open('MTNT/monolingual/train.en') as fp:
        source_lines = fp.read().splitlines()
    return '\n'.join(source_lines)

In [50]:
service = build_service(ENDE_BUNDLE)

options = ResponseOptions();
options.alignment = True
options.qualityScores = True
options.alignmentThreshold = 0.2
analyzer = Analyzer(service, options)

In [None]:
data = mtnt_parallel_en()
analyzer.unk_mappings(data)

In [19]:
for unk in unique_unks:
    print(unk)

NameError: name 'unique_unks' is not defined

## Naive Algorithm: Replace targets mapping to source-unks

In [9]:

def replace_unk_from_source(response):
    target = response.target.text
    replace_ops = []
    for sentenceIdx, alignment in enumerate(response.alignments):
        for point in alignment:
            if response.source.isUnknown(sentenceIdx, point.src):
                source = response.source.text
                sourceByteRange = response.source.wordAsByteRange(sentenceIdx, point.src)                    
                targetByteRange = response.target.wordAsByteRange(sentenceIdx, point.tgt)
                replace_ops.append((targetByteRange, sourceByteRange))

    replace_ops = sorted(replace_ops, key=lambda x: x[0].begin)
    sourceBytes = bytearray(response.source.text.encode())
    targetBytes = bytearray(response.target.text.encode())
    previous = 0
    replaced = bytearray()
    for tbr, sbr in replace_ops:
        replaced += targetBytes[previous:tbr.begin]
        replaced += sourceBytes[sbr.begin:sbr.end]
        previous = tbr.end

    replaced += targetBytes[previous:]
    return replaced.decode("utf-8")



In [10]:
def count_unks_in_source(response):
    _sum = 0
    for sentenceIdx in range(response.source.numSentences()):
        for wordIdx in range(response.source.numWords(sentenceIdx)):
            _sum += int(response.source.isUnknown(sentenceIdx, wordIdx))
    return _sum

In [12]:
samples = open("examples.txt").read().splitlines()    

service = build_service(ENDE_BUNDLE)
opts = ResponseOptions()
opts.alignment = True
opts.alignmentThreshold = 1.0

for idx, sample in enumerate(samples, 1):
    response = service.translate(sample, opts)
    if count_unks_in_source(response) > 0:
        print("-- Sample ", idx, "----")
        print('[src] > ', sample)
        print('[tgt] < ', replace_unk_from_source(response))
        print()

-- Sample  1 ----
[src] >  Pleading Face is the third most popular emoji used on Twitter, and the most commonly found emoji in tweets that include hearts. Used in sequence with pointing hands to indicate a bashful or shy pose (🥺👉👈) particularly on TikTok.
[tgt] <  Pleading Gesicht ist die drittbeliebteste Emoji auf Twitter verwendet, und die am weiteesten gefunden E-E-Emoji in Tweets, die Herzen einschließen. Verwendet in der Sequenz mit punktierenden Händchen, um eine verschämte oder scheutliche Pose (🥺👉👈 besonders auf TikTok anzudeuten. .

-- Sample  2 ----
[src] >  നിങ്ങളുടെ ഈ വിഡിയോ കണ്ടതിനു ശേഷം Michael jackson ന്റെ ഗാനമാണ് ഓർമ വന്നത് just because you read it in a magazine Or see it on the TV screen Don't make it factual, actual
[tgt] <  Nur weil Sie es auf dem TV-Fernsehlein auf den Fernsehbildschirm lesen. Machen Sie es nicht s sachlich, tatsächlich, aktuell, tatsächlich

-- Sample  3 ----
[src] >  🥺 Face with Pleading Eyes
[tgt] <  🥺 mit plädierenden Sehnsüchten

-- Sample  4 -

In [None]:
samples = sorted(mtnt_monolingual().splitlines(), key=len)

with open("mtnt_monolingual.log", "w+") as fp:
    for idx, sample in enumerate(samples, 1):
        response = service.translate(sample, opts)
        if count_unks_in_source(response) > 0:
            print("-- Sample ", idx, "----", file=fp)
            print('[src] > ', sample, file=fp)
            print('[tgt] < ', replace_unk_from_source(response), file=fp)
            print(file=fp)

In [19]:
samples = ['Yeah they’re gonna itch over that $25 :D']
response = service.translate(samples[0], opts)
print(response.target.text)

Yeah they're gonna ititch over that $25 :D .


In [None]:
def build_tree(referenceHTML:str, node: DOMElement):
    root = TagTree(toByteRange(node))
    for child in node.children():
        subtree = build_tree(referenceHTML, child)
        root.add_subtree(subtree)
    return root