# Piemanese TM Experiment: Heuristics
## Overview
This experiment searches for heuristics that detect and generate Piemanese word pairs. Such heuristics can support synthetic data generation, either by creating new Piemanese-English pairs or by augmenting existing ones.

In [1]:
import pandas as pd

def load_dataset(word_pairs_dir, files=('benchmark.tsv', 'replacements.tsv', 'manual_upweight.tsv')):
    """Load tab-separated word-pair rows from TSV files in a directory.

    Args:
        word_pairs_dir: Directory that contains the TSV files.
        files: File names to read, in order. Defaults to the three files
            this notebook uses.

    Returns:
        A list with one entry per non-blank line, in file order. Each entry
        is the list of tab-separated fields of that line after surrounding
        whitespace has been stripped from the line.

    Raises:
        OSError: If one of the files cannot be opened.
    """
    rows = []
    for file_name in files:
        with open(f'{word_pairs_dir}/{file_name}', 'r', encoding='utf-8') as f:
            for line in f:
                stripped = line.strip()
                # A blank line (e.g. a trailing empty line) would otherwise
                # become [''] and break two-field unpacking of the rows.
                if not stripped:
                    continue
                rows.append(stripped.split('\t'))
    return rows

# Read every word-pair row from the dataset directory (path is relative to
# the working directory).
word_pairs = load_dataset('../datasets/word_pairs/true')  # local path
# Bare expression: presumably here so the notebook displays the loaded rows.
word_pairs

[['neda', 'need to'],
 ['deliva', 'deliver'],
 ['foods', 'food'],
 ['at', 'at'],
 ['yezzirr', 'yessir'],
 ['u', 'you'],
 ['fet', 'fat'],
 ['berx2', 'bye'],
 ['stut', 'stuart'],
 ['dum', 'dumb'],
 ['nu', 'no'],
 ['u', 'you'],
 ['o', 'oh'],
 ['gawd', 'god'],
 ['nufin', 'nothing'],
 ['to', 'to'],
 ['translate', 'translate'],
 ['mai', 'my'],
 ['englando', 'english'],
 ['es', 'is'],
 ['too', 'too'],
 ['gud', 'good'],
 ['i', 'i'],
 ['ges', 'guess'],
 ['i', 'i'],
 ['cn', 'can'],
 ['liftu', 'lift'],
 ['a', 'a'],
 ['beet', 'bit'],
 ['o', 'oh'],
 ['oop', 'oops'],
 ['wan', 'when'],
 ['omaiu', 'oh my i'],
 ['wuz', 'was'],
 ['stuc', 'stuck'],
 ['pleyin', 'playing'],
 ['roblix', 'roblox'],
 ['wif', 'with'],
 ['bebes', 'babies'],
 ['nythin', 'anything'],
 ['cn', 'can'],
 ['be', 'be'],
 ['giga', 'giga'],
 ['comp', 'comp'],
 ['if', 'if'],
 ['u', 'you'],
 ['hit', 'hit'],
 ['stahp', 'stop'],
 ['forcin', 'forcing'],
 ['nd', 'and'],
 ['pley', 'play'],
 ['wot', 'what'],
 ['gam', 'game'],
 ['givs', 'gives'],

## Common Edit Operations

In [2]:
from collections import defaultdict, Counter
from difflib import SequenceMatcher

# Tally the edit operations that turn each English word into its Piemanese
# counterpart. Counts are grouped by SequenceMatcher opcode tag and keyed by
# "<english substring>,<piemanese substring>".
edits = defaultdict(Counter)
for pi_word, en_word in word_pairs:
    for tag, i1, i2, j1, j2 in SequenceMatcher(None, en_word, pi_word).get_opcodes():
        edits[tag][f'{en_word[i1:i2]},{pi_word[j1:j2]}'] += 1

# Optional filter (disabled): keep only edits that were seen more than once.
# edits = {edit: {k: v for k, v in counts.items() if v > 1} for edit, counts in edits.items()}

# Report each operation type, most frequent edits first. 'equal' spans are
# still counted in `edits` but are not printed.
for op_name, ops in edits.items():
    if op_name != 'equal':
        ops_sorted = ops.most_common()
        print(op_name)
        print(ops_sorted)

delete
[('e,', 189), ('a,', 127), ('g,', 116), ('h,', 73), ('yo,', 68), ("',", 37), ('o,', 36), ('t,', 33), ('l,', 26), ('gh,', 24), ("t',", 23), (' ,', 23), ('r,', 19), ('w,', 17), ('h ,', 14), ('y,', 13), ('s,', 11), ('i,', 10), ('ar,', 9), ('th,', 9), ('d,', 9), ("'t,", 8), ('ne,', 8), ('now,', 8), ('c,', 8), ('own,', 7), ('ev,', 6), ('ea,', 6), ('wh,', 5), ('ight ,', 5), ('u,', 4), ('k,', 4), ('av,', 4), ("on't ,", 4), ("'t k,", 4), ('ack,', 4), ('eo,', 3), ('irth,', 3), ('er,', 3), ('ind,', 3), ('orry,', 3), ('ourse,', 3), ('re,', 3), ('ugh,', 3), ('b,', 2), ('shed,', 2), ('ner,', 2), ('ery,', 2), ('g to,', 2), ('eal,', 2), ('t the ,', 2), ('elcome ,', 2), ('ur,', 2), ('d ,', 2), ('hout,', 2), ('f,', 2), ('co,', 2), ('ple,', 2), ('hank ,', 2), ('ou,', 2), ('line,', 2), ('ule,', 2), ('ny,', 2), ('ad,', 2), ('ill ,', 2), ('elf,', 2), ('ably,', 2), ("t's ,", 1), ('ou ,', 1), ('ect,', 1), ('gue,', 1), ('te ,', 1), ('ssi,', 1), ('ecially,', 1), ("'m,", 1), ('ed,', 1), ('initely,', 1), 

In [3]:
import json

# Persist the edit counts as indented JSON. The encoding is pinned to UTF-8
# (matching how the TSV inputs are read) so the file does not depend on the
# platform's locale default.
with open('../scripts/pi_edits.json', 'w', encoding='utf-8') as f:
    json.dump(edits, f, indent=4)