In [2]:
import sacrebleu
import pandas as pd
from collections import Counter

In [3]:
# Read the system predictions and the references in parallel (one sentence per
# line). The `with` block guarantees both handles are closed, and the explicit
# UTF-8 encoding keeps decoding of the accented text independent of the
# machine's locale.
# NOTE(review): zip() stops at the shorter file, so a length mismatch between
# the two files would be truncated silently.
with open("data/amnlp21.es-shp.shp.pred", "r", encoding="utf-8") as pred_file, \
        open("data/amnlp21.shp", "r", encoding="utf-8") as ref_file:
    sentences = list(zip(pred_file, ref_file))

df = pd.DataFrame(sentences, columns=["pred", "ref"])

# Normalise both sides identically: lowercase, then drop every trailing
# period / newline character (rstrip removes any run of the listed characters).
for column in ("pred", "ref"):
    df[column] = df[column].str.lower().str.rstrip(".\n")

# Sentence-level chrF: each row is scored as a one-sentence corpus against its
# single reference.
df["chrF"] = df.apply(
    lambda row: sacrebleu.corpus_chrf([row["pred"]], [[row["ref"]]]).score,
    axis=1,
)

# Ascending sort so the worst-scoring pairs come first; the original row index
# is kept so rows can be traced back to their line in the input files.
df = df.sort_values(by="chrF")

In [4]:
# df is sorted by ascending chrF, so these are the five lowest-scoring pairs.
df.head(n=5)

Unnamed: 0,pred,ref,chrF
45,oa báque oxnánra áni jáque,ja xontako ea akinti keni iitaira kikin ochoir...,6.334979
285,báquera jáque,ja bakeranonra neno itinke,7.378451
299,ischónra jóni mahuáque,westiora joniresra mawake accidentenko nato ja...,7.492647
833,jonínra jahuen ahuínhaxon yatánxon yatáncanque,westiora jonira liberankana ike nato cargamea ...,7.957585
741,yámiquentira yonáque,janra sugierenke westiora fregona biti,8.136302


In [5]:
# Summary statistics (count, mean, std, quartiles, extremes) of the chrF column.
chrf_scores = df["chrF"]
chrf_scores.describe()

count    1003.000000
mean       34.535699
std        16.251995
min         6.334979
25%        21.787803
50%        33.163253
75%        44.173054
max       100.000000
Name: chrF, dtype: float64

In [6]:
# Step 1: discard rows whose prediction already scores a perfect 100 chrF.
not_perfect = df["chrF"].lt(100)
df = df[not_perfect]

# Step 2: keep only the rows at or above the median chrF of what remains.
q2 = df["chrF"].quantile(q=0.5)
at_or_above_median = df["chrF"].ge(q2)
df = df[at_or_above_median]

In [7]:
# Shuffle the rows and renumber them from 0. A fixed seed makes the order
# reproducible across kernel restarts; the order matters because the sentence
# files written further down follow the row order of df.
SHUFFLE_SEED = 42
df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
df

Unnamed: 0,pred,ref,chrF
0,nato u2ninra ícha película biwanke,nato u2ninra kikin icha película bike,66.952458
1,jatora yoyo ikashamai tsoarin ja peinko ixon i...,jatora ikastimakana ike acuerdo iki tsoaborin ...,33.637025
2,"jaskara ikax, en shinannara ea ramona betan yo...","jaskara ikaxra, en shinanai ramona betan yoyo ...",68.390161
3,"jaskara oinxonra en oinai, enra oinai jawerano...","jaskara ikentian en bochiki oinai, ramona jain...",36.724132
4,"janra akai jakon tee akin, jakopira nato ofici...","westiora tee de amor keskra akanai, jakopira j...",49.005757
...,...,...,...
495,"jen jen, jonibora kikin pitioma iki","jen jen, jonibo jaweska ikax jawekiakashamaibo",36.175044
496,kikin jakon riki campónra mia masa shinantima ...,ja kikin jakon riki nato camponko ixon mia ja ...,44.947066
497,"cuba markanxbichora ike, eh, kaiser, eh, ja im...",ja baja bichora ike nato crisis de cubankonia ...,47.240933
498,"eh, jara ika iki st. louis, jefferson city ita...","eh, st. louis pikotainra jakatiai, jefferson c...",61.825328


In [9]:
# Vocabulary size of the reference side: count distinct whitespace-separated
# tokens over all remaining reference sentences.
sentences = df["ref"].to_list()
corpus = " ".join(sentences)
# split() with no argument already yields tokens free of surrounding whitespace.
words = corpus.split()
word_count = Counter(words)
unique_words = [*word_count]

report = "Número de palabras únicas (oraciones sin errores ortográficos): {}"
print(report.format(len(unique_words)))

Número de palabras únicas (oraciones sin errores ortográficos): 1735


In [8]:
# Vocabulary size of the prediction side: count distinct whitespace-separated
# tokens over all remaining predicted sentences.
sentences = df["pred"].to_list()
corpus = " ".join(sentences)
# split() with no argument already yields tokens free of surrounding whitespace.
words = corpus.split()
word_count = Counter(words)
unique_words = [*word_count]

report = "Número de palabras únicas (oraciones con errores ortográficos): {}"
print(report.format(len(unique_words)))

Número de palabras únicas (oraciones con errores ortográficos): 1267


In [7]:
# Export the filtered pairs as two line-aligned text files (one sentence per
# line, no trailing newline): predictions go to the ".errors" file, references
# to the plain one. UTF-8 is requested explicitly because the sentences contain
# non-ASCII characters and the default encoding depends on the platform.
with open("shi.translationese.test.sentences.errors.txt", "w", encoding="utf-8") as f:
    f.write(df["pred"].str.cat(sep='\n'))

with open("shi.translationese.test.sentences.txt", "w", encoding="utf-8") as f:
    f.write(df["ref"].str.cat(sep='\n'))

In [8]:
# Average sentence-level chrF over the rows currently held in df.
mean_chrf = df["chrF"].mean()
print("test chrF: {}".format(mean_chrf))

test chrF: 46.84141931899608
