In [1]:
import json
import pandas as pd

def load_json(path: str):
    """Parse the UTF-8 encoded JSON file at ``path`` and return the decoded object."""
    with open(path, encoding="utf-8") as source:
        parsed = json.load(source)
    return parsed

def load_csv(path: str):
    """Read the CSV file at ``path`` into a pandas DataFrame using pandas' default options."""
    frame = pd.read_csv(path)
    return frame

# Load both input files once, up front. The paths are relative, so they resolve
# against the kernel's current working directory.
# NOTE(review): DATA is not referenced by any of the cells visible here — confirm
# it is still needed before keeping this load.
DATA = load_json("data/merged_output_20250518_1257.json")
CSV  = load_csv("data/disambiguated_collocations.csv")

In [2]:
# Select the rows whose "total num of error occur" value is at least 10.
# The filter is evaluated once and reused instead of being rebuilt per print.
frequent_rows = CSV[CSV["total num of error occur"] >= 10]
print(frequent_rows)
print(len(frequent_rows))  # number of rows passing the threshold
print(len(CSV))            # total number of rows, for comparison

        error   pos correction       error phrase      correct phrase  \
0        make  VERB         do       make cooking          do cooking   
1        make  VERB         do        make lesson           do lesson   
2        make  VERB         do    make internship       do internship   
3        make  VERB         do      make american         do american   
4        make  VERB         do       make packing          do packing   
...       ...   ...        ...                ...                 ...   
37969   check  VERB  checklist        check usual     checklist usual   
37970   check  VERB       book       check ticket         book ticket   
37971   check  VERB         do     check exercise         do exercise   
37972   check  VERB     verify  check information  verify information   
37973  bother  VERB       make      bother effort         make effort   

       num of this error-correction phrase occur  \
0                                             10   
1                  

In [3]:
def get_error_options(CSV, min_total=10, allowed_pos=("NOUN", "VERB", "ADJ", "ADV")):
    """Build the list of selectable error words as ``"error, total, pos"`` strings.

    For every distinct value in the ``"error"`` column (in order of first
    appearance) the first matching row supplies the ``"total num of error occur"``
    and ``"pos"`` values. An entry is kept only when its ``pos`` is one of
    ``allowed_pos`` and its total is at least ``min_total``.

    Args:
        CSV: DataFrame with ``"error"``, ``"pos"`` and
            ``"total num of error occur"`` columns.
        min_total: Minimum total required for an error to be listed.
        allowed_pos: Part-of-speech tags that are eligible for listing.

    Returns:
        A list of strings formatted as ``"<error>, <total>, <pos>"``.
    """
    # One pass instead of re-filtering the whole frame for every unique error:
    # rows with a missing error never match an equality filter, so they are
    # dropped first; then only the first row per error is kept.
    first_rows = (
        CSV.dropna(subset=["error"])
        .drop_duplicates(subset="error", keep="first")
    )
    keep = (
        first_rows["pos"].isin(list(allowed_pos))
        & (first_rows["total num of error occur"] >= min_total)
    )
    selected = first_rows.loc[keep, ["error", "total num of error occur", "pos"]]
    return [
        f"{error}, {total}, {pos}"
        for error, total, pos in selected.itertuples(index=False, name=None)
    ]
# NOTE: despite its name, `df` is a plain list of "error, total, pos" strings
# at this point, not a DataFrame.
df = get_error_options(CSV)

In [4]:
# Sort the option strings by their occurrence count, highest first.
# Each string is "error, total, pos"; splitting from the right keeps the count
# in position 1 even if the error text itself contains ", ".
df.sort(key=lambda option: int(option.rsplit(", ", 2)[1]), reverse=True)
print("Error options with at least 10 occurrences:")
for option in df:
    print(option)

Error options with at least 10 occurrences:
make, 4016, VERB
get, 3029, VERB
have, 2936, VERB
very, 2791, ADV
take, 2016, VERB
person, 1888, NOUN
do, 1814, VERB
good, 1489, ADJ
know, 1380, VERB
come, 1330, VERB
much, 1235, ADV
go, 1193, VERB
wish, 1139, VERB
join, 1077, VERB
too, 915, ADV
travel, 891, NOUN
think, 867, VERB
say, 838, VERB
meet, 790, VERB
become, 786, VERB
spend, 768, VERB
well, 744, ADV
enter, 727, VERB
tell, 725, VERB
bring, 699, VERB
trouble, 657, NOUN
watch, 641, VERB
dairy, 635, NOUN
give, 633, VERB
use, 559, VERB
time, 551, NOUN
see, 545, VERB
people, 544, NOUN
important, 517, ADJ
many, 516, ADJ
big, 500, ADJ
live, 494, VERB
way, 488, NOUN
here, 479, ADV
learn, 473, VERB
work, 469, NOUN
find, 463, VERB
so, 454, ADV
talk, 446, VERB
lean, 426, VERB
mail, 416, NOUN
image, 412, VERB
want, 407, VERB
more, 402, ADV
improve, 401, VERB
let, 397, VERB
play, 375, VERB
look, 371, VERB
job, 356, NOUN
fall, 345, VERB
less, 335, ADV
cloth, 328, NOUN
little, 323, ADJ
thing, 312, 

In [5]:
# Persist the selected options as a CSV table.
# Each entry is an "error, total, pos" string; split from the right into exactly
# three fields so an error text containing ", " cannot produce extra columns.
option_rows = [option.rsplit(", ", 2) for option in df]
# NOTE: `df` is rebound from a list of strings to a DataFrame here, so this cell
# (and the sorting cell) cannot be re-run without first rebuilding the list.
df = pd.DataFrame(option_rows, columns=["error", "total num of error occur", "pos"])
df.to_csv("data/error_options.csv", index=False)