In [None]:
## process files
import glob
import pprint
target_dir = '3k/mgv9-sk'
files = glob.glob(f"results/{target_dir}/*")
verbose = False
if verbose:
    pprint.pprint(files)

## filter files
sample_n = 3000
import re
data_files = [ file for file in files if re.match(f".*result-.*{sample_n}.*.txt", file) ]
pprint.pprint(data_files)

In [316]:
## extract values
import io, re
method_names = [ 'DT', 'RF', 'NN', 'MM' ]
attr_names   = [ 'gender', 'plurality', 'case']
trueness     = [ 'True]', 'False]' ]
lang_names   = [ 'Czech', 'French', 'German', 'Irish' ]
term_names   = [ f"{x}{y}gram{z}" for x in [ '', 'skippy', 'xskippy' ]
                                    for y in range(2,5)
                                    for z in ['-no-hash', '-hash-at-both'] ]
print(f"term_names: {term_names}")
registered_vals = method_names + lang_names + attr_names + trueness + term_names
print(f"registered_vales: {registered_vals}")

['results/3k/mgv9-sk/result-French-sample3000-mgv9-2024-12-27-22-40.txt',
 'results/3k/mgv9-sk/result-Czech-sample3000-mgv9-2024-12-27-21-29.txt',
 'results/3k/mgv9-sk/result-Irish-sample3000-mgv9-2024-12-28-8-50.txt',
 'results/3k/mgv9-sk/result-German-sample3000-mgv9-2024-12-28-2-17.txt']
term_names: ['2gram-no-hash', '2gram-hash-at-both', '3gram-no-hash', '3gram-hash-at-both', '4gram-no-hash', '4gram-hash-at-both', 'skippy2gram-no-hash', 'skippy2gram-hash-at-both', 'skippy3gram-no-hash', 'skippy3gram-hash-at-both', 'skippy4gram-no-hash', 'skippy4gram-hash-at-both', 'xskippy2gram-no-hash', 'xskippy2gram-hash-at-both', 'xskippy3gram-no-hash', 'xskippy3gram-hash-at-both', 'xskippy4gram-no-hash', 'xskippy4gram-hash-at-both']
registered_vales: ['DT', 'RF', 'NN', 'MM', 'Czech', 'French', 'German', 'Irish', 'gender', 'plurality', 'case', 'True]', 'False]', '2gram-no-hash', '2gram-hash-at-both', '3gram-no-hash', '3gram-hash-at-both', '4gram-no-hash', '4gram-hash-at-both', 'skippy2gram-no-ha

In [314]:
def extract_data (lines, check: bool = False):
    import re
    lines = [ re.split(r"\s\s+", line.strip()) for line in lines if len(line) > 1 ]
    d = []
    for i, line in enumerate(lines):
        if line[0] in [ 'accuracy' ]:
            if check:
                print(f"target_value on: {line}")
            target_accuracy = line[1]
            try:
                candidate_line = lines[i-5]
                if check:
                    print(f"candidate_line1: {candidate_line}")
                if "classification" in candidate_line[0]:
                    method_line = candidate_line
                    if check:
                        print(f"method_line: {method_line}")
                    d.append((method_line, target_accuracy))
                else:
                    candidate_line = lines[i-4]
                    if check:
                        print(f"candidate_line2: {candidate_line}")
                    if "classification" in candidate_line[0]:
                        method_line = candidate_line
                        if check:
                            print(f"method_line: {method_line}")
                        d.append((method_line, target_accuracy))
                    else:
                        candidate_line = lines[i-3]
                        if check:
                            print(f"candidate_line3: {candidate_line}")
                        if "classification" in candidate_line[0]:
                            method_line = candidate_line
                            if check:
                                print(f"method_line: {method_line}")
                            d.append((method_line, target_accuracy))
                if check:
                    print(f"method_line: {method_line}")
            except ValueError:
                pass
    if check:
        print(f"d: {d}")
    return d

In [315]:
def extract_values (records, registered: list, check: bool = False):
    import re
    R = []
    for record in records:
        if check:
            print(f"record: {record}")
        method, accuracy = record
        F = [ field for field in re.split(r"\s+", method[0]) if field in registered ]
        F = [ 'NN' if x == 'MM' else x for x in F ]
        F = [ 'True' if x == 'True]' else x for x in F ]
        F = [ 'False' if x == 'False]' else x for x in F ]
        F.append(accuracy)
        R.append(F)
    return R

In [320]:
## extract values and generate output files
import pandas as pd
check = True
dfs = []
for file in data_files:
    fn_fields = re.split(r"-", file)
    lang_setting, sample, term_setting, mgv, year, month, day, hour, minute = \
        fn_fields[1], fn_fields[2], fn_fields[3], fn_fields[4], fn_fields[5], fn_fields[6], fn_fields[7], fn_fields[8], fn_fields[9]
    if check:
        print(f"opening {file}")
    with io.open(file, encoding = 'utf-8_sig') as f:
        lines = [ line.strip() for line in f.readlines() if len(line) > 0 ]
        if not len(lines) > 1:
            continue
        records = extract_values (extract_data (lines), registered_vals)
    if check:
        print(records)
    ##
    records_rev = []
    for i, record in enumerate(records):
        print(f"record {i:03d}: {record} [type: {type(record)}]")
        if len(record) < 6:
            print(f"found invalid record")
            print(record)
            lang_present = any(map(lambda x: x in record, lang_names))
            print(f"lang_present: {lang_present}")
            if not lang_present:
                record.insert(3, last_lang_name)
            term_present = any(map(lambda x: x in record, term_names))
            print(f"term_present: {term_present}")
            if not term_present:
                record.insert(4, "missing")
            print(f"modified {i}: {record} [type: {type(record)}]")
            records_rev.append(record)
        else:
            records_rev.append(record)
            last_lang_name = [ x for x in record if x in lang_names ][0]
    ##
    df = pd.DataFrame(columns = [ 'lang', 'method', 'attribute', 'supplement', 'term_type', 'accuracy'])
    for i, record in enumerate(records_rev):
        print(f"record {i}: {record}")
        method, attr, supplement, lang, term, accuracy = record
        df.loc[i] = [lang, attr, supplement, method, term, accuracy]
    dfs.append(((term_setting, sample, lang, mgv, month, day, hour, minute), df))

opening results/3k/mgv9-sk/result-French-sample3000-mgv9-2024-12-27-22-40.txt
[['DT', 'gender', 'True', 'French', '2gram-no-hash', '0.74'], ['RF', 'gender', 'True', '0.83'], ['NN', 'gender', 'True', 'French', '2gram-no-hash', '0.77'], ['DT', 'gender', 'False', 'French', '2gram-no-hash', '0.75'], ['RF', 'gender', 'False', '0.83'], ['NN', 'gender', 'False', 'French', '2gram-no-hash', '0.78'], ['DT', 'plurality', 'True', 'French', '2gram-no-hash', '0.90'], ['RF', 'plurality', 'True', '0.91'], ['NN', 'plurality', 'True', 'French', '2gram-no-hash', '0.92'], ['DT', 'plurality', 'False', 'French', '2gram-no-hash', '0.93'], ['RF', 'plurality', 'False', '0.93'], ['NN', 'plurality', 'False', 'French', '2gram-no-hash', '0.94'], ['DT', 'gender', 'True', 'French', '3gram-no-hash', '0.73'], ['RF', 'gender', 'True', '0.74'], ['NN', 'gender', 'True', 'French', '3gram-no-hash', '0.77'], ['DT', 'gender', 'False', 'French', '3gram-no-hash', '0.74'], ['RF', 'gender', 'False', '0.79'], ['NN', 'gender', 'Fa

In [321]:
dfs

[(('sample3000', 'French', 'French', 'mgv9', '12', '27', '22', '40.txt'),
         lang     method attribute supplement                 term_type accuracy
  0    French     gender      True         DT             2gram-no-hash     0.74
  1    French     gender      True         RF                   missing     0.83
  2    French     gender      True         NN             2gram-no-hash     0.77
  3    French     gender     False         DT             2gram-no-hash     0.75
  4    French     gender     False         RF                   missing     0.83
  ..      ...        ...       ...        ...                       ...      ...
  139  French  plurality      True         RF                   missing     0.93
  140  French  plurality      True         NN  skippy4gram-hash-at-both     0.94
  141  French  plurality     False         DT  skippy4gram-hash-at-both     0.90
  142  French  plurality     False         RF                   missing     0.93
  143  French  plurality     False 

In [322]:
## output
for d in dfs:
    key, df = d[0], d[1]
    #print(f"key: {key}")
    output_fname = f"results/{target_dir}/{'-'.join(key)}.csv"
    print(f"output_name: {output_fname}")
    df.to_csv(output_fname, index = False)

key: ('sample3000', 'French', 'French', 'mgv9', '12', '27', '22', '40.txt')
output_name: results/3k/mgv9-sk/sample3000-French-French-mgv9-12-27-22-40.txt.csv
key: ('sample3000', 'Czech', 'Czech', 'mgv9', '12', '27', '21', '29.txt')
output_name: results/3k/mgv9-sk/sample3000-Czech-Czech-mgv9-12-27-21-29.txt.csv
key: ('sample3000', 'Irish', 'Irish', 'mgv9', '12', '28', '8', '50.txt')
output_name: results/3k/mgv9-sk/sample3000-Irish-Irish-mgv9-12-28-8-50.txt.csv
key: ('sample3000', 'German', 'German', 'mgv9', '12', '28', '2', '17.txt')
output_name: results/3k/mgv9-sk/sample3000-German-German-mgv9-12-28-2-17.txt.csv
