In [1]:
import numpy as np
import pandas as pd
import re
import glob

In [2]:
# List every .txt file in the "Second pass model outputs" folder (displayed as the cell output).
glob.glob('./Second pass model outputs/*.txt')

['./Second pass model outputs\\atlantic_crossword_test.txt',
 './Second pass model outputs\\ByT5 Console Output.txt',
 './Second pass model outputs\\byt5 inference clean data.txt',
 './Second pass model outputs\\byt5 inference data.txt',
 './Second pass model outputs\\byt5 multi iteration data.txt',
 './Second pass model outputs\\byt5_multi iteration data new dataset.txt',
 './Second pass model outputs\\newsday_crossword_test.txt',
 './Second pass model outputs\\nyt_crossword_test.txt',
 './Second pass model outputs\\T5 Console Output.txt',
 './Second pass model outputs\\t5 full dateaset word segment.txt',
 './Second pass model outputs\\t5 inference data.txt',
 './Second pass model outputs\\t5 multi iteration 70 percentage data trained on.txt',
 './Second pass model outputs\\t5 multi iteration data segmented answer model.txt',
 './Second pass model outputs\\t5 multi iteration data segmented answer model_2.txt',
 './Second pass model outputs\\t5 multi iteration data word segmented.txt',

In [29]:
def extract_float(input_string):
    """Return every ``digits.digits`` token in ``input_string`` as a float.

    Only unsigned decimal literals with a fractional part are recognised:
    bare integers are skipped and a leading sign is not captured.

    Args:
        input_string: Text to scan.

    Returns:
        list[float]: The parsed numbers in order of appearance; empty when the
        text contains no decimal literal.
    """
    return [
        float(found.group())
        for found in re.finditer(r"\d+\.\d+", input_string)
    ]

def extract_data(lines):
    """Parse log lines into a per-date dictionary of accuracy and timing metrics.

    A line that *starts* with a ``dd/dd/dddd`` date opens a new record keyed by
    the stripped line; a repeated header replaces the earlier record. The
    lines that follow are attributed to the most recent date header:

    * a line containing ``'Before'`` supplies ``'Before Letter Accuracy'`` and
      ``'Before Word Accuracy'`` (its two decimal numbers, in that order);
    * a line containing ``'iteration:'`` appends its two decimal numbers to the
      ``'Letter II'`` and ``'Word II'`` lists;
    * a line containing ``'Total time taken for t5-small'`` supplies
      ``'Second Pass Time Taken'`` (only this exact model label is recognised);
    * a line containing ``'Total times the second pass model is called:'``
      supplies ``'Second Pass Model Call Count'`` (stored as a float).

    Metric lines that appear before the first date header are ignored, because
    there is no record to attach them to.

    Args:
        lines: Iterable of log lines (e.g. the result of ``readlines()``).

    Returns:
        dict[str, dict]: One entry per date header. Every entry has the
        ``'Letter II'`` and ``'Word II'`` lists; the other keys are present only
        when the corresponding line was seen.

    Raises:
        ValueError: If a ``'Before'`` or ``'iteration:'`` line does not contain
            exactly two decimal numbers, or a time/count line cannot be parsed
            as a float.
    """
    date_pattern = r"\b\d{2}/\d{2}/\d{4}\b"
    data_dict = {}
    current_record = None  # record of the most recent date header, if any

    for line in lines:
        if re.match(date_pattern, line):  # date must be at the start of the line
            current_record = {'Letter II': [], 'Word II': []}
            data_dict[line.strip()] = current_record

        if current_record is None:
            # Nothing to attach metrics to until the first date header is seen.
            continue

        if 'Before' in line:
            lett_accu, word_accu = extract_float(line)
            current_record['Before Letter Accuracy'] = lett_accu
            current_record['Before Word Accuracy'] = word_accu

        if 'Total time taken for t5-small' in line:
            s_pass_time_taken = float(line.replace('Total time taken for t5-small: ', '').strip('\n').replace(' seconds', ''))
            current_record['Second Pass Time Taken'] = s_pass_time_taken

        if 'Total times the second pass model is called:' in line:
            s_pass_count = float(line.replace("Total times the second pass model is called: ", '').strip('\n'))
            current_record['Second Pass Model Call Count'] = s_pass_count

        if 'iteration:' in line:
            lett_accu, word_accu = extract_float(line)
            current_record['Letter II'].append(lett_accu)
            current_record['Word II'].append(word_accu)

    return data_dict

In [32]:
def get_df_data(data_dict, threshold = 85.0):
    """Build a per-date accuracy/timing table from parsed log records.

    For each record the first-pass accuracies are the rounded ``'Before ...'``
    values. The second-pass accuracies come from the iteration with the highest
    letter accuracy (first such iteration on ties); if that letter accuracy is
    lower than the first-pass one, or there are no iterations at all, the
    first-pass values are reported for the second pass as well.

    Every row carries all seven columns. Timing and call-count cells are NaN
    for records without iterations and for records whose timing keys are
    missing, so the frame can be built even when no record has timing data.

    Args:
        data_dict: Mapping of date -> record with the keys
            ``'Before Letter Accuracy'``, ``'Before Word Accuracy'``,
            ``'Letter II'``, ``'Word II'`` and optionally
            ``'Second Pass Time Taken'`` / ``'Second Pass Model Call Count'``.
        threshold: Currently unused; kept so existing callers keep working.

    Returns:
        pandas.DataFrame: One row per record that has both ``'Before ...'``
        values; records lacking them are skipped.
    """
    columns = [
        'Date',
        'First Pass Letter Accuracy',
        'First Pass Word Accuracy',
        'Second Pass Letter Accuracy',
        'Second Pass Word Accuracy',
        'S_Pass Time Taken',
        'S_Pass Call Count',
    ]
    missing = float('nan')
    output_data = []

    for date, inf_data in data_dict.items():
        # Without first-pass accuracies there is nothing to report for this date.
        if 'Before Letter Accuracy' not in inf_data or 'Before Word Accuracy' not in inf_data:
            continue

        f_pass_l_accu = round(inf_data['Before Letter Accuracy'], 2)
        f_pass_w_accu = round(inf_data['Before Word Accuracy'], 2)

        s_pass_l_list = inf_data.get('Letter II', [])
        s_pass_w_list = inf_data.get('Word II', [])

        if len(s_pass_l_list) == 0:
            # Only first-pass output exists: mirror it and leave timing empty.
            output_data.append((date, f_pass_l_accu, f_pass_w_accu, f_pass_l_accu, f_pass_w_accu, missing, missing))
            continue

        # Pick the iteration with the best letter accuracy.
        max_accu_index = s_pass_l_list.index(max(s_pass_l_list))
        s_pass_l_accu = round(s_pass_l_list[max_accu_index], 2)
        s_pass_w_accu = round(s_pass_w_list[max_accu_index], 2)

        # Never report a second pass that is worse than the first pass.
        if s_pass_l_accu < f_pass_l_accu:
            s_pass_l_accu = f_pass_l_accu
            s_pass_w_accu = f_pass_w_accu

        output_data.append((
            date,
            f_pass_l_accu,
            f_pass_w_accu,
            s_pass_l_accu,
            s_pass_w_accu,
            inf_data.get('Second Pass Time Taken', missing),
            inf_data.get('Second Pass Model Call Count', missing),
        ))

    df = pd.DataFrame(output_data, columns = columns)
    return df

#### T5_small and ByT5_small test

In [40]:
# Parse the log at byt5_data_path and write the resulting table to CSV.
# Forward slashes keep the path valid on every OS (the output path below already uses them).
byt5_data_path = "./Second pass model outputs/byt5_multi iteration data new dataset.txt"
with open(byt5_data_path, 'r') as log_file:  # close the handle as soon as the lines are read
    lines = log_file.readlines()
data_dict = extract_data(lines)
df_ = get_df_data(data_dict)
df_.to_csv("./Second pass model outputs/byt5_small hard grids - with time data.csv")

In [41]:
# Parse the log at t5_data_path and write the resulting table to CSV.
# Forward slashes keep the path valid on every OS (the output path below already uses them).
t5_data_path = "./Second pass model outputs/t5_multi iteration data on new dataset.txt"
with open(t5_data_path, 'r') as log_file:  # close the handle as soon as the lines are read
    lines = log_file.readlines()
data_dict = extract_data(lines)
df_ = get_df_data(data_dict)
df_.to_csv("./Second pass model outputs/t5_small hard grids - with time data.csv")

In [43]:
# Parse the log at hard_path and display summary statistics of the resulting table.
hard_path = "./Second pass model outputs/t5_small new dataset trained - tested on hard solutions.txt"
with open(hard_path, 'r') as log_file:  # close the handle as soon as the lines are read
    lines = log_file.readlines()
data_dict = extract_data(lines)
df_ = get_df_data(data_dict)
# CSV export is disabled; uncomment the next line to write the table to disk.
# df_.to_csv("./Second pass model outputs/hard solution with new t5_small.csv")
df_.describe()

In [46]:
# Parse the log at path and display summary statistics of the resulting table.
path = "./Second pass model outputs/t5 multi iteration wod segmented 2 epochs.txt"
with open(path, 'r') as log_file:  # close the handle as soon as the lines are read
    lines = log_file.readlines()
data_dict = extract_data(lines)
df_ = get_df_data(data_dict)
df_.describe()

Unnamed: 0,First Pass Letter Accuracy,First Pass Word Accuracy,Second Pass Letter Accuracy,Second Pass Word Accuracy
count,24.0,24.0,24.0,24.0
mean,97.294167,90.094583,99.15625,96.72125
std,1.854412,5.981114,1.303533,4.602359
min,92.39,77.14,95.63,84.62
25%,96.15,85.4625,98.9125,95.41
50%,97.84,90.675,99.745,98.61
75%,98.3675,92.67,100.0,100.0
max,100.0,100.0,100.0,100.0
