# Get Fonts GT

In [4]:
import pandas as pd
import json
dfs = []
for phase in ['Phase1', 'Phase2']:
    df = pd.read_excel(f'../../data/fonts/{phase}_Scoresheet.xlsx', )
    df['a'] = df['Condition'].apply(lambda x: '1.00' if x == 'Normal' else '0.125')
    df['b'] = '1.00'
    df['Filter_no'] = df['Condition'].apply(lambda x: 1 if x == 'Normal' else 32)
    df['group_id'] = df['Condition'].apply(lambda x: 0 if x == 'Normal' else 2)
    # Create a group key
    df['combo_count'] = df.groupby(['Font', 'Condition']).cumcount() + 1
    df['group_id'] = df['group_id'] + 1 + ((df['combo_count']-1)  // 13).astype(int)
    df['filename'] = f'{phase}_Img/' + phase +'_' + df['Font'] + '_' + df['Condition'] + '_' + df['group_id'].astype(str) + '_' + df['PrintSize'].round(1).astype(str) + '.png'
    dfs.append(df.copy())
pd.concat(dfs).to_csv('../../data/summary/fonts_compare.csv', index=False)

In [5]:
import pandas as pd
import json

def dataframe_to_coco(df):
    """
    Convert a DataFrame to COCO captioning format.
    
    Assumptions:
      - The image path is constructed by concatenating the values from the "a" and "row" columns.
      - The caption is built by concatenating the text from columns L1 to L15 (ignoring NaNs).
    """
    coco = {
        "images": [],
        "annotations": [],
        "info": {
            "description": "MNRead Dataset",
            "version": "1.0"
        },
        "licenses": []
    }
    
    image_id = 0
    annotation_id = 0
    
    # Iterate through each row in the dataframe
    for _, row in df.iterrows():
        # Construct the image file name. Here we assume column 'a' is the image number
        # and 'row' is another identifier. Adjust formatting as needed.
        file_name = row.get('filename')
        caption = row.get('Sentence')
        
        # Add image entry (you may add width, height if known)
        coco["images"].append({
            "id": image_id if 'index' not in row else row['index'],
            "file_name": file_name,
            "Filter_no": row['Filter_no']
        })
        
        # Add annotation entry for the caption
        coco["annotations"].append({
            "id": annotation_id if 'index' not in row else row['index'],
            "image_id": image_id if 'index' not in row else row['index'],
            "caption": caption
        })
        
        image_id += 1
        annotation_id += 1
        
    return coco



In [7]:
coco_dict = dataframe_to_coco(pd.concat(dfs))
    
# Save the COCO JSON to a file
with open("../../data/fonts/anno.json", "w") as f:
    json.dump(coco_dict, f, indent=4)

# Get Chart Ground Truth

In [4]:
import numpy as np
from tqdm import tqdm
import glob
import re
import json
from typing import Dict, List, Optional, Tuple, Union
import os
import inspect
from copy import deepcopy

import pandas as pd

In [5]:
def get_gt(file_path:str, word_num:int, 
           rename = {'Image.no':'image_no','print size':'print_size'},
           L_R:bool = False, human=False) -> pd.DataFrame:
    # Specify the path to your Excel file
    
    # Read the Excel file into a pandas DataFrame
    xls = pd.ExcelFile(file_path)
    # 获取所有sheet的名称
    sheet_names = xls.sheet_names

    # 逐个读取sheet，并存入列表
    dfs = [pd.read_excel(xls, sheet_name=sheet) for sheet in sheet_names]

    # 合并所有的DataFrame
    gt = pd.concat(dfs, ignore_index=True)
    
    # df = pd.read_excel(file_path, sheet_name=None, header=0)
    
    cols = ['SubID','print_size','image_no','a','b'] + ['L{}'.format(num) for num in range(1,word_num+1)]
    # if human:
    #     cols = ['SubID'] + cols
    
    if L_R:
        cols += ['R{}'.format(num) for num in range(1,word_num+1)]

    # gt = pd.concat(df,axis=0,ignore_index=True)
    if rename is not None:
        gt.rename(columns={'Image.no':'image_no','print size':'print_size'}, inplace=True)
    gt['print_size'] = np.round(gt['print_size'],decimals=1)
    gt['row'] = gt.groupby(['image_no','SubID']).apply(lambda x: np.round((x['print_size'].max()-x['print_size']+0.1)*10)).reset_index().set_index('level_2')['print_size']
    
    cols = ['row'] + cols
    gt = gt.loc[:,cols].set_index(['print_size','image_no'])

    
    if human:
        return gt


    # 找到重复的索引
    duplicate_index = gt.index.duplicated()

    # 过滤出不重复的行
    df_no_duplicates = gt[~duplicate_index]
    return df_no_duplicates

In [6]:
import pandas as pd
import json

def dataframe_to_coco(df):
    """
    Convert a DataFrame to COCO captioning format.
    
    Assumptions:
      - The image path is constructed by concatenating the values from the "a" and "row" columns.
      - The caption is built by concatenating the text from columns L1 to L15 (ignoring NaNs).
    """
    coco = {
        "images": [],
        "annotations": [],
        "info": {
            "description": "MNRead Dataset",
            "version": "1.0"
        },
        "licenses": []
    }
    
    image_id = 0
    annotation_id = 0
    word_num = max([int(i[1:]) for i in df.columns if 'L' in i])

    # Iterate through each row in the dataframe
    for _, row in df.iterrows():
        # Construct the image file name. Here we assume column 'a' is the image number
        # and 'row' is another identifier. Adjust formatting as needed.
        file_name = f"{row['image_no']}_2260_{int(row['row'])}.jpg"
        
        # Build caption by concatenating L1 to L15 (ignoring NaN values)
        caption_parts = []
        for i in range(1, word_num+1):
            col_name = f"L{i}"
            token = row.get(col_name)
            if pd.notnull(token):
                caption_parts.append(str(token))
        caption = " ".join(caption_parts)
        
        # Add image entry (you may add width, height if known)
        coco["images"].append({
            "id": image_id if 'index' not in row else row['index'],
            "file_name": file_name,
            "Filter_no": row['Filter_no']
        })
        
        # Add annotation entry for the caption
        coco["annotations"].append({
            "id": annotation_id if 'index' not in row else row['index'],
            "image_id": image_id if 'index' not in row else row['index'],
            "caption": caption
        })
        
        image_id += 1
        annotation_id += 1
        
    return coco



In [8]:
filter_df = pd.read_csv('../../data/human/SelectedFilter.csv')

xlsx_file_path = '../../data/human/MNREADChartScoreSheet.xlsx'
WORD_NUM = 15
 
df_no_duplicates = get_gt(file_path=xlsx_file_path, word_num=WORD_NUM)
df_no_duplicates.drop(columns=['SubID'], inplace=True)
df_no_duplicates['a'] = df_no_duplicates['a'].round(3)
df_no_duplicates['b'] = df_no_duplicates['b'].round(3)
df_no_duplicates['a'].replace(0.156, 0.157, inplace=True)
df_no_duplicates['b'].replace(0.156, 0.157, inplace=True)
df_no_duplicates['a'].replace(0.287, 0.288, inplace=True)
df_no_duplicates['b'].replace(0.287, 0.288, inplace=True)
df_no_duplicates.reset_index(inplace=True)
df_no_duplicates = pd.merge(left=df_no_duplicates,right=filter_df[['a','b','Filter_no']],how='inner',on=['a','b'])

  gt['row'] = gt.groupby(['image_no','SubID']).apply(lambda x: np.round((x['print_size'].max()-x['print_size']+0.1)*10)).reset_index().set_index('level_2')['print_size']
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_no_duplicates['a'].replace(0.156, 0.157, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_n

In [78]:
coco_dict = dataframe_to_coco(df_no_duplicates)
    
# Save the COCO JSON to a file
with open("../../data/mnread/anno.json", "w") as f:
    json.dump(coco_dict, f, indent=4)
    

# TotalText Anno

In [1]:
import pandas as pd
import json

def dataframe_to_coco_tt(df):
    """
    Convert a DataFrame to COCO captioning format.
    
    Assumptions:
      - The image path is constructed by concatenating the values from the "a" and "row" columns.
      - The caption is built by concatenating the text from columns L1 to L15 (ignoring NaNs).
    """
    coco = {
        "images": [],
        "annotations": [],
        "info": {
            "description": "TotalText Dataset",
            "version": "1.0"
        },
        "licenses": []
    }
    
    image_id = 0
    annotation_id = 0

    # Iterate through each row in the dataframe
    for _, row in df.iterrows():
        # Construct the image file name. Here we assume column 'a' is the image number
        # and 'row' is another identifier. Adjust formatting as needed.
        file_name = f"{row['image_no']}"
        
        # Add image entry (you may add width, height if known)
        coco["images"].append({
            "id": image_id if 'index' not in row else row['index'],
            "file_name": file_name,
            "Filter_no": row['Filter_no']
        })
        
        # Add annotation entry for the caption
        coco["annotations"].append({
            "id": annotation_id if 'index' not in row else row['index'],
            "image_id": image_id if 'index' not in row else row['index'],
            "caption": row['caption'],
            "bbox": row['bbox'],
            "ornt": row['ornt'] if 'ornt' in row else []
        })
        
        image_id += 1
        annotation_id += 1
        
    return coco



In [2]:
import glob
import json
gts = glob.glob('../../data/viocr/totaltext/gt/*.json')

In [3]:
from builtins import enumerate, list, open
import re
import glob
import os
files = glob.glob("/cis/home/qgao14/my_documents/VIOCR_infer_models/data/viocr/totaltext/txt_format/Test/*.txt")
df = {'image_no':[],'Filter_no':[],'a':[],'b':[],'label':[],'points':[],'ornt':[]}
filters=[1,2,3,4,5,6,7,32,33,34,35,36,38,39,40,41]
HShiftList = [1.000, 0.288, 0.157, 0.086, 0.048, 0.027,
    0.250, 0.134, 0.072, 0.039, 0.022,
    0.267, 0.144, 0.078, 0.043, 0.024,
    0.314, 0.172, 0.096, 0.055, 0.032,
    0.345, 0.193, 0.110, 0.064, 0.038,
    0.439, 0.256, 0.154, 0.033, 0.018,
    0.125, 0.063, 0.031, 0.016, 1.000,
    1.000, 1.000, 1.000, 1.000, 1.000]

VShiftList = [1.000, 0.288, 0.157, 0.086, 0.048, 0.027,
    1.000, 0.534, 0.288, 0.157, 0.086,
    0.534, 0.288, 0.157, 0.086, 0.048,
    0.157, 0.086, 0.048, 0.027, 0.016,
    0.086, 0.048, 0.027, 0.016, 0.010,
    0.027, 0.016, 0.010, 0.534, 0.288,
    1.000, 1.000, 1.000, 1.000, 0.355,
    0.178, 0.089, 0.045, 0.022, 0.011]
for filter_no in filters:
    for id, file in enumerate(files):
        with open(file, "r", encoding="utf-8") as f:
            text = f.read()

        # 删除换行符，避免数组分行导致正则失败
        text = text.replace("\n", " ")

        # 匹配每一条完整的记录
        pattern = re.compile(
            r"x:\s*\[\[([-\d\s]+)\]\],\s*"
            r"y:\s*\[\[([-\d\s]+)\]\],\s*"
            r"ornt:\s*\[u'(.+?)'\],\s*"
            r"transcriptions:\s*\[u'(.+?)'\]"
        )

        data = []
        for x_str, y_str, ornt, transcription in pattern.findall(text):
            x = list(map(int, x_str.split()))
            y = list(map(int, y_str.split()))
            data.append({
                "x": x,
                "y": y,
                "ornt": ornt,
                "transcription": transcription
            })
        for item in data:
            try:
                points = list(zip(item['x'], item['y']))
                points = [[list(p) for p in points]]
            
                #df['image_no'].append(f"{id:07d}.jpg")
                df['image_no'].append(f"{os.path.basename(file).split('_')[-1].replace('.txt','')}.jpg")
                df['Filter_no'].append(filter_no)
                df['a'].append(HShiftList[filter_no-1])
                df['b'].append(VShiftList[filter_no-1])
                df['label'].append(item['transcription'])
                df['points'].append(points)
                df['ornt'].append(item['ornt'])
            except Exception as e:
                print(f"Error processing line in file {file}: {item}. Error: {e}")
                continue



    

In [None]:
# df = {'image_no':[],'Filter_no':[],'a':[],'b':[],'label':[],'points':[]}
# filters=[1,2,3,4,5,6,7,32,33,34,35,36,38,39,40,41]
# HShiftList = [1.000, 0.288, 0.157, 0.086, 0.048, 0.027,
#     0.250, 0.134, 0.072, 0.039, 0.022,
#     0.267, 0.144, 0.078, 0.043, 0.024,
#     0.314, 0.172, 0.096, 0.055, 0.032,
#     0.345, 0.193, 0.110, 0.064, 0.038,
#     0.439, 0.256, 0.154, 0.033, 0.018,
#     0.125, 0.063, 0.031, 0.016, 1.000,
#     1.000, 1.000, 1.000, 1.000, 1.000]

# VShiftList = [1.000, 0.288, 0.157, 0.086, 0.048, 0.027,
#     1.000, 0.534, 0.288, 0.157, 0.086,
#     0.534, 0.288, 0.157, 0.086, 0.048,
#     0.157, 0.086, 0.048, 0.027, 0.016,
#     0.086, 0.048, 0.027, 0.016, 0.010,
#     0.027, 0.016, 0.010, 0.534, 0.288,
#     1.000, 1.000, 1.000, 1.000, 0.355,
#     0.178, 0.089, 0.045, 0.022, 0.011]
# # with open('../../data/totaltext/gt/0000002.json','r') as f:
# #     gt = json.load(f)
# for filter_no in filters:
#     for gt_path in gts:
#         with open(gt_path,'r') as f:
#             gt = json.load(f)
#         for shape in gt['shapes']:   
#             df['image_no'].append(gt['imagePath'].split('/')[-1])
#             df['Filter_no'].append(filter_no)
#             df['a'].append(HShiftList[filter_no-1])
#             df['b'].append(VShiftList[filter_no-1])
#             df['label'].append(shape['label'])
#             df['points'].append(shape['points'])
            

In [5]:
import pandas as pd
df = pd.DataFrame(df)
df = df.groupby(['image_no','Filter_no','a','b']).apply(lambda x: [' '.join(x['label']), [i for i in x['points']], [' '.join(x['ornt'])]]).apply(pd.Series).reset_index()
df.rename(columns={0:'caption',1:'bbox',2:'ornt'},inplace=True)
filter_df = pd.read_csv('../../data/human/SelectedFilter.csv')
df = pd.merge(left=df,right=filter_df,how='inner',on=['a','b'])
df.rename(columns={'Filter_no_x':'Filter_no_old','Filter_no_y':'Filter_no'},inplace=True)
df['image_no'] = df.apply(lambda x: f"{x['Filter_no']}/{x['image_no']}",axis=1)

  df = df.groupby(['image_no','Filter_no','a','b']).apply(lambda x: [' '.join(x['label']), [i for i in x['points']], [' '.join(x['ornt'])]]).apply(pd.Series).reset_index()


In [7]:

coco_dict = dataframe_to_coco_tt(df)
    
# Save the COCO JSON to a file
# with open("../../data/totaltext/anno.json", "w") as f:
with open("../../data/totaltext_all/anno.json", "w") as f:
    json.dump(coco_dict, f, indent=4)

In [1]:
{1: 16, 2: 10, 3: 8, 4: 6, 5: 4, 6: 2, 7: 9, 
                        32: 7, 33: 5, 34: 3, 35: 1, 36: 15, 38: 14,
                        39: 13, 40: 12, 41: 11}.values()

dict_values([16, 10, 8, 6, 4, 2, 9, 7, 5, 3, 1, 15, 14, 13, 12, 11])

In [None]:
# xlsx_file_path = '../../data/human/LetterChartScoreSheet.xlsx'
# WORD_NUM = 5
 
# df_no_duplicates = get_gt(file_path=xlsx_file_path, word_num=WORD_NUM)
# df_no_duplicates.drop(columns=['SubID'], inplace=True)
# df_no_duplicates['a'] = df_no_duplicates['a'].round(3)
# df_no_duplicates['b'] = df_no_duplicates['b'].round(3)
# df_no_duplicates['a'].replace(0.156, 0.157, inplace=True)
# df_no_duplicates['b'].replace(0.156, 0.157, inplace=True)
# df_no_duplicates['a'].replace(0.287, 0.288, inplace=True)
# df_no_duplicates['b'].replace(0.287, 0.288, inplace=True)
# df_no_duplicates.reset_index(inplace=True)
# df_no_duplicates = pd.merge(left=df_no_duplicates,right=filter_df[['a','b','Filter_no']],how='inner',on=['a','b'])
# coco_dict = dataframe_to_coco(df_no_duplicates)
    
# # Save the COCO JSON to a file
# with open("../../data/etdrs/anno.json", "w") as f:
#     json.dump(coco_dict, f, indent=4)

# Update ETDRS GT
- the forth line of every chart is missing

In [7]:
def replace_rows(group):
    """
    For each group (chart), if the maximum 'row' equals 21,
    then starting from the row where 'row' is >= 4 and for each row (except the last),
    replace its L1-L5 values with the values from the next row.
    """
    # Sort by the 'row' column (if not already sorted)
    group = group.sort_values(by='row').reset_index(drop=True)
    # Check if last row (i.e. max row number) is 21.
    if group['row'].max() == 21:
        # Iterate over rows from the beginning to the second last row.
        for i in range(len(group) - 1):
            # Check if the current row's 'row' value is >= 4.
            if group.loc[i, 'row'] >= 4:
                # Replace L1 to L5 with the values from the next row.
                group.loc[i, ['L1', 'L2', 'L3', 'L4', 'L5']] = group.loc[i+1, ['L1', 'L2', 'L3', 'L4', 'L5']]
    if group.loc[0, ['L1', 'L2', 'L3', 'L4', 'L5']].isna().any():
        for i in range(len(group) - 2):
            # If any of L1-L5 is NaN in the current row
            if group.loc[i, ['L1', 'L2', 'L3', 'L4', 'L5']].isna().any():
                # Replace this row's L1-L5 with the next row's values
                group.loc[i, ['L1', 'L2', 'L3', 'L4', 'L5']] = group.loc[i+2, ['L1', 'L2', 'L3', 'L4', 'L5']]
                group.loc[i+2, ['L1', 'L2', 'L3', 'L4', 'L5']] = np.nan
    return group

In [8]:
filter_df = pd.read_csv('../../data/human/SelectedFilter.csv')
xlsx_file_path = '../../data/human/LetterChartScoreSheet.xlsx'
WORD_NUM = 5
 
df_no_duplicates = get_gt(file_path=xlsx_file_path, word_num=WORD_NUM)
df_no_duplicates.drop(columns=['SubID'], inplace=True)
df_no_duplicates['a'] = df_no_duplicates['a'].round(3)
df_no_duplicates['b'] = df_no_duplicates['b'].round(3)
df_no_duplicates['a'].replace(0.156, 0.157, inplace=True)
df_no_duplicates['b'].replace(0.156, 0.157, inplace=True)
df_no_duplicates['a'].replace(0.287, 0.288, inplace=True)
df_no_duplicates['b'].replace(0.287, 0.288, inplace=True)
df_no_duplicates.reset_index(inplace=True)
df_no_duplicates = pd.merge(left=df_no_duplicates,right=filter_df[['a','b','Filter_no']],how='inner',on=['a','b'])
# Group by 'image_no' (each chart) and apply the replacement function.
df_no_duplicates = df_no_duplicates.reset_index()
df_no_duplicates = df_no_duplicates.groupby('image_no').apply(replace_rows).reset_index(drop=True)
df_no_duplicates = df_no_duplicates[df_no_duplicates['row'] != 21]
df_no_duplicates = df_no_duplicates.set_index('index').sort_index()
# coco_dict = dataframe_to_coco(df_no_duplicates.dropna().reset_index())
    
# Save the COCO JSON to a file
# with open("../../data/etdrs/anno.json", "w") as f:
#     json.dump(coco_dict, f, indent=4)


  gt['row'] = gt.groupby(['image_no','SubID']).apply(lambda x: np.round((x['print_size'].max()-x['print_size']+0.1)*10)).reset_index().set_index('level_2')['print_size']
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_no_duplicates['a'].replace(0.156, 0.157, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_n

# SeeingAI Results

In [1]:
import pandas as pd

excel_file_path = '../../data/summary/SeeingAI.xlsx'
xls = pd.ExcelFile(excel_file_path)
sheet_names = xls.sheet_names
dfs = [pd.read_excel(xls, sheet_name=sheet) for sheet in sheet_names]


In [2]:
from collections import Counter
def count_matches_per_row(row):
    left_side = pd.Series(row[['L1', 'L2', 'L3', 'L4', 'L5']]).str.lower()
    right_side = pd.Series(row[['R1', 'R2', 'R3', 'R4', 'R5']]).astype(str).str.lower()
    
    counter_left = Counter(left_side.dropna()) # dic
    counter_right = Counter(right_side.dropna()) if right_side.dropna().any() else Counter('')
    
    # 计算两侧相同元素的个数，确保每个元素只计算一次
    matches = sum((counter_left & counter_right).values())
    total = sum(counter_left.values())
    
    # char_level
    counter_left_char = Counter(''.join(left_side.dropna().tolist()))
    counter_right_char = Counter(''.join(right_side.dropna().tolist())) if right_side.dropna().any() else Counter('')
    matches_char = sum((counter_left_char & counter_right_char).values())
    total_char = sum(counter_left_char.values())
    return [matches, total - matches, total_char-matches_char]


In [3]:
filter_df = pd.read_csv('../../data/human/SelectedFilter.csv')

In [9]:
etdre_df = dfs[0]
etdre_df['image_no'] = etdre_df['image_no'].fillna(method='ffill')
etdre_df.drop(columns=['Unnamed: 7'], inplace=True)
etdre_df = pd.merge(etdre_df,df_no_duplicates.dropna().reset_index().drop(columns=['L1','L2','L3','L4', 'L5']),how='left',on=['image_no','print_size'])
etdre_df[['match','missing','missing_char']] = etdre_df.apply(lambda row: count_matches_per_row(row), axis=1).apply(pd.Series)
etdre_df['missing_clipped'] = etdre_df['missing'].clip(upper=5)
etdre_df = pd.merge(etdre_df, filter_df, how='left', on=['a','b','Filter_no'])
etdre_df.rename(columns={'image_no':'chart_no'}, inplace=True)
group_sum = etdre_df.groupby(['chart_no','a', 'b', 'VA', 'CS', 'Cond']).apply(lambda x: -0.2 + x['missing_clipped'].sum()*0.02).reset_index()
group_sum.columns = ['chart_no', 'a', 'b', 'VA', 'CS', 'Cond','acuity']
group_sum = group_sum.groupby(['a', 'b', 'VA', 'CS', 'Cond']).apply(lambda x: x['acuity'].mean()).reset_index()
group_sum.columns = ['a', 'b', 'VA', 'CS', 'Cond','SeeingAI']

  etdre_df['image_no'] = etdre_df['image_no'].fillna(method='ffill')
  group_sum = etdre_df.groupby(['chart_no','a', 'b', 'VA', 'CS', 'Cond']).apply(lambda x: -0.2 + x['missing_clipped'].sum()*0.02).reset_index()
  group_sum = group_sum.groupby(['a', 'b', 'VA', 'CS', 'Cond']).apply(lambda x: x['acuity'].mean()).reset_index()


In [58]:
with open('/cis/home/qgao14/my_documents/VIOCR_infer_models/data/etdrs/anno.json','r') as f:
    anno = json.load(f)
anno = {anno['images'][i]['file_name']: anno['images'][i]['id'] for i in range(len(anno['images']))}
etdre_df['image_no'] = etdre_df['chart_no'] + '_2260_' + (etdre_df.groupby('chart_no').cumcount() + 1).astype(str) + '.jpg'
etdre_df['image_no'] = etdre_df['image_no'].apply(lambda x: anno.get(x, None))

out = []
for id, row in etdre_df.dropna(subset=['image_no']).iterrows():
    if row['image_no'] is not None:
        out.append({
            "image_id": int(row['image_no']),
            "category_id": 1,
            "polys": [],
            "rec_texts": row[['R1','R2','R3','R4','R5']].dropna().astype(str).tolist(),
            "rec_score": 0,
            "det_score": 0,
            "filter": int(row['Filter_no'])
        })
with open('/cis/home/qgao14/my_documents/VIOCR_infer_models/filtered/etdrs/seeingai.json','w') as f:
    json.dump(out, f, indent=4)

In [11]:
final = pd.read_csv('../../data/summary/etdrs_combined_1021.csv')
pd.merge(final, group_sum, how='left', on=['a','b','VA','CS','Cond']).to_csv('../../data/summary/etdrs_combined_1021.csv', index=False)

In [45]:
def count_matches_per_row(row):
    left_side = pd.Series(row[['L{}'.format(i) for i in range(1,16)]]).astype(str).str.lower()
    right_side = pd.Series(row[['R{}'.format(i) for i in range(1,16)]]).astype(str).str.lower()
    
    counter_left = Counter(left_side.dropna()) # dic
    counter_right = Counter(right_side.dropna()) if right_side.dropna().any() else Counter('')
    
    # 计算两侧相同元素的个数，确保每个元素只计算一次
    matches = sum((counter_left & counter_right).values())
    total = sum(counter_left.values())
    
    # char_level
    counter_left_char = Counter(''.join(left_side.dropna().tolist()))
    counter_right_char = Counter(''.join(right_side.dropna().tolist())) if right_side.dropna().any() else Counter('')
    matches_char = sum((counter_left_char & counter_right_char).values())
    total_char = sum(counter_left_char.values())
    return [matches, total - matches, total_char-matches_char]

In [48]:
mnread_df = dfs[1].copy()
mnread_df.drop(columns=['a','b'], inplace=True)
mnread_df.rename(columns={'Image.no':'image_no','print size':'print_size'}, inplace=True)
mnread_df = pd.merge(mnread_df,df_no_duplicates[['print_size', 'image_no', 'row', 'a', 'b','Filter_no']],how='left',on=['image_no','print_size'])
mnread_df[['match','missing','missing_char']] = mnread_df.apply(lambda row: count_matches_per_row(row), axis=1).apply(pd.Series)
mnread_df['missing_clipped'] = mnread_df['missing'].clip(upper=10)
mnread_df = pd.merge(mnread_df, filter_df, how='left', on=['a','b','Filter_no'])
mnread_df.rename(columns={'image_no':'chart_no'}, inplace=True)
mnread_df['print_size'] -=0.8
mnread_df = mnread_df.loc[mnread_df['print_size'] >= -0.4, :]
group_sum = mnread_df.groupby(['chart_no','a', 'b', 'VA', 'CS', 'Cond']).apply(lambda x: -0.4 + x['missing_clipped'].sum()*0.01).reset_index()
group_sum.columns = ['chart_no', 'a', 'b', 'VA', 'CS', 'Cond','acuity']
group_sum = group_sum.groupby(['a', 'b', 'VA', 'CS', 'Cond']).apply(lambda x: x['acuity'].mean()).reset_index()
group_sum.columns = ['a', 'b', 'VA', 'CS', 'Cond','SeeingAI']

  group_sum = mnread_df.groupby(['chart_no','a', 'b', 'VA', 'CS', 'Cond']).apply(lambda x: -0.4 + x['missing_clipped'].sum()*0.01).reset_index()
  group_sum = group_sum.groupby(['a', 'b', 'VA', 'CS', 'Cond']).apply(lambda x: x['acuity'].mean()).reset_index()


In [11]:
final = pd.read_csv('../../data/summary/mnread_combined_1021.csv')
pd.merge(final, group_sum, how='left', on=['a','b','VA','CS','Cond']).to_csv('../../data/summary/mnread_combined_1021.csv', index=False)

In [57]:
with open('/cis/home/qgao14/my_documents/VIOCR_infer_models/data/mnread/anno.json','r') as f:
    anno = json.load(f)
anno = {anno['images'][i]['file_name']: anno['images'][i]['id'] for i in range(len(anno['images']))}
mnread_df['image_no'] = mnread_df['chart_no'] + '_1680_' + (mnread_df.groupby('chart_no').cumcount() + 1).astype(str) + '.png'
mnread_df['image_no'] = mnread_df['image_no'].apply(lambda x: anno.get(x, None))

out = []
for id, row in mnread_df.dropna(subset=['Filter_no']).iterrows():
    if row['image_no'] is not None:
        out.append({
            "image_id": int(row['image_no']),
            "category_id": 1,
            "polys": [],
            "rec_texts": row[['R1','R2','R3','R4','R5','R6','R7','R8','R9','R10','R11','R12','R13','R14','R15']].dropna().astype(str).tolist(),
            "rec_score": 0,
            "det_score": 0,
            "filter": int(row['Filter_no'])
        })
with open('/cis/home/qgao14/my_documents/VIOCR_infer_models/filtered/mnread/seeingai.json','w') as f:
    json.dump(out, f, indent=4)

# Human Acuity on differnt filters

In [1]:
from collections import Counter

def count_matches_per_row(row):
    left = ['L'+str(i+1) for i in range(WORD_NUM)]
    left_side = row[left].str.lower()
    right = ['R'+str(i+1) for i in range(WORD_NUM)]
    right_side = row[right].str.lower()
    
    counter_left = Counter(left_side.dropna()) # dic
    counter_right = Counter(right_side.dropna())
    
    # 计算两侧相同元素的个数，确保每个元素只计算一次
    matches = sum((counter_left & counter_right).values())
    total = sum(counter_left.values())
    
    # char_level
    counter_left_char = Counter(''.join(left_side.dropna().tolist()))
    counter_right_char = Counter(''.join(right_side.dropna().tolist()))
    matches_char = sum((counter_left_char & counter_right_char).values())
    total_char = sum(counter_left_char.values())
    return [matches, total - matches, total_char-matches_char]

In [5]:
xlsx_file_path = '../../data/human/MNREADChartScoreSheet.xlsx'
WORD_NUM = 15
df_human = get_gt(file_path=xlsx_file_path, word_num=WORD_NUM,human=True,L_R=True)
df_human[['match','missing','missing_char']] = df_human.apply(count_matches_per_row, axis=1, result_type='expand')
df_human.reset_index(inplace=True)
df_human['missing_clipped'] = df_human['missing'].clip(upper=10)
# df_human = df_human[df_human['print_size'] > -.3]
# df_human = df_human[df_human['SubID'] != 'N1']
df_human = df_human.groupby(['image_no','a','b','SubID']).apply(lambda x: -0.3 + x['missing_clipped'].sum()*0.01).reset_index()
df_human.rename(columns={0:'acuity'}, inplace=True)
df_human['a'] = df_human['a'].round(3)
df_human['b'] = df_human['b'].round(3)
df_human['a'].replace(0.156, 0.157, inplace=True)
df_human['b'].replace(0.156, 0.157, inplace=True)
df_human['a'].replace(0.287, 0.288, inplace=True)
df_human['b'].replace(0.287, 0.288, inplace=True)
df_human = df_human.groupby(['image_no','a','b']).apply(lambda x: x['acuity'].mean())
df_human = df_human.reset_index()
df_human.rename(columns={0:'acuity'}, inplace=True)
df_human = df_human.groupby(['a','b']).apply(lambda x: x['acuity'].mean())
df_human = df_human.reset_index().rename(columns={0:'acuity'})
df_human.to_csv('../../data/human/human_mnread_acuity.csv',index=False)

  gt['row'] = gt.groupby(['image_no','SubID']).apply(lambda x: np.round((x['print_size'].max()-x['print_size']+0.1)*10)).reset_index().set_index('level_2')['print_size']
  df_human = df_human.groupby(['image_no','a','b','SubID']).apply(lambda x: -0.3 + x['missing_clipped'].sum()*0.01).reset_index()
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_human['a'].replace(0.156, 0.157, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, in

In [6]:
xlsx_file_path = '../../data/human/LetterChartScoreSheet.xlsx'
WORD_NUM = 5
df_human = get_gt(file_path=xlsx_file_path, word_num=WORD_NUM,human=True,L_R=True)
df_human[['match','missing','missing_char']] = df_human.apply(count_matches_per_row, axis=1, result_type='expand')
df_human.reset_index(inplace=True)
# df_human = df_human[df_human['print_size'] > -.3]
# df_human = df_human[df_human['SubID'] != 'N1']
df_human['missing_clipped'] = df_human['missing'].clip(upper=WORD_NUM)
df_human = df_human.groupby(['image_no','a','b','SubID']).apply(lambda x: -0.3 + x['missing_clipped'].sum()*0.02).reset_index()
df_human.rename(columns={0:'acuity'}, inplace=True)
df_human['a'] = df_human['a'].round(3)
df_human['b'] = df_human['b'].round(3)
df_human['a'].replace(0.156, 0.157, inplace=True)
df_human['b'].replace(0.156, 0.157, inplace=True)
df_human['a'].replace(0.287, 0.288, inplace=True)
df_human['b'].replace(0.287, 0.288, inplace=True)
df_human = df_human.groupby(['image_no','a','b']).apply(lambda x: x['acuity'].mean())
df_human = df_human.reset_index()
df_human.rename(columns={0:'acuity'}, inplace=True)
df_human = df_human.groupby(['a','b']).apply(lambda x: x['acuity'].mean())
df_human = df_human.reset_index().rename(columns={0:'acuity'})
df_human.to_csv('../../data/human/human_etdrs_acuity.csv',index=False)

  gt['row'] = gt.groupby(['image_no','SubID']).apply(lambda x: np.round((x['print_size'].max()-x['print_size']+0.1)*10)).reset_index().set_index('level_2')['print_size']


NameError: name 'count_matches_per_row' is not defined

# Human Scene Text

In [31]:
import pandas as pd

excel_file_path = '../../data/human/subject_result_v5.xlsx'
xls = pd.ExcelFile(excel_file_path)
sheet_names = xls.sheet_names
dfs = {sheet:pd.read_excel(xls, sheet_name=sheet) for sheet in sheet_names if 'rec' in sheet}
for name, sheet in dfs.items():
    sheet['SubID'] = name.split('_')[0]

In [39]:
df = pd.concat(dfs.values(),axis=0)
df = df.groupby(['SubID','a','b']).apply(lambda x: x['hmean'].mean()).reset_index()
df.rename(columns={0:'hmean'}, inplace=True)
df = df.groupby(['a','b']).apply(lambda x: [x['hmean'].mean(),x['hmean'].std()]).apply(pd.Series).reset_index()
df.rename(columns={0:'human',1:'human_err'}, inplace=True)
df.to_csv('../../data/human/human_totaltext_acuity.csv',index=False)

  df = df.groupby(['SubID','a','b']).apply(lambda x: x['hmean'].mean()).reset_index()
  df = df.groupby(['a','b']).apply(lambda x: [x['hmean'].mean(),x['hmean'].std()]).apply(pd.Series).reset_index()
