In [37]:
import pandas as pd
import os
from imageLabeler import ImageLabeler

def run_labeller(data_dir='./data'):
    """Run ``ImageLabeler`` once for every image file found in ``data_dir``.

    Only files whose name ends in ``.png``, ``.jpg`` or ``.jpeg``
    (case-insensitive) are considered. Files are visited in sorted name
    order so that repeated runs walk the folder in the same sequence.

    Note from the original author: only run this when you are ready to label
    everything -- it is hard to debug and takes roughly 25-40 minutes
    depending on your speed.

    Args:
        data_dir: Folder to scan for images. Defaults to ``'./data'``.

    Returns:
        None. Each image path is handed to ``ImageLabeler``; whatever that
        class does with it is defined in the ``imageLabeler`` module.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')

    # os.listdir() yields names in arbitrary order, so sort them to make the
    # labelling sequence reproducible. Non-image files are filtered out.
    image_files = sorted(
        name for name in os.listdir(data_dir)
        if name.lower().endswith(image_extensions)
    )

    for image_file in image_files:
        image_path = os.path.join(data_dir, image_file)
        print(f"Labeling image: {image_file}")

        # Label the image.
        ImageLabeler(image_path)

# Manual switch: the labelling pass only starts when this flag is set to True.
run = False
if run:
    run_labeller()

In [38]:
def _parse_box(text):
    """Convert a ``'(x1, y1, x2, y2)'`` string into a 4-tuple of floats."""
    # float() ignores surrounding whitespace, so splitting on ',' alone copes
    # with both '1, 2' and '1,2' style separators.
    coords = tuple(float(part) for part in text.strip().strip('()').split(','))
    if len(coords) != 4:
        raise ValueError(f"expected 4 coordinates, got {len(coords)}: {text!r}")
    return coords


# The raw label file has no header row, so name the columns explicitly instead
# of letting pandas consume the first record as a header. Reading everything
# as text keeps the 'character' column uniformly typed.
df = pd.read_csv(
    'labeled_data.csv',
    header=None,
    names=['file_name', 'character', 'position'],
    dtype=str,
)

# Strip the folder component; regex=False makes this a literal replacement on
# every pandas version.
df['file_name'] = df['file_name'].str.replace('data/', '', regex=False)

# 1-based index of each label within its own image, in order of appearance.
# Counting per file means an image with a missing or extra label does not
# shift the numbering of every image after it.
order_values = df.groupby('file_name', sort=False).cumcount() + 1
df['order'] = order_values

# Split the bounding box into its two corner points.
boxes = df['position'].map(_parse_box)
df['top_left'] = boxes.map(lambda box: box[:2])
df['bottom_right'] = boxes.map(lambda box: box[2:])

df = df.drop(columns=['position'])

df.to_csv('labeled_data_cleaned.csv', index=False)

df.head()

Unnamed: 0,file_name,character,order,top_left,bottom_right
0,./CMC_CCM_2018_01_30_23_57_49_55860710841.png,5,1,"(1405.0, 1324.0)","(1495.0, 1383.0)"
1,./CMC_CCM_2018_01_30_23_57_49_55860710841.png,5,2,"(1406.0, 1268.0)","(1497.0, 1320.0)"
2,./CMC_CCM_2018_01_30_23_57_49_55860710841.png,8,3,"(1406.0, 1205.0)","(1496.0, 1265.0)"
3,./CMC_CCM_2018_01_30_23_57_49_55860710841.png,6,4,"(1403.0, 1152.0)","(1492.0, 1206.0)"
4,./CMC_CCM_2018_01_30_23_57_49_55860710841.png,0,5,"(1401.0, 1101.0)","(1491.0, 1152.0)"


In [39]:
# Read the cleaned label file back from disk and preview its first rows.
data = pd.read_csv('./labeled_data_cleaned.csv')
data.head()

Unnamed: 0,file_name,character,order,top_left,bottom_right
0,./CMC_CCM_2018_01_30_23_57_49_55860710841.png,5,1,"(1405.0, 1324.0)","(1495.0, 1383.0)"
1,./CMC_CCM_2018_01_30_23_57_49_55860710841.png,5,2,"(1406.0, 1268.0)","(1497.0, 1320.0)"
2,./CMC_CCM_2018_01_30_23_57_49_55860710841.png,8,3,"(1406.0, 1205.0)","(1496.0, 1265.0)"
3,./CMC_CCM_2018_01_30_23_57_49_55860710841.png,6,4,"(1403.0, 1152.0)","(1492.0, 1206.0)"
4,./CMC_CCM_2018_01_30_23_57_49_55860710841.png,0,5,"(1401.0, 1101.0)","(1491.0, 1152.0)"


In [40]:
# Display the reloaded label table as the cell's output.
data

Unnamed: 0,file_name,character,order,top_left,bottom_right
0,./CMC_CCM_2018_01_30_23_57_49_55860710841.png,5,1,"(1405.0, 1324.0)","(1495.0, 1383.0)"
1,./CMC_CCM_2018_01_30_23_57_49_55860710841.png,5,2,"(1406.0, 1268.0)","(1497.0, 1320.0)"
2,./CMC_CCM_2018_01_30_23_57_49_55860710841.png,8,3,"(1406.0, 1205.0)","(1496.0, 1265.0)"
3,./CMC_CCM_2018_01_30_23_57_49_55860710841.png,6,4,"(1403.0, 1152.0)","(1492.0, 1206.0)"
4,./CMC_CCM_2018_01_30_23_57_49_55860710841.png,0,5,"(1401.0, 1101.0)","(1491.0, 1152.0)"
...,...,...,...,...,...
325,./CMC_CCM_2018_01_30_23_53_02_55860760788.png,6,7,"(1243.0, 762.0)","(1325.0, 814.0)"
326,./CMC_CCM_2018_01_30_23_53_02_55860760788.png,0,8,"(1245.0, 720.0)","(1320.0, 768.0)"
327,./CMC_CCM_2018_01_30_23_53_02_55860760788.png,7,9,"(1240.0, 662.0)","(1316.0, 716.0)"
328,./CMC_CCM_2018_01_30_23_53_02_55860760788.png,8,10,"(1246.0, 606.0)","(1321.0, 653.0)"


In [41]:
def check_character_alignment(csv_file_path, n_characters=11):
    """List the files whose labelled characters disagree with their file name.

    For every ``file_name`` in the CSV, the labelled characters are joined in
    ascending ``order`` and compared with the last ``n_characters`` characters
    of the file name once its extension has been removed.

    Args:
        csv_file_path: Path to a CSV with ``file_name``, ``character`` and
            ``order`` columns.
        n_characters: How many trailing characters of the file-name stem to
            compare against. Defaults to 11.

    Returns:
        A list of file names (in sorted group order) whose joined characters
        do not equal the expected suffix. Empty when everything matches.
    """
    # Read 'character' as text: for all-digit labels pandas would otherwise
    # infer integers, and str.join() rejects non-string items.
    df = pd.read_csv(csv_file_path, dtype={'character': str})

    # List to store names of problematic files
    problematic_files = []

    # Process each file in the DataFrame
    for file_name, group in df.groupby('file_name'):
        # Drop the extension whatever its length ('.png', '.jpeg', ...),
        # then keep the trailing n_characters of what is left.
        stem, _extension = os.path.splitext(file_name)
        last_characters = stem[-n_characters:] if n_characters > 0 else ''

        # Concatenate characters in label order; a missing label contributes
        # nothing, so the file is reported as a mismatch rather than crashing.
        ordered = group.sort_values(by='order')['character'].fillna('')
        concatenated_characters = ''.join(ordered.tolist())

        # Check if the concatenated characters match the expected suffix
        if concatenated_characters != last_characters:
            problematic_files.append(file_name)

    return problematic_files

# Path to the cleaned label CSV that will be validated
csv_file_path = './labeled_data_cleaned.csv'  # Replace with your actual CSV file path

# Collect the files whose labelled characters do not match their file name
problematic_files = check_character_alignment(csv_file_path)

# Print the mismatching file names; an empty list means none were found
print("Problematic Files:", problematic_files)


Problematic Files: []
