In [None]:
import os
from PIL import Image
import polars as pl
import altair as alt

train_csv = '/root/documentclassification/JL/datasets/train.csv'
train_img_dir = '/root/documentclassification/JL/datasets/train'
test_img_dir = '/root/documentclassification/JL/datasets/test'
meta_csv = '/root/documentclassification/JL/datasets/meta.csv'

train_target = pl.read_csv(train_csv) 
meta = pl.read_csv(meta_csv)

def get_image_sizes_from_dir(image_dir):
    sizes = []
    fnames = []
    for fname in os.listdir(image_dir):
        if fname.lower().endswith(('png', 'jpg', 'jpeg', 'bmp', 'tiff')):
            try:
                with Image.open(os.path.join(image_dir, str(fname))) as img:
                    sizes.append(img.size)
                    fnames.append(fname)
            except:
                pass
    df = pl.DataFrame(sizes, schema=['Width', 'Height'],orient="row")
    df = df.with_columns(
        pl.Series(fnames).alias('name')
    )
    return df

train_df = get_image_sizes_from_dir(train_img_dir)
test_df = get_image_sizes_from_dir(test_img_dir)

train_df = train_df.with_columns(pl.lit('Train').alias('Dataset'))
test_df = test_df.with_columns(pl.lit('Test').alias('Dataset'))

all_df = pl.concat([
    train_df.select(['name', 'Width', 'Height', 'Dataset']),
    test_df.select(['name', 'Width', 'Height', 'Dataset'])
])

dictionary = dict(zip(train_target['ID'], train_target['target']))
meta_dict = dict(zip(meta['target'], meta['class_name']))

all_df = all_df.with_columns(
    pl.col('name').map_elements(lambda x: dictionary.get(x, -1),return_dtype=pl.Int64).alias('target')
).with_columns(
    pl.col('target').map_elements(lambda x: meta_dict.get(x, 'None'),return_dtype=pl.String).alias('class_name')
)

In [21]:
all_df.plot.bar(
    x='Dataset',
    y='count():N'
).properties(
    width=500
)

In [22]:
all_df.head()

name,Width,Height,Dataset,target,class_name
str,i64,i64,str,i64,str
"""5eb8d197d228609e.jpg""",443,591,"""Train""",16,"""vehicle_registration_plate"""
"""716c2fced083a6a6.jpg""",443,591,"""Train""",12,"""prescription"""
"""37f9414beea68229.jpg""",443,591,"""Train""",14,"""statement_of_opinion"""
"""c8ef1b2fdb8dbace.jpg""",443,591,"""Train""",12,"""prescription"""
"""4606a9ccbc65f3e5.jpg""",443,591,"""Train""",10,"""payment_confirmation"""


In [23]:
all_df.filter(pl.col('Dataset') == 'Train').plot.bar(
    x='class_name',
    y='count():N'
).properties(
    width=500
)

In [32]:
dimension = all_df.filter(pl.col('Dataset') == 'Train').group_by('class_name')

wid = dimension.agg(pl.col(['Width','Height']).median()).plot.bar(
    x='class_name',
    y='Width'
)

hig = dimension.agg(pl.col(['Width','Height']).median()).plot.bar(
    x='class_name',
    y='Height'
)

wid | hig

rotate, crop, augraphy, cutout, perhaps mixup

In [None]:
import os

dir_path = 'datasets/train_augmented'

num_files = len([f for f in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, f))])

print(f"Number of files in {dir_path}: {num_files}")