In [None]:
import os
from pathlib import Path
from collections import Counter
import random
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Dataset location. Set the LANDMARK_DATA_DIR environment variable to point the
# notebook at a different copy of the data; when it is unset, the original local
# path is used unchanged.
data_dir = Path(os.environ.get(
    'LANDMARK_DATA_DIR',
    r'D:\workspace\Self-learn\Future Career\Pytorch\vietnamese-landmark-classifier\data',
))
train_dir = data_dir/'train'

# glob() yields paths in a filesystem-dependent order. Sorting gives a stable
# list, so the seeded random picks made from it select the same files on every
# run and on every machine.
image_path_list = sorted(train_dir.glob("*/*.jpg"))

In [None]:
# Print a random image
# Seed the global RNG so the same list index is drawn on every run; which file
# that index refers to still depends on the order of image_path_list.
random.seed(42)
random_image_path = random.choice(image_path_list)
# Image.open returns a PIL image object for the chosen file.
random_image = Image.open(random_image_path)
# Print the path so the picked file can be identified.
print(random_image_path)
# Bare last expression: in a notebook this is rendered as the cell's output.
random_image

In [None]:
# Plot 4 random images (2x2 grid), each titled with its class folder name
random.seed(4)
n_rows, n_cols = 2, 2
# random.sample raises ValueError when asked for more items than the list
# holds, so cap the request at the number of available images. With 4 or more
# images this draws exactly the same sample as a fixed k=4.
n_images = min(n_rows * n_cols, len(image_path_list))
random_image_paths = random.sample(image_path_list, n_images)

fig, axes = plt.subplots(n_rows, n_cols, figsize = (10,10))
flat_axes = axes.flatten()

for ax, image_path in zip(flat_axes, random_image_paths):
    # Open with a context manager so the file is closed as soon as it has been
    # drawn; imshow reads the pixels while the file is still open.
    with Image.open(image_path) as img:
        ax.imshow(img)
    ax.axis('off')
    # The parent directory name is the class label.
    ax.set_title(image_path.parent.name)

# Blank any panels that did not receive an image (only when fewer than 4 exist).
for ax in flat_axes[n_images:]:
    ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Number of images per class

# Every sub-directory of train_dir is one class. iterdir() yields entries in
# arbitrary order, so sort them to keep the bar order identical between runs.
class_dirs = sorted(p for p in train_dir.iterdir() if p.is_dir())

# Count the .jpg files in each class directory without building throwaway lists.
class_counts = {p.name: sum(1 for _ in p.glob("*.jpg")) for p in class_dirs}

# Pass explicit lists (not dict views) so labels and heights are plain sequences.
plt.bar(list(class_counts.keys()), list(class_counts.values()))
plt.xticks(rotation = 45, ha='right')
plt.title("Number of images per class")
plt.ylabel("Number of images")
plt.show()

In [None]:
# Size distribution: tally the size of every training image

sizes = []
for image_path in train_dir.glob('*/*.jpg'):
    # The context manager closes each file right after its size is read.
    with Image.open(image_path) as img:
        sizes += [img.size]

# Count how often each distinct size occurs and report the five most frequent.
size_counts = Counter(sizes)
top_sizes = size_counts.most_common(5)
print("Top 5 most popular sizes:\n{}".format(top_sizes))

In [None]:
# Check for bad images
bad_files = []
bad_file_errors = {}  # bad path -> repr of the exception raised for it
# Sorted so the report lists files in the same order on every run.
for img_path in sorted(train_dir.glob("*/*")):
    # "*/*" also matches sub-directories; opening one raises and would be
    # misreported as a bad image, so only regular files are checked.
    if not img_path.is_file():
        continue
    try:
        with Image.open(img_path) as img:
            # verify only: checks the file for breakage without decoding the
            # image data, so this is a file-level check, not a full decode.
            img.verify()
    except Exception as e:
        # Deliberately broad: any failure to open or verify marks the file as
        # bad. Keep the reason instead of discarding it.
        bad_files.append(img_path)
        bad_file_errors[img_path] = repr(e)
print("Bad images:", bad_files)
# One line per bad file with the reason it failed (prints nothing if all pass).
for bad_path, reason in bad_file_errors.items():
    print("  {}: {}".format(bad_path, reason))