# Imports and path preparation

In [None]:
import glob
import os
from collections import Counter
from pathlib import Path
from random import Random
from PIL import Image
import shutil
from IPython.core.display import HTML

In [2]:
# Root folder that holds the `datasets/` directory and receives the merged class folders.
# Set the SZUM_PATH environment variable to run the notebook on another machine; when it
# is not set, the location below is used.
main_szum_path = os.environ.get("SZUM_PATH", "C:/Users/kubix23/OneDrive/Pulpit/Szum")

In [3]:
# Display name of each source dataset paired with the folder it lives in under
# <root>/datasets. Insertion order is the order in which later cells process them.
datasets = dict(
    [
        ("Recognize Animals", f"{main_szum_path}/datasets/data1"),
        ("Cigarette Smoker Detection", f"{main_szum_path}/datasets/data2"),
        ("Smoker Detection [Image] classification Dataset", f"{main_szum_path}/datasets/data3"),
    ]
)

# Merging the test/valid/train splits within each dataset

## Smoker Detection [Image] classification Dataset separation

In [4]:
# Flatten the "Smoker Detection" dataset: every image found anywhere below the dataset
# folder is moved into <dataset>/not_smoking or <dataset>/smoking, chosen by the prefix of
# its file name.
# Resolved once so the f-strings below need no nested quotes.
smoker_root = datasets["Smoker Detection [Image] classification Dataset"]

# (class folder, file-name pattern). A glob pattern has to match the whole file name, so
# "smoking*.*" does not pick up names that start with "notsmoking".
for class_dir_name, name_pattern in (
    ("not_smoking", "notsmoking*.*"),
    ("smoking", "smoking*.*"),
):
    class_dir = Path(smoker_root) / class_dir_name
    # exist_ok makes re-running the cell harmless; a missing dataset folder still raises.
    class_dir.mkdir(exist_ok=True)

    for file in glob.glob(f"{smoker_root}/**/{name_pattern}", recursive=True):
        source = Path(file)
        # The recursive glob also returns matching directories and, on a re-run, the files
        # that already sit in the class folder; neither should be moved.
        if not source.is_file() or source.parent == class_dir:
            continue
        source.rename(class_dir / source.name)

## Recognize Animals separation

In [5]:
# Flatten the "Recognize Animals" dataset: every file found anywhere below the dataset
# folder is moved into <dataset>/other, the single class this dataset contributes.
animals_root = datasets["Recognize Animals"]
other_dir = Path(animals_root) / "other"
# exist_ok makes re-running the cell harmless; a missing dataset folder still raises.
other_dir.mkdir(exist_ok=True)

# "*.*" is not limited to image extensions, so any file with a dot in its name is moved.
for file in glob.glob(f"{animals_root}/**/*.*", recursive=True):
    source = Path(file)
    # Skip directories whose name contains a dot (moving one would invalidate the paths of
    # the files inside it) and files that are already in the target folder.
    if not source.is_file() or source.parent == other_dir:
        continue
    source.rename(other_dir / source.name)

# Dataset statistics

In [6]:
# For every dataset collect the file list of each class folder, the list of everything
# below the dataset folder, and a PIL image object for each entry that opens as an image.
# The statistics cell reads only `.size` and `.format` from those image objects.
datasets_lists = []
for key, value in datasets.items():
    print(f"Processing: {key}")
    smoking_list = glob.glob(f"{value}/smoking/*.*")
    not_smoking_list = glob.glob(f"{value}/not_smoking/*.*")
    other_list = glob.glob(f"{value}/other/*.*")
    all_list = glob.glob(f"{value}/**/*.*", recursive=True)

    image_list = []
    unreadable_list = []
    for candidate in all_list:
        try:
            with Image.open(candidate) as img:
                image_list.append(img)
        except Exception:
            # Best effort on purpose: the "*.*" glob is not restricted to images. The path
            # is kept so that skipped entries can be inspected instead of vanishing.
            unreadable_list.append(candidate)
    if unreadable_list:
        print(f"  Skipped {len(unreadable_list)} path(s) that could not be opened as images")

    datasets_lists.append({
        "key": key,
        "smoking_list": smoking_list,
        "not_smoking_list": not_smoking_list,
        "other_list": other_list,
        "image_list": image_list,
        "all_list": all_list,
        "unreadable_list": unreadable_list,
    })

Processing: Recognize Animals
Processing: Cigarette Smoker Detection
Processing: Smoker Detection [Image] classification Dataset


In [7]:
def _percent(part, whole):
    """Return `part` as a percentage of `whole`, or 0.0 when `whole` is 0."""
    return part / whole * 100 if whole else 0.0


# Render one HTML summary per dataset: class counts with their share of all globbed paths,
# the largest/smallest image size and the number of images per file format.
for data in datasets_lists:
    size_list = [image.size for image in data["image_list"]]
    type_count_list = Counter(image.format for image in data["image_list"])

    total = len(data["all_list"])
    smoking_count = len(data["smoking_list"])
    not_smoking_count = len(data["not_smoking_list"])
    other_count = len(data["other_list"])

    # NOTE(review): sizes are (width, height) tuples, so max/min compare the width first
    # and use the height only to break ties - confirm that this is the intended ordering.
    # The default keeps a dataset without readable images from raising ValueError.
    max_size = max(size_list, default="n/a")
    min_size = min(size_list, default="n/a")
    type_items = "".join(f"<li>{k}: {v}</li>" for k, v in type_count_list.items())

    display(HTML(f"""
        <h1>{data["key"]}</h1>
        <p><b>Smoking count:</b> {smoking_count} ({_percent(smoking_count, total)}%)</p>
        <p><b>Not smoking count:</b> {not_smoking_count} ({_percent(not_smoking_count, total)}%)</p>
        <p><b>Other count:</b> {other_count} ({_percent(other_count, total)}%)</p>
        <p><b>Max size:</b> {max_size}; <b>Min size:</b> {min_size}</p>
        <p><b>Type count:</b></p> <ul>{type_items}</ul>
    """))

# Merging all datasets to one

In [8]:
# Destination folders of the merged dataset, one per class, directly under the root.
# The merge cell uses each folder's base name as the prefix of the "<name>_list" key it
# reads from the per-dataset dictionaries.
paths = [f"{main_szum_path}/{class_name}" for class_name in ("not_smoking", "smoking", "other")]

In [9]:
# Copy every class list of every dataset into the matching merged class folder.

# The destination folders are needed exactly once, so create them before the copy loop.
for class_dir in paths:
    Path(class_dir).mkdir(exist_ok=True)

# destination path -> source path copied there during this run, used to spot two
# different source files that end up under the same name.
copied_from = {}
for data in datasets_lists:
    for class_dir in paths:
        class_name = os.path.basename(class_dir)
        for file in data[f"{class_name}_list"]:
            destination = f"{class_dir}/{os.path.basename(file)}"
            earlier_source = copied_from.get(destination)
            if earlier_source is not None and earlier_source != file:
                # shutil.copy replaces an existing destination; report it so that the
                # lost image is visible rather than silently missing from the counts.
                print(f"Name collision: {file} overwrites the copy of {earlier_source}")
            shutil.copy(file, destination)
            copied_from[destination] = file

In [10]:
# Count the entries matching "*.*" in each merged class folder and show every class's
# share of the total.
lista = [(os.path.basename(path), len(glob.glob(f"{path}/*.*"))) for path in paths]
all_count = sum(count for _, count in lista)
# When all folders are empty the total is 0; show 0.0% instead of raising ZeroDivisionError.
rows = "".join(
    f"<li><b>{name}:</b> {count} ({count / all_count * 100 if all_count else 0.0}%)</li>"
    for name, count in lista
)
display(HTML(f"""
    <h1>Merged datasets</h1>
    <ul>{rows}</ul>
"""))

## Filter images based on their size

In [11]:
# Images whose width or height is below this many pixels are moved out of the class folder.
MIN_SIDE_PX = 200

# Move every too-small image of each class folder into its "small_photo" sub-folder.
for path in paths:
    small_photo = []
    for filename in os.listdir(path):
        filepath = os.path.join(path, filename)
        if not os.path.isfile(filepath):
            continue
        try:
            with Image.open(filepath) as img:
                width, height = img.size
        except Exception as e:
            # Files that are not images end up here; report them and keep going.
            print(f"Error in open {filename}: {e}")
            continue
        if width < MIN_SIDE_PX or height < MIN_SIDE_PX:
            small_photo.append(filepath)

    # Built with pathlib so that the folder created here and the rename target below are
    # the same path on every operating system.
    small_dir = Path(path) / "small_photo"
    small_dir.mkdir(exist_ok=True)
    # The moves happen only after the scan, when every image file has been closed again.
    for file in small_photo:
        Path(file).rename(small_dir / os.path.basename(file))

Nie udało się otworzyć filenames.txt: cannot identify image file 'C:/Users/kubix23/OneDrive/Pulpit/Szum/other\\filenames.txt'
Nie udało się otworzyć filenames_elefante_train.txt: cannot identify image file 'C:/Users/kubix23/OneDrive/Pulpit/Szum/other\\filenames_elefante_train.txt'


In [12]:
# Recount the entries matching "*.*" that are still directly inside each merged class
# folder after the size filter and show every class's share of the total.
lista = [(os.path.basename(path), len(glob.glob(f"{path}/*.*"))) for path in paths]
all_count = sum(count for _, count in lista)
# When nothing is left the total is 0; show 0.0% instead of raising ZeroDivisionError.
rows = "".join(
    f"<li><b>{name}:</b> {count} ({count / all_count * 100 if all_count else 0.0}%)</li>"
    for name, count in lista
)
display(HTML(f"""
    <h1>Merged datasets after size filter</h1>
    <ul>{rows}</ul>
"""))

## Filter images based on their format/type

In [13]:
# Keep only files with a .jpg/.jpeg extension in each class folder; everything else is
# moved into that folder's "not_jpg" sub-folder. The check looks at the file name only.
for class_dir in paths:
    try:
        os.mkdir(f"{class_dir}/not_jpg")
    except FileExistsError:
        pass

    # Take the listing first, then move files, so the folder is not changed while it is
    # being listed.
    plain_files = [
        entry for entry in os.listdir(class_dir)
        if os.path.isfile(os.path.join(class_dir, entry))
    ]
    for entry in plain_files:
        if Path(entry).suffix.lower() in ['.jpg', '.jpeg']:
            continue
        Path(f"{class_dir}/{entry}").rename(f"{class_dir}/not_jpg/{entry}")

In [14]:
# Recount the entries matching "*.*" that are still directly inside each merged class
# folder after the format filter and show every class's share of the total.
lista = [(os.path.basename(path), len(glob.glob(f"{path}/*.*"))) for path in paths]
all_count = sum(count for _, count in lista)
# When nothing is left the total is 0; show 0.0% instead of raising ZeroDivisionError.
rows = "".join(
    f"<li><b>{name}:</b> {count} ({count / all_count * 100 if all_count else 0.0}%)</li>"
    for name, count in lista
)
display(HTML(f"""
    <h1>Merged datasets after format filter</h1>
    <ul>{rows}</ul>
"""))

In [48]:
# Draft of the per-class sampling step: shuffle the files of every class folder with a
# seeded generator and report how many a cap of 1500 files per class would keep.
# The name `random` is bound to a Random instance here, not to the `random` module.
random = Random()
random.seed(6042025)  # fixed seed: the same input listing is shuffled the same way
selected_images = []  # not filled anywhere in the visible part of this cell

for path in paths:
    # NOTE(review): os.mkdir() is called without a path, which raises TypeError - an
    # exception the `except FileExistsError` below does not catch. The name of the folder
    # to create still has to be filled in.
    try:
        os.mkdir()
    except FileExistsError:
        pass
    # Plain files directly inside the class folder; sub-folders are left out.
    filename_list = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
    random.shuffle(filename_list)
    # Number of files kept when at most the first 1500 shuffled names are taken.
    print(len(filename_list[:1500]))