# Import and some path preparation

In [1]:
import glob
import os
from collections import Counter
from pathlib import Path
from random import Random

import numpy as np
from PIL import Image
import shutil
from IPython.core.display import HTML
from matplotlib import pyplot as plt
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [2]:
main_szum_path = "C:/Users/kubix23/OneDrive/Pulpit/Szum"

In [3]:
datasets = {
    "Recognize Animals": f"{main_szum_path}/datasets/data1",
    "Unsplash": f"{main_szum_path}/datasets/data4",
    "Smoker Detection [Image] classification Dataset": f"{main_szum_path}/datasets/data3",
}

# Merging test/valid/train in datasets

## Smoker Detection [Image] classification Dataset separation

In [11]:
try:
    os.mkdir(f"{datasets["Smoker Detection [Image] classification Dataset"]}/not_smoking")
except FileExistsError:
    pass
try:
    os.mkdir(f"{datasets["Smoker Detection [Image] classification Dataset"]}/smoking")
except FileExistsError:
    pass

lista = glob.glob(f"{datasets["Smoker Detection [Image] classification Dataset"]}/**/notsmoking*.*",recursive=True)
for file in lista:
    Path(file).rename(f"{datasets["Smoker Detection [Image] classification Dataset"]}/not_smoking/{os.path.basename(file)}")

lista = glob.glob(f"{datasets["Smoker Detection [Image] classification Dataset"]}/**/smoking*.*",recursive=True)
for file in lista:
    Path(file).rename(f"{datasets["Smoker Detection [Image] classification Dataset"]}/smoking/{os.path.basename(file)}")

## Recognize Animals separation

In [12]:
try:
    os.mkdir(f"{datasets["Recognize Animals"]}/none")
except FileExistsError:
    pass

lista = glob.glob(f"{datasets["Recognize Animals"]}/**/*.*",recursive=True)
for file in lista:
    Path(file).rename(f"{datasets["Recognize Animals"]}/none/{os.path.basename(file)}")

# Dataset statistics

In [6]:
datasets_lists = []
for key, value in datasets.items():
    print(f"Processing: {key}")
    smoking_list = glob.glob(f"{value}/smoking/*.*")
    not_smoking_list = glob.glob(f"{value}/not_smoking/*.*")
    none_list = glob.glob(f"{value}/none/*.*")
    all_list = glob.glob(f"{value}/**/*.*",recursive=True)

    image_list = []
    for i in all_list:
        try:
            with Image.open(i) as img:
                img.convert('RGB')
                image_list.append(img)
        except Exception as e:
            pass
    datasets_lists.append({
        "key": key,
        "smoking_list": smoking_list,
        "not_smoking_list": not_smoking_list,
        "none_list": none_list,
        "image_list": image_list,
        "all_list": all_list,
    })

Processing: Recognize Animals
Processing: Unsplash
Processing: Smoker Detection [Image] classification Dataset


In [14]:
for data in datasets_lists:
    size_list = [i.size for i in data["image_list"]]
    type_count_list = Counter([i.format for i in data["image_list"]])
    display()
    display(HTML(f"""
        <h1>{data["key"]}</h1>
        <p><b>Smoking count:</b> {len(data["smoking_list"])} ({len(data["smoking_list"])/len(data["all_list"]) * 100}%)</p>
        <p><b>Not smoking count:</b> {len(data["not_smoking_list"])} ({len(data["not_smoking_list"])/len(data["all_list"]) * 100}%)</p>
        <p><b>none count:</b> {len(data["none_list"])} ({len(data["none_list"])/len(data["all_list"]) * 100}%)</p>
        <p><b>Max size:</b> {max(size_list)}; <b>Min size:</b> {min(size_list)}</p>
        <p><b>Type count:</b></p> <ul>{"".join([f"<li>{k}: {v}</li>" for k,v in type_count_list.items()])}</ul>
    """))

# Merging all datasets to one

In [4]:
paths = [
    f"{main_szum_path}/not_smoking",
    f"{main_szum_path}/smoking",
    f"{main_szum_path}/none"
]

In [16]:
for data in datasets_lists:
    for i in paths:
        try:
            os.mkdir(i)
        except FileExistsError:
            pass
        for file in data[f"{os.path.basename(i)}_list"]:
            shutil.copy(file, f"{i}/{os.path.basename(file)}")

In [17]:
lista = [(os.path.basename(path),len(glob.glob(f"{path}/*.*"))) for path in paths]
all_count = sum([v for _,v in lista])
display(HTML(f"""
    <h1>Merged datasets</h1>
    <ul>{"".join([f"<li><b>{k}:</b> {v} ({v/all_count *100}%)</li>" for k, v in lista])}</ul>
"""))

## Filter images based on their size

In [18]:
for path in paths:
    small_photo = []
    for filename in os.listdir(path):
        filepath = os.path.join(path, filename)
        if os.path.isfile(filepath):
            try:
                with Image.open(filepath) as img:
                    width, height = img.size
                    if width < 200 or height < 200:
                        small_photo.append(filepath)
            except Exception as e:
                print(f"Error in open {filename}: {e}")

    try:
        os.mkdir(f"{path}\\small_photo")
    except FileExistsError:
        pass
    for file in small_photo:
        Path(file).rename(f"{path}/small_photo/{os.path.basename(file)}")

Error in open filenames.txt: cannot identify image file 'C:/Users/kubix23/OneDrive/Pulpit/Szum/none\\filenames.txt'
Error in open filenames_elefante_train.txt: cannot identify image file 'C:/Users/kubix23/OneDrive/Pulpit/Szum/none\\filenames_elefante_train.txt'


In [19]:
lista = [(os.path.basename(path),len(glob.glob(f"{path}/*.*"))) for path in paths]
all_count = sum([v for _,v in lista])
display(HTML(f"""
    <h1>Merged datasets after size filter</h1>
    <ul>{"".join([f"<li><b>{k}:</b> {v} ({v/all_count *100}%)</li>" for k, v in lista])}</ul>
"""))

## Filter images based on their format/type

In [20]:
for path in paths:
    try:
        os.mkdir(f"{path}/not_jpg")
    except FileExistsError:
        pass
    for filename in [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]:
        if Path(filename).suffix.lower() not in ['.jpg', '.jpeg']:
            Path(f"{path}/{filename}").rename(f"{path}/not_jpg/{filename}")

In [21]:
lista = [(os.path.basename(path),len(glob.glob(f"{path}/*.*"))) for path in paths]
all_count = sum([v for _,v in lista])
display(HTML(f"""
    <h1>Merged datasets after format filter</h1>
    <ul>{"".join([f"<li><b>{k}:</b> {v} ({v/all_count *100}%)</li>" for k, v in lista])}</ul>
"""))

In [9]:
def get_all_images():
    result = []
    for i, path in enumerate(paths):
        for j in glob.glob(f"{path}/*.*"):
            try:
                with Image.open(j) as img:
                    result.append((np.array(img), i))
            except Exception as e:
                pass
    return result
image_list = get_all_images()

In [10]:
def standardization(dataset):
    mean = np.mean((np.concatenate([arr.flatten() for arr in [i for i,j in image_list]])))
    std = np.std((np.concatenate([arr.flatten() for arr in [i for i,j in image_list]])))
    return [((image - mean) / std,label) for image, label in image_list]

def normalization(dataset):
    return [(image / 255.0, label) for image, label in image_list]

display(standardization(image_list))

[(array([[[ 1.16049357,  1.03415599,  0.93589342],
          [ 1.14645606,  1.02011848,  0.92185591],
          [ 1.13241855,  1.00608097,  0.87974338],
          ...,
          [ 1.45528127,  0.27613048,  0.12171788],
          [ 1.46931878,  0.3042055 ,  0.10768037],
          [ 1.44124376,  0.27613048,  0.07960535]],
  
         [[ 1.17453108,  1.0481935 ,  0.94993093],
          [ 1.17453108,  1.0481935 ,  0.94993093],
          [ 1.16049357,  1.03415599,  0.93589342],
          ...,
          [ 1.46931878,  0.29016799,  0.13575539],
          [ 1.46931878,  0.3042055 ,  0.10768037],
          [ 1.46931878,  0.29016799,  0.13575539]],
  
         [[ 1.2026061 ,  1.10434353,  0.99204346],
          [ 1.18856859,  1.09030602,  0.97800595],
          [ 1.18856859,  1.06223101,  0.96396844],
          ...,
          [ 1.48335629,  0.3042055 ,  0.1497929 ],
          [ 1.48335629,  0.3042055 ,  0.1497929 ],
          [ 1.48335629,  0.33228052,  0.17786792]],
  
         ...,
  
        

In [24]:
def create_split(dataset):
    train_set = dataset[:int(len(dataset)*0.8)]
    val_set = dataset[int(len(dataset)*0.8):int(len(dataset)*0.9)]
    test_set = dataset[int(len(dataset)*0.9):int(len(dataset)-1)]
    return train_set, val_set, test_set

In [26]:
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.2,
    horizontal_flip=True,
    brightness_range=[0.8, 1.2],
    shear_range=0.2,
    fill_mode='nearest'
)
for path in paths:
    try:
        os.mkdir(f"{path}/augmented")
        i = 0
        gen = datagen.flow_from_directory(path, save_to_dir=path+'/augmented',classes=['.'], batch_size=1)
        for i in range(5):
            list(gen)
    except FileExistsError:
        pass

Found 1060 images belonging to 1 classes.


KeyboardInterrupt: 

In [None]:
image_list = get_all_images()

def create_split_2(dataset):
    temp = {}
    for array, label in dataset:
        temp[label].append((array, label))
    newdataset = []
    for label,values in temp.values():
        newdataset.extend(values[:1060])
    standardization_image_list = standardization(newdataset)
    return create_split(standardization_image_list)

In [None]:
def create_split_3(dataset):
    temp = {}
    for array, label in dataset:
        temp[label].append((array, label))
    dataset = []
    for label,values in temp.values():
        dataset.extend(values[:1060])
    standardization_image_list = standardization(dataset)
    train_set = dataset[:int(len(standardization_image_list)*0.9)]
    val_set = dataset[int(len(standardization_image_list)*0.8):int(len(standardization_image_list)*0.9)]
    test_set = dataset[int(len(standardization_image_list)*0.9):int(len(standardization_image_list)-1)]
    return train_set, val_set, test_set