In [None]:
import glob
import pandas as pd

In [None]:
# Root directory that is searched recursively for PNG images; the name of the
# folder directly containing each image is later used as that image's label.
dataset_dir = "../data/images/2022-01-05_21:58:53_preprocessed"

In [None]:
# Collect every PNG below the dataset root. glob returns matches in arbitrary
# (filesystem-dependent) order, so sort them to keep the row order stable
# from one run to the next.
paths = sorted(glob.glob(f"{dataset_dir}/**/*.png", recursive=True))
if not paths:
    # Fail here with a clear message instead of letting later cells break on
    # an empty frame.
    raise FileNotFoundError(f"No PNG images found under {dataset_dir!r}")

df = pd.DataFrame({"path": paths})
# Fine-grained label: name of the folder that directly contains the image.
df["label"] = df["path"].apply(lambda p: p.split("/")[-2])
# Coarse label: the part of the fine-grained label before the first underscore.
df["coarse_label"] = df["label"].apply(lambda l: l.split("_")[0])
df.head()

In [None]:
# Bar chart of the number of images per fine-grained label.
label_counts = df["label"].value_counts()
label_counts.plot(kind="bar")

In [None]:
from PIL import Image
from tqdm.notebook import tqdm

tqdm.pandas()


def _load_image(path: str) -> Image.Image:
    """Read the image at ``path`` fully into memory and release its file handle.

    ``Image.open`` is lazy: it only reads the header and keeps the file open
    until the pixel data is needed. Applying it directly to every row leaves
    one open file per image, which can exhaust the process's file-descriptor
    limit on larger datasets.

    Args:
        path: Location of the image file.

    Returns:
        An in-memory copy of the image that no longer depends on the file.
    """
    with Image.open(path) as img:
        # copy() forces the pixel data to be decoded and returns a detached
        # image, so the file can be closed safely when the block exits.
        return img.copy()


df["image"] = df["path"].progress_apply(_load_image)

In [None]:
import matplotlib.pyplot as plt

# PIL reports size as (width, height).
df["width"] = df["image"].apply(lambda img: img.size[0])
df["height"] = df["image"].apply(lambda img: img.size[1])
df["area"] = df["width"] * df["height"]

# Create the axes explicitly and hand them to pandas. A bare plt.figure()
# before DataFrame.plot.scatter leaves an empty figure in the output, because
# pandas opens a figure of its own when no axes are passed.
fig_scatter, ax_scatter = plt.subplots()
df.plot.scatter("width", "height", ax=ax_scatter)
fig_hist, ax_hist = plt.subplots()
df["area"].plot.hist(bins=100, ax=ax_hist)

In [None]:
# Keep only images whose area (width * height) exceeds 10000. The explicit
# copy() makes the result an independent frame, so adding columns to it in
# later cells does not trigger pandas' SettingWithCopyWarning.
df = df.loc[df["area"] > 10000, :].copy()
df["area"].plot.hist(bins=100)

In [None]:
import numpy as np


def compute_histograms(img: Image.Image, bins: int = 50) -> np.ndarray:
    """Build density histograms of an image's hue, saturation and value channels.

    The image is converted to HSV and each of the three channels is binned
    over the range 0..255 with ``density=True``.

    Args:
        img: Image to analyse; it must support ``convert("HSV")``.
        bins: Number of histogram bins per channel.

    Returns:
        Array of shape ``(3, bins)``; rows are hue, saturation, value in that order.
    """
    hsv_pixels = np.array(img.convert("HSV"))
    channel_densities = [
        np.histogram(
            hsv_pixels[:, :, channel], range=(0, 255), bins=bins, density=True
        )[0]
        for channel in range(3)
    ]
    return np.stack(channel_densities)


def show(df: pd.DataFrame, i: int) -> None:
    """Display one sample: its label, a 128x128 thumbnail and its HSV histograms.

    Args:
        df: Frame with at least a "label" and an "image" column.
        i: Index label of the row to show. The row is looked up with
            ``df.loc``, so this is a label, not a position.

    Returns:
        None. The label is printed, the thumbnail displayed and the
        histograms plotted as side effects.
    """
    row = df.loc[i, :]
    print(row["label"])
    display(row["image"].resize((128, 128)))
    histograms = compute_histograms(row["image"])
    # Draw on a fresh figure so repeated calls in one cell do not pile their
    # curves onto the same axes.
    _, ax = plt.subplots()
    # histograms has one row per channel; transpose so each channel is one line.
    ax.plot(histograms.T)
    ax.legend(["hue", "saturation", "value"])


In [None]:
# Compute the hue/saturation/value histograms of every image in df (with a
# progress bar) and store each resulting array in the "histograms" column.
df.loc[:, "histograms"] = df["image"].progress_apply(compute_histograms)

In [None]:
# Flatten each image's stacked histograms into a single 1-D feature vector.
df["features"] = df["histograms"].apply(np.ravel)

In [None]:
# Feature matrix: one row per image, one column per histogram bin.
X = np.stack(df.loc[:, "features"])
# Covariance between feature columns; np.cov treats rows as variables, hence
# the transpose.
c = np.cov(X.T)
plt.figure(figsize=(7, 7))
# Show covariance magnitudes on a log scale. Entries that are exactly zero
# (e.g. a bin that is constant across all images) become -inf; compute the
# log under errstate so they do not raise numpy's divide-by-zero
# RuntimeWarning. The plotted values are unchanged.
with np.errstate(divide="ignore"):
    log_abs_cov = np.log(np.abs(c))
plt.imshow(log_abs_cov)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the rows for testing. Fixing random_state makes the split,
# and therefore the reported accuracies, reproducible across kernel restarts.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
# Inputs are the flattened histogram features; targets are the coarse labels.
X_train = np.stack(df_train["features"])
y_train = np.stack(df_train["coarse_label"])
X_test = np.stack(df_test["features"])
y_test = np.stack(df_test["coarse_label"])

def train(estimator):
    """Fit ``estimator`` on the training split and report train/test results.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    arrays. For each split a titled confusion matrix is displayed and the
    accuracy is printed.

    Args:
        estimator: Object exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place.
    """
    # Imported here so the function does not rely on a later cell having
    # imported ConfusionMatrixDisplay before the first call.
    from sklearn.metrics import ConfusionMatrixDisplay

    estimator.fit(X_train, y_train)
    y_train_pred = estimator.predict(X_train)
    y_test_pred = estimator.predict(X_test)

    # Title each matrix so the two figures can be told apart in the output.
    train_disp = ConfusionMatrixDisplay.from_predictions(y_train, y_train_pred)
    train_disp.ax_.set_title("Train")
    display(train_disp)
    print(f"Train accuracy: {np.mean(y_train == y_train_pred)}")

    test_disp = ConfusionMatrixDisplay.from_predictions(y_test, y_test_pred)
    test_disp.ax_.set_title("Test")
    display(test_disp)
    print(f"Test accuracy: {np.mean(y_test == y_test_pred)}")

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import ConfusionMatrixDisplay

# Gaussian naive Bayes classifier with default settings, fitted and evaluated
# on the histogram features through train().
gnb = GaussianNB()

train(gnb)

In [None]:
from sklearn.svm import SVC

# Support vector classifier with gamma fixed at 1000, fitted and evaluated
# through train().
# NOTE(review): gamma=1000 is hard-coded with no stated rationale — confirm it
# was tuned for these features.
svc = SVC(gamma=1000)

train(svc)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with max_features=5. The forest is built with randomness, so
# pin random_state to get the same model and the same reported accuracies on
# every run.
rfc = RandomForestClassifier(max_features=5, random_state=42)

train(rfc)