In [1]:
%reload_ext autoreload
%autoreload 2
import os
import cv2
import numpy as np
import ray 
import matplotlib.pyplot as plt

from tqdm import tqdm
from inout import *
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

from scipy.spatial. distance import cosine, euclidean
from skimage.feature import hog

from random import choices

In [2]:
files = os.listdir("flowers/")

## Geometric Moment

In [3]:
@ray.remote
def get_moment(file, p, q):
    img = read_image(file).mean(axis=2)
    x = np.arange(img.shape[0], dtype=int)
    y = np.arange(img.shape[1], dtype=int)
    return np.sum( (x**p) * (y**q) * img)


def extract_moment_from_files(files, p, q):
    X = []
    
    res = []
    for file in files:
        res.append( get_moment.remote(file, p, q) )
    out = ray.get(res)
    
    for feat in tqdm(out):
        X.append(feat)
    return np.array(X)

In [4]:
ray.init()

2023-04-23 18:43:26,584	INFO worker.py:1538 -- Started a local Ray instance.


0,1
Python version:,3.9.7
Ray version:,2.2.0


In [6]:
m00s = extract_moment_from_files(files, 0, 0)
m10s = extract_moment_from_files(files, 1, 0)
m01s = extract_moment_from_files(files, 0, 1)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 733/733 [00:00<?, ?it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 733/733 [00:00<?, ?it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 733/733 [00:00<?, ?it/s]


In [7]:
x_centroids = m10s/m00s
y_centroids = m01s/m00s

In [8]:
ray.shutdown()

In [9]:
def get_central_moment(img, xcent, ycent, p, q):
    x = np.arange(img.shape[0], dtype=int)
    y = np.arange(img.shape[1], dtype=int)
    
    return np.sum(
        (x-xcent)**p * (y-ycent)**q * img
    )

def extract_central_moments_from_files(files, xcents, ycents, p, q):
    moments = []
    for i, file in tqdm(enumerate(files)):
        img = read_image(file).mean(axis=2)
        moments.append( get_central_moment(img, xcents[i], ycents[i], p, q))
    return np.array(moments)

In [10]:
u00s = extract_central_moments_from_files(files, x_centroids, y_centroids, 0, 0).reshape(-1,1)
u11s = extract_central_moments_from_files(files, x_centroids, y_centroids, 1, 1).reshape(-1,1)
u20s = extract_central_moments_from_files(files, x_centroids, y_centroids, 2, 0).reshape(-1,1)
u02s = extract_central_moments_from_files(files, x_centroids, y_centroids, 0, 2).reshape(-1,1)
u21s = extract_central_moments_from_files(files, x_centroids, y_centroids, 2, 1).reshape(-1,1)
u12s = extract_central_moments_from_files(files, x_centroids, y_centroids, 1, 2).reshape(-1,1)
u30s = extract_central_moments_from_files(files, x_centroids, y_centroids, 3, 0).reshape(-1,1)
u03s = extract_central_moments_from_files(files, x_centroids, y_centroids, 0, 3).reshape(-1,1)

733it [00:24, 30.24it/s]
733it [00:23, 30.96it/s]
733it [00:30, 23.78it/s]
733it [00:27, 26.25it/s]
733it [00:23, 31.47it/s]
733it [00:22, 32.04it/s]
733it [00:23, 31.09it/s]
733it [00:22, 32.75it/s]


In [13]:
geometric_moments = np.concatenate((u00s, u11s, u20s, u02s, u21s, u12s, u30s, u03s), axis=1)

In [14]:
norm_geometric_moments = MinMaxScaler().fit_transform(geometric_moments)

In [18]:
k=0
print(files[k])
results = np.argsort([
    euclidean(
        geometric_moments[k],
        geometric_moments[i]
    ) for i in range(geometric_moments.shape[0])
])[:15]
results

bougainvillea_00002.jpg


array([  0, 439, 614, 559, 552, 502, 335, 344, 368, 616, 453, 136, 681,
       290, 455], dtype=int64)

In [19]:
k=0
print(files[k])
results = np.argsort([
    euclidean(
        norm_geometric_moments[k],
        norm_geometric_moments[i]
    ) for i in range(geometric_moments.shape[0])
])[:15]
results

bougainvillea_00002.jpg


array([  0, 439, 211, 711, 614,  55, 686, 705, 453, 406, 411,  54, 682,
        47,  34], dtype=int64)

Through linear distance it does not seem that this feature alone is enough to distinguish the different types of flowers in the dataset

## Histogram of Gradients

In [3]:
@ray.remote
def get_hog(file):
    img = read_image(file)
    return hog(
        img, 
        channel_axis=2
    )

def extract_hog_from_files(files):
    data = []
    
    res = []
    for file in tqdm(files):
        res.append(get_hog.remote(file))
    out = ray.get(res)
    
    for feat_vec in out:
        data.append(feat_vec)
    return np.array(data)

In [4]:
ray.init()

2023-04-24 11:53:25,192	INFO worker.py:1538 -- Started a local Ray instance.


0,1
Python version:,3.9.7
Ray version:,2.2.0


In [5]:
hog_features = extract_hog_from_files(files)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 733/733 [00:00<00:00, 1800.46it/s]


In [6]:
ray.shutdown()

In [7]:
norm_hog = MinMaxScaler().fit_transform(hog_features)

In [8]:
classes = list(
    map(lambda f: '_'.join(f.split("_")[:-1]), files )
)

In [9]:
encoder = LabelEncoder().fit(classes)

In [19]:
def to_classes(seq):
    return encoder.transform([classes[i] for i in seq])

In [11]:
k=200
print(files[k])
results = np.argsort([
    euclidean(
        hog_features[k],
        hog_features[i]
    ) for i in range(hog_features.shape[0])
])[:15]
to_classes(results)

garden_roses_00052.jpg


array([2, 2, 1, 7, 2, 2, 2, 0, 0, 2, 2, 0, 0, 2, 1])

In [12]:
k=200
print(files[k])
results = np.argsort([
    euclidean(
        norm_hog[k],
        norm_hog[i]
    ) for i in range(hog_features.shape[0])
])[:15]
to_classes(results)

garden_roses_00052.jpg


array([2, 2, 1, 7, 2, 2, 2, 1, 0, 2, 2, 0, 2, 1, 2])

It seems that Histogram of gradients has gotten much better results. Similar classes have similar representations <br>

Will use this representation for flower classification and check the results

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, matthews_corrcoef

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [71]:
X_train, X_test, y_train, y_test = train_test_split(hog_features, encoder.transform(classes), random_state=123, test_size=0.2)

In [72]:
X_train.shape

(586, 72900)

In [73]:
pca = PCA(n_components=0.9, svd_solver="full").fit(X_train)

In [74]:
pca.explained_variance_ratio_.sum()

0.9004787385951388

In [75]:
pca.transform(X_train).shape

(586, 423)

In [76]:
def t(X):
    return pca.transform(X)

### Logistic Regression

In [77]:
lr = LogisticRegression(max_iter=999999).fit(t(X_train), y_train)
lr.score(t(X_train), y_train)

1.0

In [78]:
lr.score(t(X_test), y_test)

0.43537414965986393

### SVM

In [68]:
svm = SVC().fit(t(X_train), y_train)
svm.score(t(X_train), y_train)

0.9573378839590444

In [69]:
svm.score(t(X_test), y_test)

0.2585034013605442