# reference:
https://pyimagesearch.com/2016/09/12/softmax-classifiers-explained/

In [3]:
import cv2
import numpy as np
import os

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from imutils import paths


In [4]:
def extract_color_histogram(image, bins=(8, 8, 8)):
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    hist = cv2.calcHist([hsv], [0, 1, 2], None, bins, [0, 180, 0, 256, 0, 256])
    hist = cv2.normalize(hist, hist)
    
    return hist.flatten()

In [9]:
dir = '../datasets/dog-vs-cat/train/'
image_paths = os.listdir(dir)
data = []
labels = []
verbose = 1000
total = len(image_paths)

for i, image_path in enumerate(image_paths):
    image = cv2.imread(dir + image_path)
    label = image_path.split('.')[0]
    labels.append(label)
    
    hist = extract_color_histogram(image)
    data.append(hist)
    
    if (i > 0 and i % verbose == 0) or (i == total - 1):
        print(f'[INFO] processed {i+1}/{total}')
    

[INFO] processed 1001/25000
[INFO] processed 2001/25000
[INFO] processed 3001/25000
[INFO] processed 4001/25000
[INFO] processed 5001/25000
[INFO] processed 6001/25000
[INFO] processed 7001/25000
[INFO] processed 8001/25000
[INFO] processed 9001/25000
[INFO] processed 10001/25000
[INFO] processed 11001/25000
[INFO] processed 12001/25000
[INFO] processed 13001/25000
[INFO] processed 14001/25000
[INFO] processed 15001/25000
[INFO] processed 16001/25000
[INFO] processed 17001/25000
[INFO] processed 18001/25000
[INFO] processed 19001/25000
[INFO] processed 20001/25000
[INFO] processed 21001/25000
[INFO] processed 22001/25000
[INFO] processed 23001/25000
[INFO] processed 24001/25000
[INFO] processed 25000/25000


In [11]:
lbl_encoder = LabelEncoder()
labels = lbl_encoder.fit_transform(labels)

X_train, X_valid, y_train, y_valid = train_test_split(np.array(data), labels,  test_size=0.3, random_state=42)

In [12]:
model = SGDClassifier(loss='log', max_iter=10, verbose=10)
model.fit(X_train, y_train)

predict = model.predict(X_valid)
classification_report(y_valid, predict, target_names=lbl_encoder.classes_)

-- Epoch 1
Norm: 13.65, NNZs: 512, Bias: -0.533647, T: 17500, Avg. loss: 0.954380
Total training time: 0.03 seconds.
-- Epoch 2
Norm: 11.63, NNZs: 512, Bias: -0.484599, T: 35000, Avg. loss: 0.664343
Total training time: 0.06 seconds.
-- Epoch 3
Norm: 11.51, NNZs: 512, Bias: -0.797284, T: 52500, Avg. loss: 0.645353
Total training time: 0.09 seconds.
-- Epoch 4
Norm: 11.36, NNZs: 512, Bias: -0.433851, T: 70000, Avg. loss: 0.635855
Total training time: 0.12 seconds.
-- Epoch 5
Norm: 11.10, NNZs: 512, Bias: -0.569512, T: 87500, Avg. loss: 0.630767
Total training time: 0.16 seconds.
-- Epoch 6
Norm: 11.13, NNZs: 512, Bias: -0.496802, T: 105000, Avg. loss: 0.627799
Total training time: 0.19 seconds.
-- Epoch 7
Norm: 11.14, NNZs: 512, Bias: -0.368682, T: 122500, Avg. loss: 0.626257
Total training time: 0.22 seconds.
-- Epoch 8
Norm: 11.14, NNZs: 512, Bias: -0.238920, T: 140000, Avg. loss: 0.624563
Total training time: 0.25 seconds.
-- Epoch 9
Norm: 11.16, NNZs: 512, Bias: -0.355309, T: 157500



'              precision    recall  f1-score   support\n\n         cat       0.70      0.37      0.49      3711\n         dog       0.58      0.84      0.69      3789\n\n    accuracy                           0.61      7500\n   macro avg       0.64      0.61      0.59      7500\nweighted avg       0.64      0.61      0.59      7500\n'