In [10]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression as logreg
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn.svm import LinearSVC as linmach
from sklearn.ensemble import BaggingClassifier as bagg
from sklearn.ensemble import RandomForestClassifier as forest
from sklearn.ensemble import StackingClassifier as stack
import os
import cv2 as cv

In [46]:
path = '/home/nikolai/Downloads/train'
raw_data = os.listdir(path)
data = sorted(raw_data)
data.pop(0)

'.DS_Store'

In [12]:
def extract_histogram(image):
    imag = cv.imread(image)
    im = cv.calcHist([imag], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
    im_final = np.reshape(im, [512, 1])
    cv.normalize(im_final, im_final)
    return im_final.flatten()

In [13]:
hist_dict = {}
for each in data:
    p = path + '/' + each
    hist = extract_histogram(p)
    hist_dict[each] = hist

In [14]:
cat_label = {}
dog_label = {}
for each in data:
    if 'cat' in each:
        cat_label[each] = 0
    elif 'dog' in each:
        dog_label[each] = 1

In [24]:
train_data = pd.DataFrame(hist_dict)
train_data = train_data.transpose()

label_ser = pd.Series(cat_label, name = 'label')
dog_ser = pd.Series(dog_label, name = 'label')
label_ser = label_ser.append(dog_ser)
train_data = train_data.merge(label_ser, right_index = True, left_index = True)
train_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,503,504,505,506,507,508,509,510,511,label
cat.0.jpg,0.242645,0.073122,0.000232,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.087363,...,0.000000,0.0,0.000000,0.0000,0.00000,0.000000,0.000000,0.000000,0.000000,0
cat.1.jpg,0.418818,0.126781,0.000085,0.000000,0.000000,0.0,0.0,0.0,0.005085,0.076441,...,0.000000,0.0,0.000000,0.0000,0.00000,0.000000,0.000000,0.000042,0.000000,0
cat.10.jpg,0.356737,0.184168,0.020616,0.001619,0.000717,0.0,0.0,0.0,0.000041,0.027440,...,0.000000,0.0,0.000000,0.0000,0.00000,0.000000,0.000000,0.000000,0.007726,0
cat.100.jpg,0.001266,0.019995,0.011941,0.001583,0.000158,0.0,0.0,0.0,0.000000,0.002145,...,0.002233,0.0,0.000000,0.0000,0.00000,0.000000,0.000000,0.023354,0.497077,0
cat.101.jpg,0.000000,0.000105,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000366,...,0.000000,0.0,0.002719,0.7748,0.34338,0.038583,0.114442,0.126833,0.008469,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
dog.95.jpg,0.134753,0.010071,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.004053,0.041960,...,0.005025,0.0,0.000000,0.0000,0.00000,0.000000,0.000000,0.012636,0.010754,1
dog.96.jpg,0.122335,0.165961,0.041414,0.016499,0.000102,0.0,0.0,0.0,0.000991,0.147656,...,0.000000,0.0,0.000000,0.0000,0.00000,0.000000,0.000000,0.000000,0.000000,1
dog.97.jpg,0.081336,0.033835,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.006635,...,0.002249,0.0,0.000000,0.0000,0.00000,0.000000,0.000000,0.004499,0.228866,1
dog.98.jpg,0.598071,0.090467,0.006297,0.000019,0.000000,0.0,0.0,0.0,0.000019,0.037595,...,0.000190,0.0,0.000000,0.0000,0.00000,0.000000,0.000000,0.000000,0.010350,1


In [32]:
x_train = train_data.drop(['label'], axis = 1)
y_train = train_data.label

In [33]:
svc_clf = linmach(C = 1.64, random_state = 414)
train_svc = svc_clf.fit(x_train, y_train)

In [34]:
bagg_clf = bagg(base_estimator = dtc(criterion = 'entropy', min_samples_leaf = 10, max_leaf_nodes = 20, random_state = 414), n_estimators = 20, random_state = 414)
train_bagg = bagg_clf.fit(x_train, y_train)

In [35]:
rand_for_clf = forest(n_estimators = 20, criterion = 'entropy', min_samples_leaf = 10, max_leaf_nodes = 20, random_state = 414)
train_forest = rand_for_clf.fit(x_train, y_train)

In [36]:
log_clf = logreg(solver = 'lbfgs', random_state = 414)
train_log = log_clf.fit(x_train, y_train)

In [60]:
estims = [train_svc, train_bagg, train_forest]
estims2 = [('svc', svc_clf), ('bagg' ,bagg_clf), ('forest', rand_for_clf)]
stack_clf = stack(estimators = estims2 , final_estimator = log_clf, cv = 2)

In [61]:
res = stack_clf.fit(x_train, y_train)
res

StackingClassifier(cv=2,
                   estimators=[('svc', LinearSVC(C=1.64, random_state=414)),
                               ('bagg',
                                BaggingClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy',
                                                                                        max_leaf_nodes=20,
                                                                                        min_samples_leaf=10,
                                                                                        random_state=414),
                                                  n_estimators=20,
                                                  random_state=414)),
                               ('forest',
                                RandomForestClassifier(criterion='entropy',
                                                       max_leaf_nodes=20,
                                                       min_samples_leaf=10,
                     

In [63]:
accuracy = res.score(x_train, y_train)
accuracy

0.849

In [67]:
path_test = '/home/nikolai/Downloads/test'
data_test = os.listdir(path_test)
data_test = sorted(data_test)
data_test.pop(0)
data_test.pop(0)

'.ipynb_checkpoints'

In [68]:
test_dict = {}
for each in data_test:
    p = path_test + '/' + each
    im = extract_histogram(p)
    test_dict[each] = im

In [69]:
cat_test = {}
dog_test = {}
for each in data_test:
    if 'cat' in each:
        cat_test[each] = 0
    elif 'dog' in each:
        dog_test[each] = 1

In [70]:
label_test = pd.Series(cat_test, name = 'label')
dogggg = pd.Series(dog_test, name = 'label')
label_test = label_test.append(dogggg)

In [73]:
df_test = pd.DataFrame(test_dict)
df_test = df_test.transpose()
df_test = df_test.merge(label_test, right_index = True, left_index = True)

In [74]:
name_1 = 'cat.1042.jpg'
name_2 = 'cat.1019.jpg'
name_3 = 'dog.1019.jpg'
name_4 = 'dog.1012.jpg'

In [76]:
df_to_test = pd.DataFrame([df_test.loc[name_1], df_test.loc[name_2], df_test.loc[name_3], df_test.loc[name_4]])
df_to_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,503,504,505,506,507,508,509,510,511,label
cat.1042.jpg,0.288885,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000232,0.000348,...,0.000155,0.0,0.0,0.0,0.0,0.007069,0.025339,0.067441,0.816325,0.0
cat.1019.jpg,0.018413,0.022763,0.000551,0.0,0.0,0.0,0.0,0.0,0.000613,0.017218,...,0.000827,0.0,0.0,0.0,0.0,0.000184,0.037347,0.011734,0.036795,0.0
dog.1019.jpg,0.293502,0.003384,0.001184,0.001988,0.001311,0.0,0.0,0.0,0.000296,0.002073,...,0.000761,0.0,0.0,0.0,0.0,0.0,0.0,0.007191,0.047754,1.0
dog.1012.jpg,0.561123,0.209176,0.02457,0.032206,0.057108,0.049472,0.004648,0.0,0.012617,0.187594,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015605,1.0


In [82]:
x_to_test = df_to_test.drop(['label'], axis = 1)

In [84]:
proba = res.predict_proba(x_to_test)
proba

array([[0.41251386, 0.58748614],
       [0.53194303, 0.46805697],
       [0.27992543, 0.72007457],
       [0.31244582, 0.68755418]])