In [1]:
from PIL import Image
import csv
import numpy as np
import cv2
from skimage.io import imread, imshow
from skimage.filters import prewitt_h, prewitt_v
from skimage.feature import hog

# Image paths and labels for the training split; filled from images/train.csv below.
train_files = []
# One feature row per training image. 4964 is the length of the vector returned
# by extract_features (3072 raw pixel values + 128 mean SIFT values + the HOG
# histogram making up the remainder).
# np.zeros instead of np.ndarray: the raw ndarray constructor hands back
# uninitialised memory, so any row that is never filled would contain garbage.
train_input = np.zeros(shape=(50000, 4964))
train_output = []

# Same three containers for the test split; filled from images/test.csv below.
test_files = []
test_input = np.zeros(shape=(10000, 4964))
test_output = []

def extract_features(file: str) -> np.ndarray:
    """Build the flat feature vector for a single image file.

    The returned 1-D float array is the concatenation, in this order, of:

    1. the raw pixel values read with skimage's ``imread``, flattened to
       ``1024*3`` integers (the image must therefore hold exactly 3072 values,
       presumably 32x32 RGB -- confirm against the dataset);
    2. the mean of all SIFT descriptors found on the grayscale image, or 128
       zeros when SIFT returns no descriptors;
    3. the OpenCV HOG histogram computed at one fixed window location.

    Args:
        file: path of the image to read.

    Returns:
        The concatenated feature vector.
    """
    # Other feature ideas are no longer part of the vector (they were
    # commented out): Prewitt edges, MOG2 background subtraction, HSV
    # foreground/background masks, Canny edges, Harris corners, skimage HOG.

    # --- 1. raw pixels (skimage reader) ---
    raw_pixels = np.asarray(imread(file), dtype=int).reshape(1024*3)

    # --- 2. SIFT summary (OpenCV readers: colour image for HOG, gray for SIFT) ---
    img = cv2.imread(file)
    gray_img = cv2.imread(file, cv2.IMREAD_GRAYSCALE)
    sift = cv2.SIFT_create()
    _, descriptors = sift.detectAndCompute(gray_img.astype("uint8"), None)
    if descriptors is not None:
        # Collapse the per-keypoint descriptors into one fixed-length vector.
        sift_summary = np.mean(descriptors, axis=0)
    else:
        sift_summary = np.zeros((128,))

    # --- 3. HOG histogram ---
    hog_descriptor = cv2.HOGDescriptor(
        (64, 64),                  # winSize
        (16, 16),                  # blockSize
        (8, 8),                    # blockStride
        (8, 8),                    # cellSize
        9,                         # nbins
        1,                         # derivAperture
        4.,                        # winSigma
        0,                         # histogramNormType
        2.0000000000000001e-01,    # L2HysThreshold
        0,                         # gammaCorrection
        64,                        # nlevels
    )
    # compute(img[, winStride[, padding[, locations]]]) -> descriptors
    hist = hog_descriptor.compute(img, (8, 8), (8, 8), ((10, 20),))

    # Flatten every part and join them; the float64 cast of the pixel block
    # keeps the result dtype the same as appending onto an empty float array.
    return np.concatenate([
        raw_pixels.astype(np.float64),
        np.ravel(sift_summary),
        np.ravel(np.asarray(hist)),
    ])

def _read_split(csv_path, image_dir):
    """Read a two-column CSV and return ``(image_paths, labels)``.

    The first row is treated as a header and skipped. For every other row the
    first column is prefixed with ``image_dir`` to form the image path and the
    second column is taken as the label. Blank lines are ignored.
    """
    paths, labels = [], []
    # newline="" is what the csv module expects for file objects it reads.
    with open(csv_path, newline="") as csv_file:
        reader = csv.reader(csv_file)
        next(reader, None)  # skip the header row (no-op on an empty file)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; row[0] would raise.
                continue
            paths.append(image_dir + row[0])
            labels.append(row[1])
    return paths, labels


split_paths, split_labels = _read_split("images/train.csv", "images/train/")
train_files.extend(split_paths)
train_output.extend(split_labels)

# Row i of train_input holds the features of train_files[i].
for (i, file) in enumerate(train_files):
    train_input[i] = extract_features(file)

split_paths, split_labels = _read_split("images/test.csv", "images/test/")
test_files.extend(split_paths)
test_output.extend(split_labels)

# Row i of test_input holds the features of test_files[i].
for (i, file) in enumerate(test_files):
    test_input[i] = extract_features(file)


In [9]:
from sklearn.svm import SVC, LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier

# Disabled experiments, summarised:
#   * a GridSearchCV over KNeighborsClassifier (n_neighbors, leaf_size, p,
#     weights, metric; scoring='accuracy', cv=5, n_jobs=2);
#   * the two alternative pipelines kept below as comments.
# pipe2 = Pipeline([("scaler", StandardScaler()), ("svm", LinearSVC(C=1))]).fit(train_input[:40000], train_output[:40000])
# pipe3 = Pipeline([("adaBoost", AdaBoostClassifier())]).fit(train_input[:5000], train_output[:5000])

# Standardise the features, then classify with distance-weighted 2-NN.
knn_steps = [
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier(n_neighbors=2, weights="distance", n_jobs=5)),
]
pipe1 = Pipeline(knn_steps)
# Fit on the first 50000 feature rows against the full label list.
pipe1.fit(train_input[:50000], train_output)

# The model used by the evaluation and prediction cells.
classifier = pipe1

In [None]:
def get_accuracy(ys, ys_pred):
    """Return the fraction of positions where ``ys`` and ``ys_pred`` agree.

    Both arguments may be lists or numpy arrays. They are converted to arrays
    first, because comparing two plain lists with ``==`` yields a single
    boolean for the whole list instead of an element-wise result.

    Args:
        ys: true labels.
        ys_pred: predicted labels, same shape as ``ys``.

    Returns:
        The accuracy as a numpy float in [0, 1] (nan for empty input).

    Raises:
        ValueError: if the two inputs do not have the same shape.
    """
    ys = np.asarray(ys)
    ys_pred = np.asarray(ys_pred)
    if ys.shape != ys_pred.shape:
        raise ValueError(
            f"shape mismatch: ys has shape {ys.shape}, ys_pred has shape {ys_pred.shape}"
        )
    # Mean of the boolean match mask == matches / total.
    return np.mean(ys == ys_pred)

# Score the last 10000 training rows. The slice starts at 40000 (not 40001) so
# that no sample is silently skipped.
# NOTE: `classifier` is pipe1, which was fit on all 50000 training rows, so this
# number is accuracy on data the model has already seen, not held-out accuracy.
get_accuracy(train_output[40000:50000], classifier.predict(train_input[40000:50000]))

In [10]:
import os

filename = "images/test"
suffix = 1
extension = ".csv"

# Pick the first unused numbered name: images/test1.csv, images/test2.csv, ...
while os.path.exists(filename+str(suffix)+extension):
    suffix += 1

filepath = filename+str(suffix)+extension

# Predict before opening the file so a failure here does not leave an empty
# output file behind.
predicted_labels = classifier.predict(test_input)[:10000]

# The file is opened once (a second 'w' open would just truncate and rewrite
# the header). newline='' is what the csv module requires for writer files;
# without it extra blank rows appear on platforms that translate "\n".
with open(filepath, 'w', newline='') as outputFile:
    fieldNames = ["im_name", "label"]
    writer = csv.DictWriter(outputFile, fieldnames=fieldNames)

    writer.writeheader()
    # zip stops at the shorter sequence, so only rows that correspond to an
    # actual test file are written.
    for test_file, label in zip(test_files, predicted_labels):
        # Paths look like "images/test/<name>"; component 2 is the image name.
        writer.writerow({"im_name": test_file.split("/")[2], "label": label})