In [19]:
import os
import re
import random
import itertools

import tensorflow as tf
import tensorflow.python.platform
from tensorflow.python.platform import gfile
import numpy as np
import pandas as pd
import sklearn
from sklearn import metrics
from sklearn import model_selection
import sklearn.linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.svm import SVC, LinearSVC
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
import scipy.linalg

In this exercise, you are going to train support vector machines (SVMs) using scikit-learn and the data competition project dataset. You will consider here all classes in the dataset. You may work on this exercise on your own computer first. Note, however, that you need AWS to run the experiments for the last two parts of this exercise.

### Warm-up

- In a one-vs-one fashion, for each pair of classes, train a linear SVM classifier using scikit-learn's function `LinearSVC`, with the default value for the regularization parameter. Compute the multi-class misclassification error obtained using these classifiers trained in a one-vs-one fashion.

In [39]:
def list_files(dir):
    """Return the names of all sub-directories found under ``dir``.

    Despite its name, this function collects *directory* names, not file
    names. The tree is walked recursively with ``os.walk``, so nested
    sub-directories are included as well; only the bare names (not full
    paths) are returned, in the order ``os.walk`` yields them.

    Parameters
    ----------
    dir : str
        Root directory to walk.

    Returns
    -------
    list of str
        Names of every directory encountered below ``dir``.
    """
    return [name for _, subdirs, _ in os.walk(dir) for name in subdirs]

def list_directories(folders, base_dir='/Users/jingyany/Desktop/17Spring/Data558/data competation/train/'):
    """Build the image-directory path for each class folder name.

    Parameters
    ----------
    folders : iterable of str
        Class folder names (e.g. ``'001.Black_footed_Albatross'``).
    base_dir : str, optional
        Directory that contains the class folders. Defaults to the training
        directory that was previously hard-coded in the function body, so
        existing calls behave exactly as before.

    Returns
    -------
    list of str
        One path per folder, each of the form ``<base_dir>/<folder>/``
        (always terminated by ``'/'``).
    """
    # '/' is used explicitly instead of os.path.join: the rest of the notebook
    # appends file names directly to these strings and splits paths on '/',
    # so both the separator and the trailing slash must be preserved.
    if base_dir and not base_dir.endswith('/'):
        prefix = base_dir + '/'
    else:
        prefix = base_dir
    return [prefix + name + '/' for name in folders]

def images_lists(images_dirs):
    """Collect the paths of all ``.jpg`` images inside the given directories.

    Parameters
    ----------
    images_dirs : iterable of str
        Directory paths. Each must end with a path separator, because file
        names are appended to it by plain string concatenation.

    Returns
    -------
    list of str
        Image paths, grouped by directory in the order the directories were
        given and sorted by file name within each directory.
    """
    list_images = []
    for images_dir in images_dirs:
        # os.listdir returns entries in arbitrary order; sorting makes the
        # sample order (and therefore any seeded train/test split that is
        # computed from it) reproducible across machines and runs.
        for f in sorted(os.listdir(images_dir)):
            # Match on the extension only. The previous substring search for
            # 'jpg|JPG' also accepted names such as 'jpg_notes.txt'.
            if f.lower().endswith('.jpg'):
                list_images.append(images_dir + f)
    return list_images

In [22]:
# Directory containing the pre-trained graph file
# ('classify_image_graph_def.pb') that create_graph() loads.
model_dir = '/Users/jingyany/models/tutorials/image/imagenet/TUTORIAL_DIR/imagenet'
# Root of the training set, with one sub-directory per class.
# NOTE: this name shadows the built-in dir(); it is kept unchanged because
# other cells may refer to it.
dir = '/Users/jingyany/Desktop/17Spring/Data558/data competation/train'
folders = list_files(dir)
# Every unordered pair of class folders, as required for one-vs-one training.
classes_pairs_tuple = list(itertools.combinations(folders, 2))
# Same pairs as 2-element lists. This line previously iterated over the
# undefined name `comb_pairs_tuple`, which raises NameError on a fresh kernel.
classes_pairs_lists = [list(elem) for elem in classes_pairs_tuple]

In [73]:
# Number of unordered class pairs, i.e. how many one-vs-one classifiers
# would have to be trained.
len(classes_pairs_lists)

10296

In [35]:
def list_directories_temp(folders):
    """Return the image-directory path for each class folder name.

    This function used to be a line-for-line copy of ``list_directories``.
    It now simply delegates to that function so the path-building logic
    lives in one place; the name is kept so existing calls keep working.

    Parameters
    ----------
    folders : sequence of str
        Class folder names.

    Returns
    -------
    list of str
        One directory path (terminated by ``'/'``) per folder name.
    """
    return list_directories(folders)

In [44]:
# Trial run on a single class pair before looping over all pairs.
# NOTE(review): index 7 looks like an arbitrary choice — confirm.
tem_pair = classes_pairs_lists[7]
# Directory path of each of the two classes in the pair.
temp_dir_list = list_directories(tem_pair)
# Image file paths collected from both class directories.
pairs_imagelists = images_lists(temp_dir_list)

In [68]:
def create_graph():
    """Load the serialized graph from ``model_dir`` into the default TF graph.

    Reads ``classify_image_graph_def.pb`` from the module-level ``model_dir``
    directory, parses it into a ``GraphDef`` and imports it with an empty
    name prefix, so tensors keep their original names (e.g. ``'pool_3:0'``).
    """
    graph_path = os.path.join(model_dir, 'classify_image_graph_def.pb')
    with gfile.FastGFile(graph_path, 'rb') as f:
        serialized_graph = f.read()

    graph_def = tf.GraphDef()
    graph_def.ParseFromString(serialized_graph)
    tf.import_graph_def(graph_def, name='')
        
def extract_features(list_images):
    """Extract ``pool_3`` features and integer class labels for a list of images.

    Each image is fed through the graph loaded by ``create_graph`` and the
    output of the ``'pool_3:0'`` tensor is stored as its feature vector.
    The label is parsed from the name of the image's parent directory: the
    part before the first ``'.'`` is converted to ``int`` (for example
    ``'.../001.Black_footed_Albatross/img.jpg'`` gives label ``1``).

    Parameters
    ----------
    list_images : list of str
        Paths of JPEG files, using ``'/'`` as the path separator.

    Returns
    -------
    features : numpy.ndarray
        Array of shape ``(len(list_images), 2048)``, one row per image.
    labels : list of int
        Class id of each image, in the same order as ``list_images``.
    """
    nb_features = 2048
    features = np.empty((len(list_images), nb_features))
    labels = []

    create_graph()

    # The whole loop has to run inside the session context: leaving the
    # `with` block closes the session, and sess.run() on a closed session
    # fails. Previously the loop was placed after the block.
    with tf.Session() as sess:
        next_to_last_tensor = sess.graph.get_tensor_by_name('pool_3:0')

        for ind, image in enumerate(list_images):
            print('Processing %s...' % (image))
            if not gfile.Exists(image):
                tf.logging.fatal('File does not exist %s', image)

            # Use a context manager so the file handle is closed right away
            # instead of being leaked once per image.
            with gfile.FastGFile(image, 'rb') as image_file:
                image_data = image_file.read()
            predictions = sess.run(next_to_last_tensor, {'DecodeJpeg/contents:0': image_data})
            features[ind, :] = np.squeeze(predictions)
            # Parent directory name -> text before the first '.' -> class id.
            # Raw string avoids the invalid-escape warning for '\d'.
            labels.append(re.split(r'_\d+', image.split('/')[-2].split('.')[0])[0])

    labels = list(map(int, labels))

    return features, labels

def redefine_lables(labels):
    """Overwrite labels in place with +1 / -1 in two consecutive blocks.

    The first block (up to 30 leading entries) is set to ``1``; the same
    number of entries starting at index 30 is then set to ``-1``. Entries
    beyond those two blocks are left untouched.

    Parameters
    ----------
    labels : array-like exposing ``.shape`` on slices (e.g. numpy.ndarray)
        Modified in place. An ``IndexError`` is raised if it is too short
        to hold the second block.

    Returns
    -------
    The same ``labels`` object that was passed in.
    """
    block_len = labels[0:30].shape[0]
    for pos in range(block_len):
        labels[pos] = 1
    for pos in range(30, 30 + block_len):
        labels[pos] = -1

    return labels

In [75]:
# Run feature extraction on the images of the selected class pair; prints one
# 'Processing ...' line per image.
features1,labels1 = extract_features(pairs_imagelists)

Processing /Users/jingyany/Desktop/17Spring/Data558/data competation/train/001.Black_footed_Albatross/Black_Footed_Albatross_0003_796136.jpg...
Processing /Users/jingyany/Desktop/17Spring/Data558/data competation/train/001.Black_footed_Albatross/Black_Footed_Albatross_0005_796090.jpg...
Processing /Users/jingyany/Desktop/17Spring/Data558/data competation/train/001.Black_footed_Albatross/Black_Footed_Albatross_0009_34.jpg...
Processing /Users/jingyany/Desktop/17Spring/Data558/data competation/train/001.Black_footed_Albatross/Black_Footed_Albatross_0014_89.jpg...
Processing /Users/jingyany/Desktop/17Spring/Data558/data competation/train/001.Black_footed_Albatross/Black_Footed_Albatross_0023_796059.jpg...
Processing /Users/jingyany/Desktop/17Spring/Data558/data competation/train/001.Black_footed_Albatross/Black_Footed_Albatross_0024_796089.jpg...
Processing /Users/jingyany/Desktop/17Spring/Data558/data competation/train/001.Black_footed_Albatross/Black_Footed_Albatross_0025_796057.jpg...


In [76]:
# Convert the list of integer class ids to a NumPy array.
labels1 = np.asarray(labels1)
# Relabelling to +1/-1 is left disabled, so the original class ids are used
# as targets.
#labels1 = redefine_lables(labels1)

In [77]:
# Hold out 20% of the pair's samples for testing; the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(features1, labels1, test_size=0.2, random_state=42)

In [78]:
# Fit a linear SVM on the training split using LinearSVC's default parameters.
svm1 = LinearSVC().fit(X_train, y_train)

In [80]:
# Predict the class id of each held-out sample and display the predictions.
y_pred = svm1.predict(X_test)
y_pred

array([ 1,  1, 25, 25,  1, 25, 25, 25,  1, 25, 25, 25])

In [81]:
# Display the true held-out labels for visual comparison with y_pred.
y_test

array([ 1,  1, 25, 25,  1, 25, 25, 25,  1, 25, 25, 25])

- In a one-vs-rest fashion, for each class, train a linear SVM classifier using scikit-learn's function `LinearSVC`, with the default value for $\lambda_c$. Compute the multi-class misclassification error obtained using these classifiers trained in a one-vs-rest fashion.

- Using the option `multi_class='crammer_singer'` in scikit-learn's function `LinearSVC`, train a multi-class linear SVM classifier using the default value for the regularization parameter. Compute the multi-class misclassification error obtained using this multi-class linear SVM classifier.

### Linear SVMs for multi-class classification

- Redo all questions above now tuning the regularization parameters using cross-validation.

### Kernel SVMs for multi-class classification

- Redo all questions above now using the polynomial kernel of order 2 (and tuning the regularization parameters using cross-validation).