In [283]:
#get SIFT points, cluster the data

from sklearn.cluster import k_means

def cluster_data(features, k, nr_iter=50):
    #Just a wrapper about sklearn
    centroids = k_means(features, n_clusters=k, max_iter=nr_iter)[0]
    # Return the centroids
    return centroids

# Define the number of clusters.
nr_clusters = 100


# Load the file containing the training images.
trainimages = [line.strip().split(" ")[0] for line in open("trainset-overview.txt", "r")]
print "There are", len(trainimages), "training images"

trainpoints = []

for i in xrange(len(trainimages)):   
    # Extract point locations from the image using your selected point method and parameters.
    image_name = trainimages[i]

    densePoints = sift.densePoints(image_name, stride=25)
    sigma       = 1.0
    hesPoints   = sift.computeHes(image_name, sigma, magThreshold=15, hesThreshold=10, NMSneighborhood=10)
    harPoints   = sift.computeHar(image_name, sigma, magThreshold=5, NMSneighborhood=10)

    # Compute the SIFT features.
    allpoints = np.concatenate((densepoints, hespoints, harpoints))
    
    point1, sift1 = sift.computeSIFTofPoints(image_name, allpoints, sigma, nrOrientBins=8, nrSpatBins=4, nrPixPerBin=4)
    
    trainpoints.append(sift1)

trainsift = np.vstack(trainpoints)
    
    
# Cluster the SIFT features and put them in a matrix with the name 'clusters'!

print "Clustering..."
clusters = cluster_data(trainsift, nr_clusters)
print clusters.shape

There are 1000 training images
Clustering...
(100, 128)


In [284]:
#make bag-of-words histogram of all the images

from sklearn.preprocessing import normalize

# Length of histogram vector.
size_of_histograms = 50

train_feat = np.zeros((len(trainimages), size_of_histograms))

# Go through the SIFTs of every image and create a histogram for the image
# relative to the clusters you discovered in the previous phase.

def euclidean_distance(x, y):
    assert(len(x) == len(y))
    d = 0.0
    
    nr_dimensions = len(x)
        
    d = np.sum((x-y)**2)
    
    d = np.sqrt(d)
    
    return d

def distances(a,X,distance_fn=euclidean_distance):
    dists = np.zeros(X.shape[0])
    
    for i,v in enumerate(X): 
        dstnc = distance_fn(a,v)  
        dists[i] = dstnc
            
    return dists

def cluster_assignment(samples, clusters):
    nr_samples = samples.shape[0]
    
    nr_clusters = clusters.shape[0]
    
    assignments  = np.zeros(nr_samples, dtype=int)
    
    for i in range(samples.shape[0]):        
        assignments[i] = np.argmin(distances(samples[i], clusters))

    
    return assignments


def create_histogram(samples, clusters):
    # Perform the assignments first.
    assignments = cluster_assignment(samples,clusters)
    
    # Initialize the histogram.
    histogram   = np.zeros(clusters.shape[0], dtype=np.float)
    
    # Go over all the assignments and place them in the correct histogram bin.     
    for assignment in assignments: 
        histogram[assignment] +=1
   
    # Normalize the histogram such that the sum of the bins is equal to 1.
    histogram = normalize(histogram.reshape(1,-1), norm='l1')[0]
    
    return histogram


def images_histogram(list_of_images, clusters):

    image_sifts = []

    #len(list_of_images)
    for i in xrange(len(list_of_images)):
    
        image_name = list_of_images[i]
        
        densePoints = sift.densePoints(image_name, stride=5)

        sigma       = 1.0
        hesPoints   = sift.computeHes(image_name, sigma, magThreshold=15, hesThreshold=10, NMSneighborhood=10)

        harPoints   = sift.computeHar(image_name, sigma, magThreshold=10, NMSneighborhood=10)

        allpoints = np.concatenate((densepoints, hespoints, harpoints))
        
        point1, sift1 = sift.computeSIFTofPoints(image_name, allpoints, sigma, nrOrientBins=8, nrSpatBins=4, nrPixPerBin=4) 
        
        image_cluster_sifts = create_histogram(sift1, clusters)
        
        image_sifts.append(image_cluster_sifts)

    image_sifts = np.vstack(image_sifts)
    
    return image_sifts

image_sifts = images_histogram(trainimages, clusters)

print image_sifts.shape

(1000, 100)


In [285]:
#next few cells: train the classifier

from sklearn.svm import LinearSVC

#Training ground truth labels
train_labels = np.array([int(line.strip().split(" ")[1]) for line in open("trainset-overview.txt", "r")])

#Validation images
valimages = [line.split(' ')[0] for line in open('valset-overview.txt','r')]

#Validation ground truth labels
val_labels = np.array([int(line.rstrip().split(' ')[1]) for line in open('valset-overview.txt','r')])

#To do by you:
#Calculate the histogram representations for the validation images

histo_val_img = images_histogram(valimages, clusters)

In [286]:
#SVM classifier: find best C value

import numpy as np

def validate_svm_parameter(trainx, trainy, valx, valy):
    cs = np.logspace(-2,+2,5)
    bestc = 0
    scores = []
    
    # To do by you.
    
    for element in range(0,len(cs)):
        clf = LinearSVC(C = cs[element])
        clf.fit(trainx, trainy)
        trainx.shape, trainy.shape
        score = clf.score(valx, valy)
        scores.append(score)
    
    max_val = np.argmax(scores)
    bestc = cs[max_val]
    
    # Return best c value.
    return bestc
        
# Call the function.
bestc = validate_svm_parameter(image_sifts, train_labels, histo_val_img, val_labels)

print bestc

10.0


In [287]:
#SVM classifier: train classifier, find accuracy (score)

clf = LinearSVC(C=10)
clf.fit(image_sifts, train_labels)

print clf 


score = clf.score(histo_val_img, val_labels)

print score

LinearSVC(C=10, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
0.37


In [288]:
#KNN classifier: Euclidean distance, then predict labels of validation set

from collections import Counter

def knn(data_point,train_data,train_label,k):

    d = distances(data_point, train_data)
    labels = train_label[np.argsort(d)[0:k]]
    prediction = Counter(labels).most_common()[0][0]
    return prediction

predicted_labels = []

for i in xrange(len(histo_val_img)):
    predicted_labels.append(knn(histo_val_img[i], image_sifts, train_labels, 50))

print predicted_labels


[6, 0, 0, 0, 5, 0, 0, 9, 5, 6, 0, 0, 3, 0, 0, 6, 5, 0, 6, 6, 9, 1, 7, 3, 2, 8, 1, 7, 7, 1, 4, 6, 3, 8, 1, 4, 1, 4, 1, 7, 2, 4, 8, 4, 7, 5, 1, 6, 9, 4, 4, 2, 4, 4, 9, 4, 7, 3, 7, 2, 3, 3, 7, 1, 3, 3, 7, 6, 3, 4, 4, 4, 4, 7, 6, 3, 8, 3, 3, 1, 4, 4, 7, 1, 3, 3, 4, 7, 4, 4, 3, 4, 3, 3, 4, 7, 6, 6, 1, 2, 5, 9, 0, 6, 5, 5, 3, 6, 5, 5, 6, 6, 0, 7, 3, 5, 6, 3, 9, 0, 6, 8, 6, 2, 8, 6, 6, 3, 6, 1, 0, 3, 4, 1, 6, 5, 1, 1, 0, 5, 1, 9, 7, 3, 7, 7, 9, 7, 1, 7, 7, 7, 1, 3, 7, 1, 4, 7, 7, 9, 3, 4, 4, 1, 3, 4, 4, 1, 7, 8, 4, 4, 8, 9, 3, 2, 3, 8, 3, 4, 9, 9, 1, 9, 9, 9, 5, 3, 7, 9, 5, 5, 9, 9, 6, 9, 7, 9, 3, 9]


In [289]:
#Decision tree classifier, and validation prediction/ score. Call all tree/ forest/ ada classifiers from sklearn

from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier


dtree = DecisionTreeClassifier(max_depth=20, max_features=100)
dtree = dtree.fit(image_sifts, train_labels)

print dtree

decisionscore = dtree.score(histo_val_img, val_labels) # score: 0.245

print decisionscore

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=20,
            max_features=100, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
0.255


In [290]:
#Random forest classifier, r.forest prediction

rtree = RandomForestClassifier(max_depth=20, n_estimators=300)
rtree = rtree.fit(image_sifts, train_labels)

print rtree

randomscore = rtree.score(histo_val_img, val_labels) #score: .4

print randomscore

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
0.365


In [291]:
#adaboost classifier, adaboost prediction

ada = AdaBoostClassifier(learning_rate=0.25, n_estimators=1000)
ada = ada.fit(image_sifts, train_labels)

print ada

score = ada.score(histo_val_img, val_labels)

print score

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.25, n_estimators=1000, random_state=None)
0.26


<h3>Phase 5: Predicting test images and uploading the results</h3>
<p>
Once you have a model that works well on the validation set, you can submit your predictions on the test set. We provide you with the image filename list as well as code to save your predictions as a csv so you can submit to Kaggle. 

In [292]:
#Predict test images; upload the results

testimages = [line.strip().split(' ')[0] for line in open('testset-overview-final.txt','r')]

test_img_histo = images_histogram(testimages, clusters)

#Use classifier to make class predictions:
test_predictions = np.zeros(len(testimages))

In [293]:
#KNN test predictions
for i in xrange(len(test_img_histo)):
    test_predictions[i] = (knn(test_img_histo[i], image_sifts, train_labels, 50))

In [295]:
#Decision tree test predictions
test_predictions = dtree.predict(test_img_histo)

In [297]:
#RandomForest test predictions
test_predictions = rtree.predict(test_img_histo)

In [299]:
#AdaBoost test predictions
test_predictions = ada.predict(test_img_histo)


In [300]:
#SVM test predictions
cs = np.logspace(-2,+2,5)

for element in range(0,len(cs)):
        clf = LinearSVC(C = cs[element])
        clf.fit(image_sifts, train_labels)
        image_sifts.shape, train_labels.shape
        test_predictions = clf.predict(test_img_histo)
   
test_predictions = clf.predict(test_img_histo)
    
print test_predictions

#We save your predictions to file
test_p_file = open('test_predictions_bad.csv','w')
test_p_file.write('ImageName,Prediction\n')
for i,image in enumerate(testimages):
    test_p_file.write(image+','+str(int(test_predictions[i]))+'\n')
test_p_file.close()

[9 9 6 1 3 0 4 4 1 4 2 4 6 0 6 0 1 9 0 7 2 0 9 9 2 0 1 5 4 1 5 9 9 2 8 5 0
 4 8 2 5 5 1 1 8 0 2 0 2 1 6 6 4 3 5 8 9 5 4 7 4 3 2 3 3 5 0 1 5 6 1 1 5 7
 0 9 7 4 2 9 3 3 2 4 9 1 9 2 6 2 9 9 2 4 2 9 0 0 8 1 4 5 1 3 5 2 6 8 2 6 4
 9 8 2 6 9 7 0 3 5 1 2 1 7 5 5 0 5 0 1 9 3 3 0 9 2 0 2 0 9 0 7 7 5 2 5 2 7
 7 0 8 6 2 7 0 4 4 2 1 0 4 3 4 9 1 3 0 7 1 5 2 1 6 2 1 7 2 0 4 8 0 3 6 3 0
 4 0 1 1 7 3 6 2 4 0 8 0 6 4 8 5 2 3 9 8 4 7 0 8 2 5 8 2 0 5 9 4 6 9 9 9 1
 8 5 1 1 0 3 0 8 0 4 1 1 0 6 1 3 6 4 7 2 3 6 6 3 1 5 9 4 1 9 6 4 1 5 2 5 1
 1 9 8 5 6 9 9 3 5 8 7 3 1 3 4 2 3 4 0 0 4 0 0 7 2 5 4 4 0 1 7 3 2 0 0 5 1
 4 4 5 3 1 5 9 3 4 4 2 2 7 9 2 5 6 0 2 9 9 2 0 3 9 0 7 1 1 0 1 9 4 0 6 9 7
 0 0 2 5 9 6 3 1 1 3 1 0 5 3 0 1 4 0 9 8 1 3 9 5 4 1 2 0 6 4 2 7 1 6 0 4 4
 0 0 6 4 1 0 2 9 5 8 0 1 0 3 6 1 5 9 4 7 1 4 3 5 9 0 0 4 9 1 5 0 2 6 2 1 4
 6 2 5 6 2 4 0 8 3 6 1 4 8 4 6 4 1 2 9 5 6 1 9 6 8 2 3 6 4 1 5 2 3 0 4 3 0
 7 5 9 2 4 8 2 2 1 2 5 0 2 1 1 6 4 8 2 6 2 2 9 9 4 8 9 8 6 9 2 4 0 0 1 4 9
 9 9 5 2 4 0 0 7 2 5 6 4 

Your predictions have been saved to a file called 'test_predictions.csv'. You can upload this through the Kaggle interface to make your submission.
<p><h3>Want to do better?</h3>
In order to get the best performance, you will  need to tune the parameters of SIFT, of the bag-of-words (number of clusters), and of your classifier.
<br>
<br>
If you think that you've exhausted these parameters but still want to perform better, you can look at a number of things:
<ul>
<li>What if I use other distance functions in my knn classifiers? Look e.g. at: the histogram intersection distance or Manhattan distance online.</li>
<li>Other classifiers: You are allowed to use other classifiers from sklearn (e.g. SVMs, Random Forests, AdaBoost, ...). You will need to figure out how these functions work and how to set the parameters (using the validation images). You should also be prepared to provide the motivation for any decisions in your report.</li>
<li>What about better bag-of-words representations (e.g. VLAD or Fisher Vector)?</li>
<li>What about deep networks?</li>
<li>Your own weird idea. If you have some idea in your head, try it out! We welcome those ideas. They do not even have to give a  performance boost, as long as you can explain the reasons behind your idea.</li>
</ul>