In [None]:
import tensorflow as tf
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
from sklearn.metrics import f1_score
from sklearn.cluster import MiniBatchKMeans
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

In [None]:
def get_data(features_set, labels_set, group_element_num=100):
    """Group samples by label and cut every group into fixed-size chunks.

    Samples whose labels compare equal (after conversion to a tuple) are
    gathered in input order and split into consecutive chunks of at most
    ``group_element_num`` samples. The last chunk of a label holds whatever
    remains, so it may be shorter than the others.

    Args:
        features_set: indexable collection of per-sample feature vectors.
        labels_set: iterable of per-sample labels; each label must be
            convertible with ``tuple(label)``.
        group_element_num: maximum number of samples per chunk (default 100).

    Returns:
        ``(img_features, labels, indexes)``, shuffled in unison with
        ``sklearn.utils.shuffle``:
          * img_features[i] -- array with the feature vectors of chunk i
          * labels[i]       -- the label shared by every sample of chunk i
          * indexes[i]      -- positions in ``features_set`` of chunk i's samples

    Raises:
        ValueError: if ``group_element_num`` is smaller than 1.
    """
    if group_element_num < 1:
        raise ValueError("group_element_num must be a positive integer")

    # Positions of the samples carrying each distinct label, in input order.
    label_to_indexes = {}
    for index, label in enumerate(labels_set):
        label_to_indexes.setdefault(tuple(label), []).append(index)

    labels = []
    img_features = []
    indexes = []
    for label, members in label_to_indexes.items():
        # Stepping through range() gives ceil(len / size) chunks without any
        # division, so this behaves the same on Python 2 and Python 3.
        for start in range(0, len(members), group_element_num):
            chunk = members[start:start + group_element_num]
            labels.append(label)
            img_features.append(np.asarray([features_set[i] for i in chunk]))
            indexes.append(np.asarray(chunk))

    return shuffle(_pack_groups(img_features),
                   np.asarray(labels),
                   _pack_groups(indexes))


def _pack_groups(groups):
    """Pack per-chunk arrays into one array, tolerating chunks of unequal shape."""
    if len({np.shape(group) for group in groups}) <= 1:
        # Equal shapes (or no chunks at all) stack into a regular ndarray.
        return np.asarray(groups)
    # Unequal chunk lengths: recent NumPy releases refuse to build a ragged
    # array implicitly, so fill a 1-D object array element by element.
    packed = np.empty(len(groups), dtype=object)
    for position, group in enumerate(groups):
        packed[position] = group
    return packed

In [None]:
def get_biz_features(img_features, labels, indexes, num_cluster, kmn, feature_dim=None):
    """Build one fixed-length vector per chunk by averaging features per cluster.

    For chunk ``i``, the rows of ``img_features[i]`` whose cluster id equals
    ``k`` are averaged, and the mean is written to columns
    ``[k * feature_dim, (k + 1) * feature_dim)`` of output row ``i``. A cluster
    with no rows in the chunk leaves its segment at zero.

    Args:
        img_features: sequence of 2-D arrays, one per chunk, each of shape
            (samples in chunk, feature_dim).
        labels: per-chunk labels; only ``len(labels)`` is used, to size the
            output.
        indexes: sequence of integer arrays; ``indexes[i]`` holds the positions
            in ``kmn`` of the samples in chunk ``i``.
        num_cluster: number of clusters, i.e. segments per output row.
        kmn: cluster id of every sample, addressed through ``indexes``.
        feature_dim: width of one feature vector. When omitted it is read from
            the first chunk; with no chunks it falls back to 1024, the width
            this function originally hard-coded.

    Returns:
        Float ndarray of shape ``(len(labels), feature_dim * num_cluster)``.
    """
    if feature_dim is None:
        feature_dim = np.shape(img_features[0])[-1] if len(img_features) else 1024

    biz_features = np.zeros([len(labels), feature_dim * num_cluster])
    cluster_of_sample = np.asarray(kmn)

    for idx, features in enumerate(img_features):
        features = np.asarray(features)
        # Cluster id of each row in this chunk, in the same order as the rows.
        cluster_ids = cluster_of_sample[indexes[idx]]

        for kn in range(num_cluster):
            members = features[cluster_ids == kn]
            # Empty clusters need no write: the segment is already zero.
            if len(members):
                biz_features[idx, feature_dim * kn:feature_dim * (kn + 1)] = members.mean(axis=0)

    return biz_features

In [None]:
def one_vs_rest_train_test(train_X, train_y, test_X, test_y):
    """Fit a one-vs-rest linear SVM on the training split and score the test split.

    Args:
        train_X: training feature matrix.
        train_y: training targets.
        test_X: test feature matrix.
        test_y: test targets.

    Returns:
        The value of the fitted classifier's ``score(test_X, test_y)``. For
        scikit-learn classifiers this is mean accuracy (subset accuracy when
        the targets are multilabel) -- see the scikit-learn docs to confirm
        for the installed version.
    """
    hinge_svm = LinearSVC(loss='hinge')
    ovr_model = OneVsRestClassifier(hinge_svm).fit(train_X, train_y)
    test_score = ovr_model.score(test_X, test_y)
    return test_score

In [None]:
# Load the two .npy arrays that the rest of the notebook works on.
# Both paths are relative to the current working directory; edit them if
# your files live somewhere else.
input_features = np.load(file='./features_234000.npy')
input_labels = np.load(file='./features_labels_234000.npy')

In [None]:
# Rows [0, n_train) train the classifier and rows [n_train, -n_dropped) test it.
# The final n_dropped rows are left out of both splits.
n_train = 200000
n_dropped = 1000

# Repeat the whole experiment for several cluster counts and print each score.
for num_cluster in [2,3,4,5]:
    # print(...) with one string argument works on both Python 2 and Python 3.
    print("num_cluster = " + str(num_cluster))
    # Reshuffle features and labels together. The globals are rebound on purpose
    # so that `kmn` below stays aligned row-for-row with `input_features`.
    input_features, input_labels = shuffle(input_features, input_labels)
    kmn_holder = MiniBatchKMeans(n_clusters=num_cluster)
    # Clustering is unsupervised: fit_predict ignores its `y` argument, so the
    # labels are not passed. Note that it is fitted on every row, including
    # the ones later used for testing.
    kmn = kmn_holder.fit_predict(input_features)
    kmn_train = kmn[:n_train]
    kmn_test = kmn[n_train:-n_dropped]
    train_data = input_features[:n_train]
    train_label = input_labels[:n_train]
    test_data = input_features[n_train:-n_dropped]
    test_label = input_labels[n_train:-n_dropped]
    # get_data returns indexes relative to the array it was given, which is why
    # each split is paired with the matching slice of cluster assignments.
    tr_img_features, tr_labels, tr_indexes = get_data(train_data, train_label)
    tr_biz_features = get_biz_features(tr_img_features, tr_labels, tr_indexes, num_cluster, kmn_train)
    ts_img_features, ts_labels, ts_indexes = get_data(test_data,test_label)
    ts_biz_features = get_biz_features(ts_img_features, ts_labels, ts_indexes, num_cluster, kmn_test)
    score = one_vs_rest_train_test(tr_biz_features, tr_labels, ts_biz_features, ts_labels)
    print("score = " + str(score))