In [None]:
import tensorflow as tf
import numpy as np
from keras.models import Model
from keras.layers import Dense
from keras.optimizers import SGD
import cv2
import skimage
import os
from imgaug.imgaug import augmenters as iaa
from densenet121 import DenseNet
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
from sklearn.metrics import f1_score
import pandas as pd
from sklearn.cluster import MiniBatchKMeans

In [None]:
# Build the DenseNet network from the local `densenet121` module and load the
# weights stored next to the notebook.
model = DenseNet(
    reduction=0.5,
    classes=1000,
    weights_path='./densenet121_weights_tf.h5',
)
print(model.layers[-1].output_shape)

# Remove the two trailing layers so the network ends on an intermediate
# representation instead of the final output.
# NOTE(review): presumably these are the classification head and the remaining
# output is the 1024-d feature used further down - confirm in densenet121.py.
for _drop in range(2):
    model.layers.pop()

# Wrap the truncated graph in a new model that serves as the feature extractor.
output = model.layers[-1].output
new_model = Model(model.input, output)

In [None]:
def _pack_groups(groups):
    """Pack per-chunk arrays into one array, tolerating unequal chunk sizes.

    Equal-shaped chunks are stacked into a regular array. Chunks of different
    shapes are stored in a 1-D object array, because current NumPy refuses to
    build an array from a ragged nested sequence.
    """
    shapes = set(np.shape(group) for group in groups)
    if len(shapes) <= 1:
        return np.asarray(groups)
    packed = np.empty(len(groups), dtype=object)
    for position, group in enumerate(groups):
        packed[position] = group
    return packed


def get_data(features_set, labels_set, group_element_num=100):
    """Group image features by label and split every group into chunks.

    Images that share the same label are collected together (in order of
    appearance), then each collection is cut into consecutive chunks of at
    most ``group_element_num`` images; the last chunk of a label holds the
    remainder.

    Args:
        features_set: indexable collection of per-image features, aligned
            with ``labels_set``.
        labels_set: iterable of per-image labels. Each label is converted with
            ``tuple`` so it can be used as a dictionary key.
        group_element_num: maximum number of images per chunk.

    Returns:
        Tuple ``(img_features, labels, indexes)`` with one entry per chunk:
        the chunk's features, the chunk's label, and the positions (relative
        to ``features_set``) of the images in the chunk. ``img_features`` and
        ``indexes`` are regular arrays when all chunks have the same size and
        1-D object arrays otherwise.

    Raises:
        ValueError: if ``group_element_num`` is smaller than 1.
    """
    print("get_data")
    if group_element_num < 1:
        raise ValueError("group_element_num must be >= 1")

    # Remember only the positions per label; features are gathered per chunk.
    # Labels are visited in first-seen order so the output is deterministic.
    label_to_indexes = {}
    label_order = []
    for index, label in enumerate(labels_set):
        label = tuple(label)
        if label not in label_to_indexes:
            label_to_indexes[label] = []
            label_order.append(label)
        label_to_indexes[label].append(index)

    labels = []
    img_features = []
    indexes = []
    for label in label_order:
        positions = label_to_indexes[label]
        # Stepping by the chunk size yields ceil(len / size) chunks without
        # any division, so it behaves the same on Python 2 and Python 3.
        for start in range(0, len(positions), group_element_num):
            chunk = positions[start:start + group_element_num]
            labels.append(label)
            img_features.append(np.asarray([features_set[i] for i in chunk]))
            indexes.append(np.asarray(chunk))

    return _pack_groups(img_features), np.asarray(labels), _pack_groups(indexes)

In [None]:
def train(iteration, train_X, train_y, num_labels=9, num_folds=5):
    """Fit one LinearSVC per label column and report k-fold validation accuracy.

    For every label column the data is cut into ``num_folds`` contiguous
    folds; the column's classifier is fitted on all samples outside the fold
    and scored on the fold itself.

    Args:
        iteration: number of times the whole procedure is repeated.
        train_X: 2-D feature array (rows are samples).
        train_y: 2-D label array with at least ``num_labels`` columns.
        num_labels: number of label columns / classifiers.
        num_folds: number of contiguous validation folds.

    Returns:
        List of ``num_labels`` classifiers.

    NOTE(review): the same classifier object is refitted for every fold, so
    each returned classifier reflects only the last fit (last fold of the last
    iteration); earlier fits and iterations are presumably discarded by
    ``fit`` - confirm this is intended.
    """
    print("training")
    lst_clfs = [LinearSVC() for _ in range(num_labels)]
    # Floor division keeps the slice bounds integral on Python 3 as well.
    part = train_X.shape[0] // num_folds
    for i in range(iteration):
        print("Iteration: " + str(i))
        for k in range(num_labels):
            clf = lst_clfs[k]
            scores = []
            for fold in range(num_folds):
                lo, hi = fold * part, (fold + 1) * part
                val_X = train_X[lo:hi]
                val_y = train_y[lo:hi]
                # Everything outside [lo, hi) is training data; samples past
                # num_folds * part are therefore always in the training part.
                tr_X = np.vstack((train_X[:lo], train_X[hi:]))
                tr_y = np.vstack((train_y[:lo], train_y[hi:]))

                clf.fit(tr_X, tr_y[:, k])
                scores.append(clf.score(val_X, val_y[:, k]))
            # The scores come from the held-out fold, so this is validation
            # accuracy (the original message called it training accuracy).
            print("Validation accuracy: %0.2f (+/- %0.2f)" % (np.mean(scores), np.std(scores) * 2))

    return lst_clfs

In [None]:
def test(lst_clfs, test_X, test_y):
    for k in range(9):
        clf = lst_clfs[k]
        s = clf.score(test_X, test_y[:,k])
        print str(k) + "'s classifier, test acc = " + str(round(s, 4))
    

In [None]:
def get_biz_features(img_features, labels, indexes, num_cluster, kmn, feature_dim=None):
    """Build one fixed-length feature vector per group from clustered image features.

    For every group the images are split by their cluster id and the features
    of each cluster are averaged; the ``num_cluster`` mean vectors are written
    side by side, so cluster ``k`` occupies columns
    ``[k * feature_dim, (k + 1) * feature_dim)``.

    Args:
        img_features: sequence of 2-D arrays, one per group (rows are images).
        labels: sequence with one entry per group; only its length is used.
        indexes: sequence with, per group, the positions of its images in ``kmn``.
        num_cluster: number of clusters.
        kmn: array of cluster ids, indexed by the positions in ``indexes``.
        feature_dim: width of a single image feature. When omitted it is taken
            from the first group, falling back to 1024 when there are no groups.

    Returns:
        Array of shape ``(len(labels), feature_dim * num_cluster)``.
    """
    if feature_dim is None:
        if len(img_features):
            feature_dim = np.asarray(img_features[0]).shape[-1]
        else:
            feature_dim = 1024

    kmn = np.asarray(kmn)
    biz_features = np.zeros([len(labels), feature_dim * num_cluster])

    for idx, features in enumerate(img_features):
        features = np.asarray(features)
        cluster_label = kmn[indexes[idx]]

        for kn in range(num_cluster):
            members = features[cluster_label == kn]
            # A group may have no image in this cluster; its slot then keeps
            # the zeros it was initialised with.
            if len(members) == 0:
                continue
            biz_features[idx, feature_dim * kn:feature_dim * (kn + 1)] = np.mean(members, axis=0)

    return biz_features

In [None]:
def my_shuffle(arr_list):
    """Shuffle several arrays with one shared random permutation.

    A single permutation of ``range(len(arr_list[0]))`` is drawn from NumPy's
    global random state and applied to every array, so entries that were
    aligned before the call stay aligned afterwards.

    Args:
        arr_list: sequence of arrays supporting integer-array indexing.

    Returns:
        Tuple with the reordered arrays, in the same order as ``arr_list``.
    """
    order = np.arange(len(arr_list[0]))
    np.random.shuffle(order)
    return tuple(arr[order] for arr in arr_list)

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
def one_vs_rest_train_test(train_X, train_y):
    """Fit and return a one-vs-rest classifier built on hinge-loss linear SVMs.

    Despite the name, this function only trains; evaluation is left to the
    caller.

    Args:
        train_X: training feature matrix.
        train_y: training targets passed straight to ``fit``.

    Returns:
        The fitted ``OneVsRestClassifier``.
    """
    base_svm = LinearSVC(loss='hinge')
    ovr = OneVsRestClassifier(base_svm)
    ovr.fit(train_X, train_y)
    return ovr

In [None]:
# Directory holding the pre-computed training features and labels. It can be
# overridden through the FEATURES_DIR environment variable so the notebook is
# not tied to one machine; the default is the original location.
FEATURES_DIR = os.environ.get('FEATURES_DIR', '/home/rendaxuan/Documents/workspace/4032')

# Per-image features and their labels (row i of both files is assumed to
# describe the same image - they are shuffled together further down).
input_features = np.load(os.path.join(FEATURES_DIR, 'features_234000.npy'))
input_labels = np.load(os.path.join(FEATURES_DIR, 'features_labels_234000.npy'))

In [None]:
# Hold-out experiment: cluster all image features, build group-level features
# for a train and a test split, and fit the one-vs-rest classifier.
n_train = 188000    # the first n_train shuffled samples form the training split
n_reserved = 1000   # the last n_reserved samples are kept out of both splits
for num_cluster in [4]:
    print("num_cluster = " + str(num_cluster))
    # shuffle input data
    input_features, input_labels = my_shuffle([input_features, input_labels])
    kmn_holder = MiniBatchKMeans(n_clusters=num_cluster)
    # Clustering is unsupervised, so the labels are not passed along.
    kmn = kmn_holder.fit_predict(input_features)
    # Cluster ids must be sliced exactly like the data they describe, because
    # get_data returns indexes relative to the split it receives.
    # (The original train slice ended at 1880000, one zero too many.)
    kmn_train = kmn[:n_train]
    kmn_test = kmn[n_train:-n_reserved]
    train_data = input_features[:n_train]
    train_label = input_labels[:n_train]
    test_data = input_features[n_train:-n_reserved]
    test_label = input_labels[n_train:-n_reserved]
    tr_img_features, tr_labels, tr_indexes = get_data(train_data, train_label)
    tr_biz_features = get_biz_features(tr_img_features, tr_labels, tr_indexes, num_cluster, kmn_train)
    ts_img_features, ts_labels, ts_indexes = get_data(test_data, test_label)
    ts_biz_features = get_biz_features(ts_img_features, ts_labels, ts_indexes, num_cluster, kmn_test)
    classifier = one_vs_rest_train_test(tr_biz_features, tr_labels)

In [None]:
# Rebuild the evaluation set from a fixed window of 2000 samples near the end
# of the (shuffled) data.
test_data = input_features[-3000:-1000]
test_label = input_labels[-3000:-1000]
# get_data returns indexes relative to test_data, so the cluster ids have to
# come from the same window of `kmn`. Reusing the earlier kmn_test slice
# (which starts at sample 188000) would look up the clusters of other images.
kmn_test = kmn[-3000:-1000]
ts_img_features, ts_labels, ts_indexes = my_shuffle(get_data(test_data, test_label))
ts_biz_features = get_biz_features(ts_img_features, ts_labels, ts_indexes, num_cluster, kmn_test)

In [None]:
# Predict a label vector for every group in the evaluation features.
predict = classifier.predict(ts_biz_features)

In [None]:
# Number of groups whose complete predicted label vector equals the true one
# (exact match over all label columns).
count = sum(
    1
    for row in range(predict.shape[0])
    if np.array_equal(predict[row], ts_labels[row])
)

In [None]:
# Fraction of groups predicted exactly right.
float(count) / predict.shape[0]

In [None]:
# Score the classifier on the same evaluation features and labels through its
# own `score` method.
# NOTE(review): for multilabel targets this is presumably subset (exact-match)
# accuracy, i.e. the same quantity as the manual count ratio - confirm.
ele_predict = classifier.score(ts_biz_features, ts_labels)

In [None]:
# Display the value returned by classifier.score.
ele_predict

In [None]:
# business features
def get_img_features(images, model=None):
    """Compute one mean feature vector per image list.

    Every entry of ``images`` is passed through the model and the predictions
    are averaged over the first axis.

    Args:
        images: iterable of image batches, each accepted by ``model.predict``.
        model: object with a ``predict`` method; defaults to the notebook's
            global ``new_model``.

    Returns:
        Array with one averaged prediction per entry of ``images``.
    """
    if model is None:
        model = new_model
    # The original appended to an undefined `img_features` list and returned
    # the untouched (empty) `biz_features`.
    biz_features = [np.mean(model.predict(img_list), axis=0) for img_list in images]
    return np.asarray(biz_features)

In [None]:
 ''' get dictionary of biz_ids and all the corresponding photos '''
def biz_id_to_photo(file_name):
    with open(file_name,"r") as f:
        lines = f.readlines()[1:]
    biz_to_photo = {}
    for line in lines:
        if line.split(",")[1].rstrip() not in biz_to_photo.keys():
            biz_to_photo[line.split(",")[1].rstrip()] = [line.split(",")[0]]
        else:
            biz_to_photo[line.split(",")[1].rstrip()].append(line.split(",")[0])
    return biz_to_photo

In [None]:
# Photo-to-business mapping of the test set; the `photo_id` and `business_id`
# columns are read from it further down.
test_photo_to_biz_df = pd.read_csv('./test_photo_to_biz.csv')

In [None]:
# Pre-computed features of the test photos. Rows are later looked up by a
# photo's position in `photo_ids`.
# NOTE(review): this assumes the file was written in the same directory
# listing order that `photo_ids` is built from - confirm.
test_img_features = np.load('./features_test.npy')

In [None]:
# Collect the numeric ids of all test photos: every '.jpg' file in the photo
# directory except those whose name starts with "._".
data_dir = './test_photos/'
paths = [
    os.path.join(data_dir, name)
    for name in os.listdir(data_dir)
    if name.endswith('.jpg') and not name.startswith("._")
]
# The id is the file name without directory prefix and extension.
photo_ids = [int(path.replace(data_dir, "").replace(".jpg", "")) for path in paths]

In [None]:
# Group the test image features by business id.
biz_img_features_df = pd.DataFrame(columns=["business_id", "img_features"])
biz_img_features = {}

# Map every photo id to its first position in `photo_ids` once, instead of
# running a linear `list.index` search (twice) for every row.
photo_position = {}
for position, photo_id in enumerate(photo_ids):
    photo_position.setdefault(photo_id, position)

num_features = len(test_img_features)
counter = 0
for index, row in test_photo_to_biz_df.iterrows():
    position = photo_position.get(row['photo_id'])
    # Skip photos without a feature row: either the photo is not in the photo
    # directory (the original raised ValueError here) or its position lies
    # beyond the features computed so far.
    if position is None or position >= num_features:
        continue
    biz_img_features.setdefault(row['business_id'], []).append(test_img_features[position])
    counter += 1
    if counter % 500 == 0:
        print(counter)

In [None]:
# Stack the image features of all businesses into one 2-D matrix: the rows of
# each business are contiguous, and businesses follow the iteration order of
# `biz_img_features`. (The original first assigned `values()[0]`, which was
# immediately overwritten and fails on Python 3 dict views.)
test_features = np.vstack([np.vstack(biz_img_features[i]) for i in biz_img_features.keys()])

In [None]:
# Sweep the number of clusters: train on all labelled features and build the
# group-level features of the real test set.
for num_cluster in [2,3,4,5]:
    print("num_cluster = " + str(num_cluster))
    input_features, input_labels = my_shuffle([input_features, input_labels])
    kmn_holder = MiniBatchKMeans(n_clusters=num_cluster)
    # Clustering is unsupervised, so the labels are not passed along.
    kmn = kmn_holder.fit_predict(input_features)
    kmn_train = kmn[:]
    # Assign the test photos to the clusters learned on the training features.
    # Fitting a second, independent KMeans on the test set would produce
    # cluster ids unrelated to the training ones, so column block k of the
    # group features would mean different things for train and test.
    kmn_test = kmn_holder.predict(test_features)
    train_data = input_features
    train_label = input_labels
    test_data = test_features
    # One label per photo row of test_features (same business order as the
    # stacking of test_features), wrapped in a 1-tuple because get_data turns
    # every label into a tuple. The original passed one entry per business,
    # which does not line up with the per-photo rows.
    test_label = [(biz_id,) for biz_id in biz_img_features.keys() for _photo in biz_img_features[biz_id]]
    tr_img_features, tr_labels, tr_indexes = get_data(train_data, train_label)
    tr_biz_features = get_biz_features(tr_img_features, tr_labels, tr_indexes, num_cluster, kmn_train)
    ts_img_features, ts_labels, ts_indexes = get_data(test_data, test_label)
    ts_biz_features = get_biz_features(ts_img_features, ts_labels, ts_indexes, num_cluster, kmn_test)
    classifier = one_vs_rest_train_test(tr_biz_features, tr_labels)
    # NOTE(review): this predicts on the first 20 *training* groups (inspected
    # in the following cells), not on ts_biz_features - confirm intent.
    predict = classifier.predict(tr_biz_features[:20])

In [None]:
# 0: good_for_lunch
# 1: good_for_dinner
# 2: takes_reservations
# 3: outdoor_seating
# 4: restaurant_is_expensive
# 5: has_alcohol
# 6: has_table_service
# 7: ambience_is_classy
# 8: good_for_kids

In [None]:
# Predictions for training groups 10-19 (at this point `predict` holds the
# output for tr_biz_features[:20]).
predict[10:20]

In [None]:
# True labels of training groups 10-19, for comparison with the predictions.
tr_labels[10:20]

In [None]:
# Image positions (within the training data) that make up training group 19.
tr_indexes[19]

In [None]:
# Shape of the per-group index array returned by get_data.
tr_indexes.shape