In [48]:
import numpy as np
import h5py
import scipy.io
import pickle
from sklearn.linear_model import LinearRegression
from ranking import *
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge as KR

In [25]:
#Can change NuSVR to other types
#Matrix regression
from sklearn.svm import NuSVR as SVR
class kernel_svm:
    def __init__(self):
        self.x=0
        self.y=0
        self.s=0
        self.x_par=0 #Number of x parameters
        self.y_par=0 #Number of x parameters
        
    def fit(self,x,y,kernel='linear',C=1e3,gamma=0.1,degree=2,coef0=1):
        self.x = x
        self.y = y
        self.x_par = x.shape[1]
        try:
            self.y_par = y.shape[1] 
        except:
            self.y_par = 1
                
        self.s = [ SVR(kernel=kernel, C=C, gamma=gamma, degree=degree, coef0=coef0)] * self.y_par
        for i in range(0, self.y_par):
            if self.y_par == 1:
                self.s[i].fit(x,y)
            else:
                self.s[i].fit(x, y[:,i])
    
    def predict(self,x):
        y = np.zeros([len(x), self.y_par])
        for i in range(0, self.y_par):
            y[:,i] = self.s[i].predict(x)
        return y
    
    def error(self,y1,y2):
        return 1.0/y1.size * np.linalg.norm(y1-y2)

In [26]:
def MSE(a,b):
    mse =0.0
    print(a.shape, b.shape)
    if a.shape != b.shape:
        print("Size of vector mixmatch - cannot calculate Mean Squared error")
    for i in range(0,len(a)):
        mse += np.linalg.norm(b[i] - a[i])**2
    mse = 1.0/len(a)*mse
    return mse


In [5]:
#Evaluation metric
def F1_score(tags_pred, tags_actual,k1=3, k2=5):
    k1_count=0.0
    k2_count=0.0
    for i in range(0,k1):
        if tags_actual[tags_pred[i]] == 1:
            k1_count += 1

    for i in range(0,k2):
        if tags_actual[tags_pred[i]] == 1:
            k2_count += 1

    num_tags = 1.0*(len(tag_word_vectors) + sum(tags_actual ))/2
    
    k1_recall = 1.0*k1_count/num_tags
    k2_recall = 1.0*k2_count/num_tags 
    k1_precision = k1_count/k1
    k2_precision = k2_count/k2
    
    #print('--->' + str(k1_precision) + " : " + str(k2_precision))
    tmp1 = 2.0*k1_precision*k1_recall/(k1_precision + k1_recall)
    tmp2 = 2.0*k2_precision*k2_recall/(k2_precision+k2_recall)
    if tmp1 >= 0 and tmp2 >= 0:
        return [tmp1,tmp2]
    elif tmp1 >= 0:
        return [tmp1,0]
    elif tmp2 >= 0:
        return [0,tmp2]
    else:
        return [0,0]


In [6]:
def loadData(wvToUse = 'original'):
    print("Loading Data")
    global training_data, training_label, valid_data, valid_label, testing_data, testing_label
    global n_training, n_testing, n_valid
    global w_list, tag_word_vectors, len_tag_wv
    
    f = h5py.File('l2_normalized_semantic_SVM_full_data_with_val_291labels_no_zero.mat' )
    training_data = np.transpose(f["prepared_training_data"])
    training_label = np.transpose(f["prepared_training_label"])
    valid_data = np.transpose(f["prepared_val_data"])
    valid_label = np.transpose(f["prepared_val_label"])
    testing_data = np.transpose(f["prepared_testing_data"])
    testing_label = np.transpose(f["prepared_testing_label"])
    
    n_training = len(training_data)
    n_valid = len(valid_data)
    n_testing = len(testing_data)

    if wvToUse == 'original':
        tag_word_vectors = np.transpose(h5py.File('291labels.mat')["semantic_mat"])
    elif wvToUse == 'embeddings':
        tag_word_vectors = scipy.io.loadmat('embeddings.mat')["U"]

    len_tag_wv =tag_word_vectors.shape[1]
    
    print("Done")
    return training_data, training_label, valid_data, valid_label, testing_data, testing_label, n_training, n_testing, n_valid, tag_word_vectors, len_tag_wv

In [7]:
def modifyData(n_training_local=0, n_testing_local=0, n_valid_local=0):
    if n_training == 0 or n_testing == 0 or n_valid == 0:
        print("0 value entries not allowed")
    
    global training_data, training_label, valid_data, valid_label, testing_data, testing_label
    global n_training, n_testing, n_valid
    global w_list
    
    if n_training_local != 0:
        try:
            training_data = training_data[0:n_training_local]
            training_label = training_label[0:n_training_local]
        except:
            print('index (n_training) out of range, ignoring n_training for training')
            
        try:
            valid_data = valid_data[0:n_valid_local]
            valid_label = valid_label[0:n_valid_local]
        except:
            print('index (n_testing) out of range, ignoring n_testing for validation')
            
        try:
            testing_data = testing_data[0:n_testing_local]
            testing_label = testing_label[0:n_testing_local]
        except:
            print('index (n_valid) out of range, ignoring n_valid for test')
    n_training = len(training_data)
    n_valid = len(valid_data)
    n_testing = len(testing_data)

    #tag_word_vectors = np.transpose(h5py.File('291labels.mat')["semantic_mat"])

    print("Done")
    return training_data, training_label, valid_data, valid_label, testing_data, testing_label, n_training, n_testing, n_valid#, tag_word_vectors

  global n_training, n_testing, n_valid
  global n_training, n_testing, n_valid
  global n_training, n_testing, n_valid


In [8]:
def rankSVM():
    global w_list
    print("Ranking SVM for Training Data")
    r=RankSVM()
    w_list=np.zeros([n_training,len_tag_wv])
    for i in range(0,len(training_data)):
        r.fit(tag_word_vectors,training_label[i])
        w_list[i] = r.coef_
    print('Done')
    return w_list

In [9]:
def fitLinearRegression():
    global A
    print("Fitting Linear Regression model")
    lin_reg = LinearRegression(normalize=True)
    lin_reg.fit(training_data, w_list)
    print(lin_reg.score(training_data, w_list))
    A = lin_reg.coef_
    #print(w_list.shape," = ", A.shape, training_data.shape)
    print("Done")
    return A

In [59]:
def fitKernelRegression(method = 'ridge', kernel = 'poly', degree = '50000', gamma=0.5):
    global s
    
    #print("Fitting Support Vector Kernelized Regression model : " + method + " ; Kernel : " + kernel)
    
    if method == 'ridge':
        s=KR(kernel = kernel,degree=50000,gamma=gamma)
        s.fit(training_data,w_list)
        print("Ridge training score : " + str(s.score(training_data,w_list)) + " ... 1 best possible, <0 very bad")
    elif method == 'SVR':
        s = kernel_svm()
        s.fit(training_data, w_list,'poly',C=875,gamma=0.0001,degree=1,coef0=1)
        #w = s.predict(training_data)
        #print(s.error(w, w_list))
    #print("Done")
    return s

In [61]:
def printAccuracies(fitType = 'lin', n1 = 10, n2 = 10, n3 = 10, k=5): #fitType = linear/kernelized
    if fitType != 'lin' and fitType != 'kernel':
        print('Invalid fitType, fitType should be lin or kernel')
        return
    #Accuracy for Training Data
    #r=RankSVM()
    avg1 = 0
    avg2 = 0

    a = np.zeros(291)#Number of images fot which the tag (ith tag) is predicted right
    b = np.zeros(291)#Total number of occurences of the tags
    c = np.zeros(291)#Total number of occurences of the predicted tags
    
    for ii in range(0,n1):
        j=int(np.floor(np.random.rand(1)*n_training))
        if fitType == 'lin':#Linear Regression
            w = np.dot(training_data[j], np.transpose(A))
        else:#Kernelized Support vector Regression
            w = s.predict([training_data[j]]).reshape((len_tag_wv,))
            
        tags_pred_score = np.dot(w,np.transpose(tag_word_vectors))
        tag_pred_ranked = [i[0] for i in sorted(enumerate(tags_pred_score), key=lambda x:x[1])]
        tag_pred_ranked.reverse()
        
        #--------------------------------
        for l in range(0,k):
            c[tag_pred_ranked[l]] += 1
            if training_label[j][tag_pred_ranked[l]] == 1:
                a[tag_pred_ranked[l]] += 1
            
        for l in range(0,291):
            if training_label[j][l] == 1:
                b[l] += 1
        
        #--------------------------------
        
        [tmp1, tmp2] = F1_score(tag_pred_ranked, training_label[j],3,5)
        #print('TRAINING : ' + str(tmp1) + " : " + str(tmp2))
        avg1 += tmp1
        avg2 += tmp2
        
    bvg = 0
    b0cnt = 0
    for i in range(0,291):
        if b[i] != 0 and c[i]!=0 and a[i] != 0:
            pre=a[i]/c[i]
            rec=a[i]/b[i]
            bvg += 2*pre*rec/(pre+rec)
        elif b[i] == 0:
            b0cnt += 1
        
    #print("TRAIN   : " + str(avg1/n1) + " : "+ str(avg2/n1))
    #print("TRAIN 2 : " + str(bvg/(291-b0cnt)))
    
    
    #Accuracy for Validation Data
    avg1 =0 
    avg2 = 0
    for ii in range(0,n2):
        j=int(np.floor(np.random.rand(1)*n_valid))
        if fitType == 'lin': #Linear Regression
            w = np.dot(valid_data[j], np.transpose(A))
        else:  #Kernelized Support vector/Ridge Regression
            w = s.predict([valid_data[j]]).reshape((len_tag_wv,))
         
        #w = np.dot(valid_data[j], np.transpose(A))
        tags_pred_score = np.dot(w,np.transpose(tag_word_vectors)) 
        tag_pred_ranked = [i[0] for i in sorted(enumerate(tags_pred_score), key=lambda x:x[1])]
        tag_pred_ranked.reverse()

        [tmp1, tmp2] = F1_score(tag_pred_ranked, valid_label[j],3,5)

        avg1 += tmp1
        avg2 += tmp2

    #print("VALID : " + str(avg1/n2) + " : "+ str(avg2/n2))
    
    #Accuracy for Testing Data
    avg1 =0 
    avg2 = 0
    a = np.zeros(291)#Number of images fot which the tag (ith tag) is predicted right
    b = np.zeros(291)#Total number of occurences of the tags
    c = np.zeros(291)#Total number of occurences of the predicted tags
    
    for ii in range(0,n3):
        j=int(np.floor(np.random.rand(1)*n_testing))
        if fitType == 'lin':#Linear Regression
            w = np.dot(testing_data[j], np.transpose(A))
        else:#Kernelized Support vector Regression
            w = s.predict([testing_data[j]]).reshape((len_tag_wv,))
        
        tags_pred_score = np.dot(w,np.transpose(tag_word_vectors)) 
        tag_pred_ranked = [i[0] for i in sorted(enumerate(tags_pred_score), key=lambda x:x[1])]
        tag_pred_ranked.reverse()
        
        #--------------------------------
        for l in range(0,k):
            c[tag_pred_ranked[l]] += 1
            if testing_label[j][tag_pred_ranked[l]] == 1:
                a[tag_pred_ranked[l]] += 1
            
        for l in range(0,291):
            if testing_label[j][l] == 1:
                b[l] += 1
        
        #--------------------------------
        [tmp1, tmp2] = F1_score(tag_pred_ranked, testing_label[j],3,5)
        #print("TESTING : " + str(tmp1) + " : " + str(tmp2))
        avg1 += tmp1
        avg2 += tmp2
        
        bvg = 0
        b0 = 0
        for i in range(0,291):
            #print(a[i],b[i],c[i])
            if b[i] != 0 and c[i]!=0 and a[i] != 0:
                pre=a[i]/c[i]
                rec=a[i]/b[i]
                bvg += 2*pre*rec/(pre+rec)
            if b[i] == 0:
                b0 += 1
        
    #print("TEST  : " + str(avg1/n3) + " : "+ str(avg2/n3))
    print(str(gamma)+"\t"+str(bvg/(291-b0)))

In [81]:
#print('LINEAR')
#printAccuracies('lin',10,10,1000)
#print('k=3')
#for gamma in np.arange(0.3,0.5,0.01):
#    fitKernelRegression('rbf',gamma = gamma);
    #print('_________________________________________________________\n\n')
    #print('gamma : ' + str(gamma)+'\n')
#    printAccuracies('kernel',1,1, n_testing,3)

print('\nk=5')

fitKernelRegression('rbf',gamma = 0.5);
printAccuracies('kernel',1,1,n_testing,5)



k=5




0.08	0.293917223908


In [80]:
fitKernelRegression('rbf',gamma=0.08)
printAccuracies('kernel',1,1,1000,5)



0.08	0.289029943428


In [49]:
loadData('original');

Loading Data
Done


In [78]:
model = pickle.load(open('model.pkl'))

In [72]:
rankSVM();

Ranking SVM for Training Data
Done


In [None]:
tag_word_vectors=model['tag_word_vectors']
A=model['linear_model']
s=model['kernel_model']
w_list=model['w_list']

In [21]:
fitLinearRegression();

Fitting Linear Regression model


NameError: global name 'w_list' is not defined

In [38]:
fitKernelRegression('rbf',gamma =0.1);

Fitting Support Vector Kernelized Regression model : rbf ; Kernel : poly
Done


In [70]:
model = {}
model['tag_word_vectors'] = tag_word_vectors
model['linear_model'] = A
model['kernel_model'] = s
model['w_list'] = w_list
modelFile = open('model_50000.pkl', "w")
pickle.dump(model,modelFile)