In [1]:
from __future__ import print_function
import grpc
import sgd_pb2_grpc
import sgd_pb2
from random import shuffle, randint
from collections import Counter
import operator
import threading as th
import queue

In [2]:
def zeros(x):
    '''
    Builds a fresh list of length x whose entries are all the integer 0
    '''
    return [0 for _ in range(x)]

def hinge_loss(y, X, w):
    '''
    Per-sample hinge loss max(1 - y_i * (X@w)_i, 0) given:
    y: label vector
    X: feature matrix (one row per label)
    w: weight vector
    Returns a list holding one loss value per entry of y
    '''
    scores = multiply_matrix(X, w)
    return [max(1 - y[n] * scores[n], 0) for n in range(len(y))]

def multiply(x, w):
    '''
    Dot product between the sparse vector x (a mapping index -> value)
    and the vector w, which is indexed with the keys of x
    '''
    total = 0
    for index, value in x.items():
        total += value * w[index]
    return total

def multiply_matrix(X, w):
    '''
    Matrix-vector product X@w, computed row by row with multiply
    Returns a list with one entry per row of X, in row order
    '''
    return [multiply(X[row], w) for row in range(len(X))]

def prediction(X, w):
    '''
    Labels every row of X by the sign of its score in X@w:
    X: feature matrix
    w: weight vector
    Returns a list with 1 where the score is strictly positive and -1 otherwise
    '''
    scores = multiply_matrix(X, w)
    return [1 if score > 0 else -1 for score in scores]

def accuracy(y_pred, y):
    '''
    Share of positions at which the predicted label equals the true label:
    y_pred: predicted labels
    y: test labels
    The number of matching pairs is divided by len(y)
    '''
    hits = 0
    for guess, truth in zip(y_pred, y):
        hits += (guess == truth)
    return hits / len(y)

def calculate_primal(y, X, w, lambda_):
    '''
    Primal objective: the summed hinge loss plus the regularizer
    lambda_ / 2 * (sum of the squared weights)
    '''
    data_term = sum(hinge_loss(y, X, w))
    squared_norm = sum([w_i**2 for w_i in w])
    return data_term + lambda_ / 2 * squared_norm

def set_labels(cat, id_, id_to_labels):
    '''
    Builds the SVM label vector for the category 'cat': one entry per article
    id in id_, 1 if 'cat' is among id_to_labels[id] and -1 otherwise
    '''
    labels = []
    for article in id_:
        labels.append(1 if cat in id_to_labels[article] else -1)
    return labels

def split_data(tx, ty, ratio, seed=1):
    '''
    Shuffles the samples and splits them into a training and a test part

    tx: feature rows
    ty: labels, aligned with tx (ty[i] belongs to tx[i])
    ratio: fraction of the samples dedicated to training
    seed: seed of the shuffle; the same seed always produces the same split
          (pass None to seed from the system and get a different split each call)

    Returns x_train, y_train, x_test, y_test
    '''
    # Imported locally so the function does not depend on the notebook's import cell
    from random import Random

    # Shuffle the indices with a private generator: the split is reproducible
    # for a given seed and the global random state is left untouched
    order = list(range(len(tx)))
    Random(seed).shuffle(order)

    # Apply the same permutation to features and labels so they stay aligned
    tx_shuffled = [tx[i] for i in order]
    ty_shuffled = [ty[i] for i in order]

    # Split by ratio
    split_pos = int(len(tx) * ratio)
    x_train = tx_shuffled[:split_pos]
    x_test = tx_shuffled[split_pos:]
    y_train = ty_shuffled[:split_pos]
    y_test = ty_shuffled[split_pos:]

    return x_train, y_train, x_test, y_test

def inbalance(labels):
    '''
    Balancing factors for the two classes, used to weight the penalizer terms
    when calculating the loss: (0.5 * number of labels) / (count of the class)
    Returns the pair (factor for label -1, factor for label 1)
    '''
    half = 0.5 * len(labels)
    tally = Counter(labels)
    return (half / tally[-1], half / tally[1])

def test(input_, q, j):
    '''
    Thread target: sends input_ through stub1.ComputeTask and puts the
    grad_up field of the reply, converted to a dict, on the queue q
    j: accepted but not used by this function
    '''
    response = stub1.ComputeTask(input_)
    gradient = dict(response.grad_up)
    q.put(gradient)

In [3]:
# Reading of the files
data_paths = [
    "../lyrl2004_vectors_test_pt0.dat",
    "../lyrl2004_vectors_test_pt1.dat",
    "../lyrl2004_vectors_test_pt2.dat",
    "../lyrl2004_vectors_test_pt3.dat",
]

# Parsing the files and storing the ids of the articles along with their respective features
id_ = []
samples = []
for path in data_paths:
    # The with block closes each file as soon as it has been parsed
    with open(path) as data_file:
        for line in data_file:
            fields = line.split()
            if not fields:
                # Blank line: nothing to parse
                continue
            id_.append(fields[0])
            # NOTE(review): this keeps the tokens from the third one on, so the second
            # whitespace-separated token is dropped; confirm it is not a feature
            samples.append(fields[2:])

In [4]:
# Get a mapping of each article id to the set of categories it belongs to
cat = []            # category of every line, in file order
cat_count = {}      # category -> list of the article ids listed under it
id_to_labels = {}   # article id -> list of its categories
# The with block closes the file; iterating it avoids loading every line at once
with open("../rcv1-v2.topics.qrels") as qrels_file:
    for line in qrels_file:
        # First field is the category, second field is the article id
        s = line.split(' ')
        id_to_labels.setdefault(s[1], []).append(s[0])
        cat_count.setdefault(s[0], []).append(s[1])
        cat.append(s[0])

In [5]:
# Label vector for the chosen category: 1 for articles tagged "ECAT", -1 otherwise
y = set_labels("ECAT", id_, id_to_labels)

# Sparse feature matrix: one dict per article that maps the integer feature index
# to its float value, built from the "index:value" tokens of the sample
X = []
for sample in samples:
    pairs = (feature.split(':') for feature in sample)
    X.append({int(pair[0]): float(pair[1]) for pair in pairs})

In [6]:
# Percentage of positive examples per category: number of article ids listed
# under the category divided by the number of loaded articles
# NOTE(review): the ids come from the whole qrels file while len(y) only counts
# the articles loaded above; confirm both refer to the same set of articles
sum_count = sum(len(ids) for ids in cat_count.values())
count = {name: (len(ids) / len(y)) * 100 for name, ids in cat_count.items()}

# Five most frequent categories
sorted(count.items(), key=operator.itemgetter(1), reverse=True)[:5]

[('CCAT', 48.80891886875772),
 ('GCAT', 30.625587988710617),
 ('MCAT', 26.216456644032434),
 ('C15', 19.428106980345976),
 ('ECAT', 15.349465290266426)]

In [7]:
# Shuffle the samples, then keep 80% of them for training and the rest for testing
x_train, y_train, x_test, y_test = split_data(tx=X, ty=y, ratio=0.8)

In [8]:
# Balancing factors of the training labels: c1 belongs to the -1 class and
# c2 to the 1 class (used to fairly penalize the negative vs positive examples)
(c1, c2) = inbalance(labels=y_train)

In [9]:
# Show both balancing factors on one line
print('{} {}'.format(c1, c2))

0.5876636259183831 3.3518099426181154


In [10]:
# Pick a random sample from the training set
# randint includes both bounds, so the upper bound has to be the last valid index
rd = randint(0, len(x_train) - 1)
batch = x_train[rd]
lab = y_train[rd]

In [11]:
# Set up 2 channels for the communication, then one SGD stub per channel
# NOTE(review): only stub1 is referenced by the code visible in this notebook
channel1, channel2 = (
    grpc.insecure_channel(address)
    for address in ('localhost:50051', 'localhost:52251')
)
stub1, stub2 = (sgd_pb2_grpc.SGDStub(channel) for channel in (channel1, channel2))

In [12]:
from tqdm import tqdm

In [17]:
# Define the hyperparameters for the SGD
max_iter = 10000
worker = 10          # number of gradient requests issued in parallel per iteration
q = queue.Queue()    # receives the gradient updates produced by the worker threads
weights = zeros(47237)  # NOTE(review): presumably the size of the feature space; confirm
l_rate = 0.3
lambda_ = 0.001

# Performs the SVM using SGD
for i in tqdm(range(max_iter)):
    threads = []
    for j in range(worker):
        # randint includes both bounds, so the upper bound has to be the last valid index
        rd = randint(0, len(x_train) - 1)
        batch = x_train[rd]
        lab = y_train[rd]
        args_ = (sgd_pb2.LWB(labels=lab, weights=weights, batch=batch, corr_1=c1, corr_2=c2, lambda_=lambda_), q, j)
        t = th.Thread(target=test, args=args_)
        threads.append(t)
        t.start()

    # Wait for every request of this iteration before the weights are modified
    for t in threads:
        t.join()

    # Every producer has finished, so the queue can be drained without blocking
    while not q.empty():
        update = q.get()
        for x in update:
            weights[x] -= l_rate * update[x]

print('Training accuracy = {}'.format(accuracy(prediction(x_train, weights), y_train)))
print('Test accuracy = {}'.format(accuracy(prediction(x_test, weights), y_test)))

Training accuracy = 0.907505775889103
Test accuracy = 0.9054098161315303


In [17]:
import pandas as pd

In [21]:
# Fraction of the weights that are non-zero
# (Series.nonzero() no longer exists in pandas >= 1.0, so the count is done in plain Python)
sum(1 for weight in weights if weight != 0) / len(weights)

20310

In [126]:
from sklearn.metrics import f1_score, precision_score, recall_score, precision_recall_curve

In [125]:
# F1 score over the full data set; y is the label vector aligned with the rows of X
# (the name `labels` is not defined anywhere in this notebook)
f = f1_score(y, prediction(X, weights))

In [127]:
# Precision and recall over the full data set; y is the label vector aligned with
# the rows of X (the name `labels` is not defined anywhere in this notebook)
y_pred_all = prediction(X, weights)  # computed once and shared by both metrics
p = precision_score(y, y_pred_all)
r = recall_score(y, y_pred_all)

In [128]:
# Show F1 score, precision and recall on one line
scores = (f, p, r)
print(*scores)

0.740962516982 0.643446356275 0.873316104438
