### Data loading

In [10]:
import numpy as np
from collections import defaultdict


def read_input_graph(filename, verticles):
    
    vec_map = defaultdict(int)
    i =0
    for x in verticles:
        vec_map[x] = i
        i +=1
        
    neighbours = {}
    for line in open(filename):
        line = line.replace('[', ' ').replace(']', ' ')
        verticles = line.rstrip().split(' ')
        if len(verticles) >0:
            neighbours[vec_map[int(verticles[0])]] = list(map( lambda x : vec_map[int(x)], verticles[1::2]))
    
    return neighbours, vec_map

def read_output_data(filename):
    output = np.genfromtxt(filename, delimiter=' ')
    ids_output = output[..., 0].astype(np.intc)
    vectors = output[..., 1:]
    return vectors, ids_output


### Preprocessing

In [16]:
def get_X_y(vectors, neighbours, data_len=1000):
    """
        Returns 
        X: fabs(a - b)
        y: label 1 -> connected 
    """
    rand_idxs  = np.arange(vectors.shape[0])
    np.random.shuffle(rand_idxs)

    i = 0
    j=0
    b_true = []
    while len(b_true) < data_len//2 and i<rand_idxs.shape[0]:
        n = neighbours.get(rand_idxs[i])
        if n:
            b_true.append(np.random.choice(n, 1))
        i+=1

    b_true = np.array(b_true)
    data_len = min(data_len//2, b_true.shape[0])
    
    np.random.shuffle(rand_idxs)
    b_false = rand_idxs[:data_len]

    b = np.concatenate((b_true.reshape(b_true.shape[0], 1), b_false.reshape(b_false.shape[0], 1)), axis=0)

    y = np.concatenate((np.ones(b_true.shape[0]), np.zeros(b_false.shape[0])), axis=0).astype(np.intc)

    a = rand_idxs[:y.shape[0]]
    fabs_x = np.fabs(vectors[a]-vectors[b].reshape(y.shape[0], 48))    
    return fabs_x, y



### Score

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score



def score_regression(X, y, X_test, y_test):
    
    lr = LogisticRegression()
    clf = lr.fit(X, y)

    p = clf.predict(X_test)
    f1 = f1_score(y_test, p)
    accuracy = accuracy_score(y_test, p)
    recall = recall_score(y_test, p)
    prec = precision_score(y_test, p)
    print("score:")
    print("f1: {}".format(f1))
    print("accuracy: {}".format(accuracy))
    print("recall: {}".format(recall))
    print("prec: {}".format(prec))
    return f1, accuracy, recall, prec

In [19]:

vectors, ids_temp = read_output_data('dblp-mini-output/dblp-mini-output/1.out')
n, _ = read_input_graph('2017.in', ids_temp)

X, y = get_X_y(vectors, n)

vectors_t, ids_t = read_output_data('dblp-mini-output/dblp-mini-output/2.out')
neighbours_t, _ = read_input_graph('2018.in', ids_t)

X_test, y_test = get_X_y(vectors_t, neighbours_t)

score_regression(X, y, X_test, y_test)

score:
f1: 0.5806451612903227
accuracy: 0.6130952380952381
recall: 0.5357142857142857
prec: 0.6338028169014085


(0.5806451612903227,
 0.6130952380952381,
 0.5357142857142857,
 0.6338028169014085)