In [1]:
import os
import numpy as np
import pandas as pd
import json
import lightgbm as lgb
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import balanced_accuracy_score as bacc
from gensim.models import Word2Vec

In [2]:
graphnames = ['Cora', 'Citeseer', 'Pubmed']
graphname = graphnames[0]
emb_size = 10
data_dir = os.path.expanduser("/home/koki/Desktop/Data/Graphs/"+graphname)

In [3]:
trained_embsize = 50

In [4]:
rwalk_path = os.path.join(data_dir, "vectors_rwalk_all_" + str(emb_size) + ".json")
with open(rwalk_path, "r") as read_file:
    rwalk_vectors = json.load(read_file)

In [5]:
minwise_path = os.path.join(data_dir, "vectors_minwise_all_" + str(emb_size) + ".json")
with open(minwise_path, "r") as read_file:
    minwise_vectors = json.load(read_file)

In [6]:
l1_path = os.path.join(data_dir, "vectors_l1_all_" + str(emb_size) + ".json")
with open(l1_path, "r") as read_file:
    l1_vectors = json.load(read_file)

In [7]:
def get_pairs(vectors):
    pairs = []
    cnt = 0
    for node, features in vectors.items():
        for feature in features:
            pairs.append([node, feature[0]])
    return pairs

In [8]:
rwalk_pairs = get_pairs(rwalk_vectors)

In [9]:
minwise_pairs = get_pairs(minwise_vectors)

In [10]:
l1_pairs = get_pairs(l1_vectors)

In [11]:
len(rwalk_pairs), len(minwise_pairs), len(l1_pairs)

(27080, 27080, 27080)

In [12]:
print(rwalk_pairs[:3])

[['35', '3231'], ['35', '576257'], ['35', '132806']]


In [13]:
model_rwalk_exists = os.path.exists(data_dir + \
                            '/w2v_rwalk_emb_'+str(emb_size)+ '_dim_' + str(trained_embsize) +'.model')
print(model_rwalk_exists)

True


In [14]:
if model_rwalk_exists:
    model_rwalk = Word2Vec.load(data_dir + \
                            '/w2v_rwalk_emb_'+str(emb_size)+ '_dim_' + str(trained_embsize) + '.model')
else:
    model_rwalk = Word2Vec(
                rwalk_pairs, 
                size=trained_embsize, 
                window=2, 
                min_count=0, 
                sg=1, 
                workers=4, 
                #negative=1,
                iter=300)

In [15]:
if not model_rwalk_exists:
    model_rwalk.save(data_dir + '/w2v_rwalk_emb_'+str(emb_size)+'_dim_' + str(trained_embsize) +'.model')

In [16]:
model_minwise_exists = os.path.exists(data_dir + \
                            '/w2v_minwise_emb_'+str(emb_size)+ '_dim_' + str(trained_embsize) + '.model')
print(model_minwise_exists)

True


In [17]:
if model_minwise_exists:
    model_minwise = Word2Vec.load(data_dir + \
                                  '/w2v_minwise_emb_'+str(emb_size)+'_dim_' + str(trained_embsize) + '.model')
else:
    model_minwise = Word2Vec(
                minwise_pairs, 
                size=trained_embsize, 
                window=2, 
                min_count=0, 
                sg=1, 
                workers=4, 
                #negative=2,
                iter=300)

In [18]:
if not model_minwise_exists:
    model_minwise.save(data_dir + '/w2v_minwise_emb_'+str(emb_size)+ '_dim_' + str(trained_embsize) + '.model')

In [19]:
model_l1_exists = os.path.exists(data_dir + \
                                 '/w2v_l1_emb_'+str(emb_size)+ '_dim_' + str(trained_embsize) + '.model')
print(model_l1_exists)

True


In [20]:
if model_l1_exists:
    model_l1 = Word2Vec.load(data_dir + '/w2v_l1_emb_'+str(emb_size)+'_dim_' + str(trained_embsize)+'.model')
else:
    model_l1 = Word2Vec(
                minwise_pairs, 
                size=trained_embsize, 
                window=2, 
                min_count=0, 
                sg=1, 
                workers=4, 
                #negative=2,
                iter=300)

In [21]:
if not model_l1_exists:
    model_l1.save(data_dir + '/w2v_l1_emb_'+str(emb_size)+'_dim_' + str(trained_embsize)+'.model')

In [22]:
nodes_with_labels = pd.read_csv(data_dir + '/nodes_with_labels.csv')

In [23]:
def get_X_y(model, nodes_with_labels):
    X = []
    y = []
    labels = {}
    for idx, row in nodes_with_labels.iterrows():
        X.append([float(x) for x in model[str(row['node'])]])
        if row['label'] not in labels:
            labels[row['label']] = len(labels)
        y.append(labels[row['label']])
    X = np.array(X)
    return X, y

In [24]:
X_rw, y_rw = get_X_y(model_rwalk, nodes_with_labels)

  


In [25]:
X_mw, y_mw = get_X_y(model_minwise, nodes_with_labels)

  


In [26]:
X_l1, y_l1 = get_X_y(model_l1, nodes_with_labels)

  


In [27]:
def get_mean_bacc(X, y, nr_iters):
    lgb_params = {'objective':'multiclass',
                  'metric': 'multi_error',
            'boosting_type':'gbdt',
            'n_jobs':4,
            'max_depth':-1,
             'num_class': len(set(y)),
            'learning_rate':0.1,
            'tree_learner':'serial',
            'n_estimators':2000,
            'verbose':-1,
            'seed': 73,
            'feature_fraction':1,
            'badding_seed' : 1}
    accs = []    
    for i in range(nr_iters):
        print('Iter', i)
        X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8)
        clf = lgb.LGBMClassifier( **lgb_params) 
        X_train_clf, X_test, y_train_clf, y_test =  train_test_split(X_train, y_train)
        clf.fit(X_train_clf, y_train_clf, eval_set=[(X_train_clf, y_train_clf), (X_test, y_test)], 
                    early_stopping_rounds=50, verbose=100)
        y_pred = clf.predict(X_val)
        acc = bacc(y_pred, y_val)
        accs.append(acc)
        print('Balanced accuracy score', acc)
    mean = 100.0*np.round(np.mean(accs), 3)
    std = 100.0*np.round(np.std(accs), 3)
    print("{}\% $\pm$ {}\%".format(np.round(mean, 1), np.round(std, 1)))
        

In [28]:
get_mean_bacc(X_rw, y_rw, 10)

Iter 0
Training until validation scores don't improve for 50 rounds
[100]	training's multi_error: 0	valid_1's multi_error: 0.278598
Early stopping, best iteration is:
[77]	training's multi_error: 0	valid_1's multi_error: 0.263838
Balanced accuracy score 0.78211801098798
Iter 1
Training until validation scores don't improve for 50 rounds
[100]	training's multi_error: 0	valid_1's multi_error: 0.247232
[200]	training's multi_error: 0	valid_1's multi_error: 0.232472
[300]	training's multi_error: 0	valid_1's multi_error: 0.221402
Early stopping, best iteration is:
[331]	training's multi_error: 0	valid_1's multi_error: 0.215867
Balanced accuracy score 0.7738339783784277
Iter 2
Training until validation scores don't improve for 50 rounds
[100]	training's multi_error: 0	valid_1's multi_error: 0.225092
Early stopping, best iteration is:
[61]	training's multi_error: 0	valid_1's multi_error: 0.217712
Balanced accuracy score 0.7981901925613242
Iter 3
Training until validation scores don't improve 

In [29]:
get_mean_bacc(X_mw, y_mw, 10)

Iter 0
Training until validation scores don't improve for 50 rounds
[100]	training's multi_error: 0	valid_1's multi_error: 0.202952
Early stopping, best iteration is:
[120]	training's multi_error: 0	valid_1's multi_error: 0.195572
Balanced accuracy score 0.7968610104875262
Iter 1
Training until validation scores don't improve for 50 rounds
[100]	training's multi_error: 0	valid_1's multi_error: 0.197417
[200]	training's multi_error: 0	valid_1's multi_error: 0.182657
Early stopping, best iteration is:
[213]	training's multi_error: 0	valid_1's multi_error: 0.178967
Balanced accuracy score 0.7999360323828035
Iter 2
Training until validation scores don't improve for 50 rounds
[100]	training's multi_error: 0	valid_1's multi_error: 0.221402
Early stopping, best iteration is:
[67]	training's multi_error: 0	valid_1's multi_error: 0.214022
Balanced accuracy score 0.7877491097508839
Iter 3
Training until validation scores don't improve for 50 rounds
[100]	training's multi_error: 0	valid_1's multi

In [30]:
get_mean_bacc(X_l1, y_l1, 10)

Iter 0
Training until validation scores don't improve for 50 rounds
[100]	training's multi_error: 0	valid_1's multi_error: 0.208487
[200]	training's multi_error: 0	valid_1's multi_error: 0.195572
Early stopping, best iteration is:
[193]	training's multi_error: 0	valid_1's multi_error: 0.193727
Balanced accuracy score 0.823056358385177
Iter 1
Training until validation scores don't improve for 50 rounds
[100]	training's multi_error: 0	valid_1's multi_error: 0.206642
Early stopping, best iteration is:
[135]	training's multi_error: 0	valid_1's multi_error: 0.193727
Balanced accuracy score 0.8314301579985722
Iter 2
Training until validation scores don't improve for 50 rounds
[100]	training's multi_error: 0	valid_1's multi_error: 0.191882
Early stopping, best iteration is:
[116]	training's multi_error: 0	valid_1's multi_error: 0.182657
Balanced accuracy score 0.8166035008150239
Iter 3
Training until validation scores don't improve for 50 rounds
[100]	training's multi_error: 0	valid_1's multi

In [31]:
X_mw[0]

array([ 0.54943544, -0.03144258,  0.1593027 ,  0.34443098,  0.62527102,
        1.39308059,  0.50897878,  0.18592592, -0.26989779,  0.62045872,
        0.77887779, -0.49794203,  0.67839801, -0.60232335,  0.21792775,
       -0.67680496, -0.65120906,  0.18713106, -0.42362121, -0.93551505,
       -0.74547589,  0.27052885,  0.97278225, -0.4303042 , -0.55773205,
       -0.35216695, -0.24470426, -0.1260681 ,  0.34674758, -0.00248211,
       -0.25157702, -0.13554797, -0.87472546,  0.32508022, -0.04656049,
       -0.12327956, -0.89316028, -1.1099757 ,  0.05556352, -0.02967277,
        0.64539856,  1.2539674 , -0.9514541 ,  0.07091854,  0.10108402,
        0.03459425,  0.37477243, -0.33428699, -0.57108504, -0.77540177])

In [32]:
Counter(y_l1), Counter(y_rw)

(Counter({0: 818, 1: 180, 2: 217, 3: 426, 4: 351, 5: 418, 6: 298}),
 Counter({0: 818, 1: 180, 2: 217, 3: 426, 4: 351, 5: 418, 6: 298}))