## Link prediction using the discrete embeddings

In [1]:
import os
import numpy as np
import pandas as pd
import json
from sklearn import svm, datasets
import matplotlib.pyplot as plt
import random
import copy
from collections import Counter
import lightgbm as lgb
from sklearn import svm
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import accuracy_score as acc
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import roc_curve, auc, f1_score
from scipy import interp
import gc

In [2]:
graphnames = ['Cora', 'Citeseer', 'Pubmed', 'HomoSapiens', 'Wikipedia', 'BlogCatalog']
graphname = graphnames[4]
emb_size = 25
hop = 2
data_dir = os.path.expanduser("../Graphs/"+graphname)

In [3]:
rwalk_path = data_dir + "/vectors/vectors_rwalk_reduced_" + str(emb_size) + "_hop_" + str(hop) + ".json"
with open(rwalk_path, "r") as read_file:
    rwalk_vectors = json.load(read_file)

In [4]:
ns_path = data_dir + "/vectors/vectors_nodesketch_reduced_" + str(emb_size) + "_hop_" + str(hop) + ".json"
with open(ns_path, "r") as read_file:
    ns_vectors = json.load(read_file)

In [5]:
minwise_path = data_dir + "/vectors/vectors_minwise_reduced_" + str(emb_size) + "_hop_" + str(hop) + ".json"
with open(minwise_path, "r") as read_file:
    minwise_vectors = json.load(read_file)

In [6]:
l1_path = data_dir + "/vectors/vectors_l1_reduced_" + str(emb_size) + "_hop_" + str(hop) + ".json"
with open(l1_path, "r") as read_file:
    l1_vectors = json.load(read_file)

In [7]:
l2_path = data_dir + "/vectors/vectors_l2_reduced_" + str(emb_size) + "_hop_" + str(hop) + ".json"
with open(l2_path, "r") as read_file:
    l2_vectors = json.load(read_file)

In [8]:
# convert all per-node samples to vectors with categorical features 
def vectors_to_df(path_edges, vectors, x, emb_size=emb_size):
    edges_path = data_dir + path_edges 
    key = list(vectors.keys())[0]
    features = []
    labels = []
    nodes = set()
    edgeset = set()
    print('emb size', emb_size)
    with open(edges_path, 'r') as f:
        for nr, edge in enumerate(f):
            if nr % 10000 == 0:
                print(nr)
            vector_edge = {}
            edge_split = edge.split(':')
            u = edge_split[0].strip()
            v = edge_split[1].strip()
            nodes.add(u)
            nodes.add(v)
            if u not in vectors or v not in vectors:
                continue
            if len(vectors[u]) < emb_size or len(vectors[v]) < emb_size:
                continue
            edgeset.add((u,v))
            for i in range(emb_size): 
                u_i = vectors[u][i] 
                vector_edge['f' + str(2*i)] = str(u_i[x])
                v_i = vectors[v][i]
                vector_edge['f' + str(2*i+1)] = str(v_i[x])
                # vector_edge['f' + str(i)] = int(u_i[x]==v_i[x])
            features.append(vector_edge)
            labels.append(1)
    nr_pos = len(labels)
    print('Number of positive examples', nr_pos)
    nodeslist = list(nodes)
    for l in range(4*nr):
        u = nodeslist[random.randint(0, len(nodeslist)-1)]
        v = nodeslist[random.randint(0, len(nodeslist)-1)]
        if (u, v) not in edgeset and (v, u) not in edgeset:
            if u not in vectors or v not in vectors:
                continue
            if len(vectors[u]) < emb_size or len(vectors[v]) < emb_size:
                continue
            vector_edge = {}
            for i in range(emb_size): 
                u_i = vectors[u][i]
                vector_edge['f' + str(2*i)] = str(u_i[x])
                v_i = vectors[v][i]
                vector_edge['f' + str(2*i+1)] = str(v_i[x])
                #vector_edge['f' + str(i)] = int(u_i[x]==v_i[x])
            features.append(vector_edge)
            labels.append(0)
    print('Number of negative examples', len(labels)-nr_pos)        
    return pd.DataFrame(features), labels

In [9]:
minwise_vectors

{'0': [['1313', 'label=0=', '1313'],
  ['4370', 'label=7=', '4370'],
  ['1674', 'label=0=', '1674'],
  ['2600', 'label=6=', '2600'],
  ['3783', 'label=0=', '3783'],
  ['2851', 'label=0=8=', '2851'],
  ['1963', 'label=5=8=', '1963'],
  ['3358', 'label=0=', '3358'],
  ['4482', 'label=6=', '4482'],
  ['2102', 'label=0=14=', '2102'],
  ['465', 'label=0=14=19=', '465'],
  ['3481', 'label=14=19=', '3481'],
  ['197', 'label=0=28=', '197'],
  ['4752', 'label=0=', '4752'],
  ['866', 'label=8=14=', '866'],
  ['1526', 'label=8=', '1526'],
  ['3720', 'label=0=', '3720'],
  ['4506', 'label=7=', '4506'],
  ['2318', 'label=0=', '2318'],
  ['3519', 'label=8=', '3519'],
  ['2221', 'label=0=14=', '2221'],
  ['1956', 'label=7=', '1956'],
  ['1375', 'label=0=6=', '1375'],
  ['1938', 'label=0=', '1938'],
  ['4162', 'label=0=', '4162']],
 '1': [['1313', 'label=0=', '1313'],
  ['4370', 'label=7=', '4370'],
  ['1674', 'label=0=', '1674'],
  ['2600', 'label=6=', '2600'],
  ['3783', 'label=0=', '3783'],
  ['285

In [10]:
l1_vectors

{'0': [['1402', 'label=3=', '820'],
  ['239', 'label=3=', '4'],
  ['5', 'label=3=', '60'],
  ['166', 'label=3=', '369'],
  ['3107', 'label=3=', '1788'],
  ['3396', 'label=3=', '26'],
  ['562', 'label=3=', '2329'],
  ['281', 'label=3=', '3'],
  ['1499', 'label=3=', '1149'],
  ['793', 'label=3=', '1865'],
  ['2657', 'label=3=', '1503'],
  ['2', 'label=3=', '26'],
  ['44', 'label=3=', '261'],
  ['1392', 'label=3=', '2127'],
  ['880', 'label=3=', '1815'],
  ['1168', 'label=3=', '0'],
  ['310', 'label=3=', '75'],
  ['40', 'label=3=', '26'],
  ['735', 'label=3=', '379'],
  ['3005', 'label=3=', '50'],
  ['1', 'label=3=', '1126'],
  ['11', 'label=3=', '1003'],
  ['925', 'label=3=', '54'],
  ['122', 'label=3=', '5'],
  ['299', 'label=3=', '59']],
 '1': [['438', 'label=16=', '820'],
  ['3572', 'label=16=', '4'],
  ['12', 'label=16=', '60'],
  ['166', 'label=16=', '4160'],
  ['343', 'label=16=', '1788'],
  ['4262', 'label=16=', '26'],
  ['562', 'label=16=', '1891'],
  ['4515', 'label=16=', '3'],


In [11]:
x = 2 # indicates what features to use, simply other nodes ids, the node labels or the words describing the nodes

In [12]:
paths = ['graph_nodes.txt', 'labels.txt', 'words_indices.txt']
features_path = data_dir + "/data/" + paths[x] 
features = []
with open(features_path, "r") as features_file:
    for f in features_file:
        features.append(f.strip())

In [13]:
features[:5]

['1169', '2048', '2248', '1916', '4515']

In [14]:
X_rwalk, y_rwalk = vectors_to_df(path_edges="/data/graph_edges_reduced.txt", vectors=rwalk_vectors, x=x)

emb size 25
0
10000
20000
30000
40000
50000
60000
70000
Number of positive examples 74013
Number of negative examples 294087


In [15]:
X_ns, y_ns = vectors_to_df(path_edges="/data/graph_edges_reduced.txt", vectors=ns_vectors, x=x)

emb size 25
0
10000
20000
30000
40000
50000
60000
70000
Number of positive examples 74013
Number of negative examples 294167


In [16]:
X_minwise, y_minwise = vectors_to_df(path_edges="/data/graph_edges_reduced.txt", vectors=minwise_vectors, x=x)

emb size 25
0
10000
20000
30000
40000
50000
60000
70000
Number of positive examples 74013
Number of negative examples 294223


In [17]:
X_minwise.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f40,f41,f42,f43,f44,f45,f46,f47,f48,f49
0,1313,1313,4370,4370,1674,1674,2600,2600,3783,3783,...,2221,2221,1956,1956,1375,1375,1938,1938,4162,4162
1,1313,1313,4370,4370,1674,1674,2600,2600,3783,3783,...,2221,2221,1956,1956,1375,1375,1938,1938,4162,4162
2,1313,1313,4370,4370,1674,1674,2600,2600,3783,3783,...,2221,2221,1956,1956,1375,1375,1938,1938,4162,4162
3,1313,1313,4370,4370,1674,1674,2600,2600,3783,3783,...,2221,2221,1956,1956,1375,1375,1938,1938,4162,4162
4,1313,1313,4370,4370,1674,1674,2600,2600,3783,3783,...,2221,2221,1956,1956,1375,1375,1938,1938,4162,4162


In [17]:
X_l1, y_l1 = vectors_to_df(path_edges="/data/graph_edges_reduced.txt", vectors=l1_vectors, x=x)

emb size 25
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
Number of positive examples 327303
Number of negative examples 1301088


In [18]:
X_l2, y_l2 = vectors_to_df(path_edges="/data/graph_edges_reduced.txt", vectors=l2_vectors, x=x)

emb size 25
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
Number of positive examples 327303
Number of negative examples 1301054


In [19]:
X_val_rwalk, y_val_rwalk = vectors_to_df(path_edges="/data/removed_edges.txt", vectors=rwalk_vectors, x=x)

emb size 25
0
Number of positive examples 6680
Number of negative examples 26694


In [20]:
X_val_ns, y_val_ns = vectors_to_df(path_edges="/data/removed_edges.txt", vectors=ns_vectors, x=x)

emb size 25
0
Number of positive examples 6680
Number of negative examples 26700


In [21]:
X_val_minwise, y_val_minwise = vectors_to_df(path_edges="/data/removed_edges.txt", vectors=minwise_vectors, x=x)

emb size 25
0
Number of positive examples 6680
Number of negative examples 26699


In [22]:
X_val_l1, y_val_l1 = vectors_to_df(path_edges="/data/removed_edges.txt", vectors=l1_vectors, x=x)

emb size 25
0
Number of positive examples 6680
Number of negative examples 26692


In [23]:
X_val_l2, y_val_l2 = vectors_to_df(path_edges="/data/removed_edges.txt", vectors=l2_vectors, x=x)

emb size 25
0
Number of positive examples 6680
Number of negative examples 26695


In [24]:
X_rwalk.shape, X_ns.shape, X_minwise.shape, X_l1.shape, X_l2.shape

((1628433, 50), (1628450, 50), (1628587, 50), (1628391, 50), (1628357, 50))

In [25]:
X_val_rwalk.shape, X_val_ns.shape, X_val_minwise.shape, X_val_l1.shape, X_val_l2.shape

((33374, 50), (33380, 50), (33379, 50), (33372, 50), (33375, 50))

In [26]:
X_val_l1.head(3)

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f40,f41,f42,f43,f44,f45,f46,f47,f48,f49
0,9515,9515,8458,8458,677,677,2807,2807,3158,3158,...,4839,8192,1644,1644,6816,6816,5625,5625,6660,7362
1,9515,9515,8458,8458,677,1076,2807,2807,3158,5244,...,4839,8012,1644,1644,6816,6816,5625,5356,6660,7362
2,9515,9515,8458,8458,677,1076,2807,2807,3158,3158,...,4839,8192,1644,1644,6816,6816,5625,8738,6660,7362


In [27]:
X_l1.head(3)

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f40,f41,f42,f43,f44,f45,f46,f47,f48,f49
0,9515,9515,8458,8458,1076,677,2529,2807,3158,3158,...,2695,4839,1644,1644,6816,6816,8738,5625,7362,6660
1,9515,9515,8458,3693,1076,1076,2529,2807,3158,3158,...,2695,4839,1644,1644,6816,6816,8738,5625,7362,6660
2,9515,9515,8458,8458,1076,677,2529,2807,3158,3158,...,2695,4839,1644,1644,6816,6816,8738,5625,7362,6660


In [28]:
def predict(model, X, label_encoder):
    X_local = copy.deepcopy(X)
    for c in X_local.columns: 
        X_c = [str(x) for x in X_local[c]]
        X_local.loc[:, c] = label_encoder.transform(X_c)
    return model.predict(X_local)    
    

In [29]:
def get_mean_acc(X_train, y_train, X_val, y_val, features, nr_iters, res_path):
    
    accs = []  
    f1s = []
    aucs = []
    label_encoder = LabelEncoder()
    label_encoder.fit(features)
    
    X_train = copy.deepcopy(X_train)
        #encoders = {}
    for c in X_train.columns: 
        X_train.loc[:, c] = label_encoder.transform([str(x) for x in X_train[c]])
            
    X_val = copy.deepcopy(X_val)
    for c in X_val.columns: 
        #label_encoder = encoders[c]
        X_val.loc[:, c] = label_encoder.transform([str(x) for x in X_val[c]])
    
    models = [] 
    for i in range(nr_iters):
        print('Iter', i)
        
        X_train_clf, X_test, y_train_clf, y_test =  train_test_split(X_train, y_train, train_size=0.5)
        clf =  tree.DecisionTreeClassifier() #max_leaf_nodes=100, random_state=0) 
        clf.fit(X_train_clf, y_train_clf)
        models.append(clf)
        y_pred = clf.predict(X_val)
#         model = lgbm_clf(X_train_clf, y_train_clf, X_test, y_test, label_encoder)
#         y_pred = model.predict(X_val) #predict(model, X_val, label_encoder)
        accuracy = acc(y_val, y_pred)
        accs.append(accuracy)
        
        f1 = f1_score(y_val, y_pred)
        f1s.append(f1)
        
        fpr, tpr, thresholds = roc_curve(y_val, y_pred)
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        print('AUC', roc_auc)
        print('F1', f1)
    print('Accuracy', np.mean(accs), np.std(accs))
    print('AUC', np.mean(aucs), np.std(aucs))
    print('F1', np.mean(f1s), np.std(f1s))
    
    
    with open(res_path + "_accuracy.txt", "w") as f:
        f.write(str(np.mean(accs)) + " " + str(np.std(accs)))
    with open(res_path + "_F1.txt", "w") as f:
        f.write(str(np.mean(f1s)) + " " + str(np.std(f1s)))
    with open(res_path + "_AUC.txt", "w") as f:
        f.write(str(np.mean(aucs)) + " " + str(np.std(aucs)))
        
    return models[0], label_encoder
        

    



In [30]:
nr_iters = 3

In [31]:
res_path = data_dir + "/results/rwalk_" + str(emb_size) + "_hop_" + str(hop) 
clf_rw, le = get_mean_acc(X_rwalk, y_rwalk, X_val_rwalk, y_val_rwalk, features, nr_iters, res_path)

Iter 0
AUC 0.7486227141132435
F1 0.5662203784021622
Iter 1
AUC 0.7548908981318101
F1 0.5733358288102814
Iter 2
AUC 0.7497253526213477
F1 0.566541822721598
Accuracy 0.7934120373144764 0.001291405536923595
AUC 0.7510796549554671 0.0027322925455624627
F1 0.5686993433113473 0.0032811156587350815


In [32]:
res_path = data_dir + "/results/nodesketch_" + str(emb_size) + "_hop_" + str(hop) 
get_mean_acc(X_ns, y_ns, X_val_ns, y_val_ns, features, nr_iters, res_path)

Iter 0
AUC 0.7594735809280315
F1 0.5756335877862596
Iter 1
AUC 0.7643386261185495
F1 0.581044957472661
Iter 2
AUC 0.7606705689744108
F1 0.5764319591390002
Accuracy 0.7921809466746556 0.000893065089487185
AUC 0.761494258673664 0.002069784967404009
F1 0.5777035014659736 0.0023851408606467062


(DecisionTreeClassifier(), LabelEncoder())

In [33]:
res_path = data_dir + "/results/minwise_" + str(emb_size) + "_hop_" + str(hop) 
clf_l0, le_l0 = get_mean_acc(X_minwise, y_minwise, X_val_minwise, y_val_minwise, features, nr_iters, res_path)

Iter 0
AUC 0.7427576286806139
F1 0.5405646691217498
Iter 1
AUC 0.7436939344652393
F1 0.542037254572915
Iter 2
AUC 0.7425138795034374
F1 0.5399777282850778
Accuracy 0.7535076944585918 0.0009270617626072486
AUC 0.7429884808830969 0.0005086596653702003
F1 0.5408598839932476 0.000866324034003481


In [34]:
res_path = data_dir + "/results/l1_" + str(emb_size) + "_hop_" + str(hop) 
clf_l1, le_l1 = get_mean_acc(X_l1, y_l1, X_val_l1, y_val_l1, features, nr_iters, res_path)

Iter 0
AUC 0.767678523516432
F1 0.5846898638426626
Iter 1
AUC 0.7719667064791442
F1 0.5917881764382893
Iter 2
AUC 0.7663680543902455
F1 0.5827742520398912
Accuracy 0.7956670262495505 0.002755933414995501
AUC 0.7686710947952738 0.0023909725511904196
F1 0.5864174307736144 0.0038773765951304094


In [35]:
res_path = data_dir + "/results/l2_" + str(emb_size) + "_hop_" + str(hop) 
clf_l2, le_l2 = get_mean_acc(X_l2, y_l2, X_val_l2, y_val_l2, features, nr_iters, res_path)

Iter 0
AUC 0.7804737453357006
F1 0.6026078479161588
Iter 1
AUC 0.7787891579642737
F1 0.6002922908293752
Iter 2
AUC 0.775868524236412
F1 0.5957059679767104
Accuracy 0.8027265917602996 0.0018116813732232274
AUC 0.7783771425121288 0.0019025128808916477
F1 0.5995353689074148 0.002868063780276214


# Interpretability

In [36]:
def decision_path(clf, label_encoder, X_test, y_test, sample_id):
    
    feature = clf.tree_.feature
    threshold = clf.tree_.threshold
    
    X_test = copy.deepcopy(X_test)
    for c in X_test.columns: 
        #label_encoder = encoders[c]
        X_test.loc[:, c] = label_encoder.transform([str(x) for x in X_test[c]])
    
    node_indicator = clf.decision_path(X_test)
    leaf_id = clf.apply(X_test)

    # obtain ids of the nodes `sample_id` goes through, i.e., row `sample_id`
    node_index = node_indicator.indices[node_indicator.indptr[sample_id]:
                                        node_indicator.indptr[sample_id + 1]]

    print('Rules used to predict sample {id} with class {cl}:\n'.format(id=sample_id, cl=y_test[sample_id]))
    for node_id in node_index:
        # continue to the next node if it is a leaf node
        if leaf_id[sample_id] == node_id:
            continue

        print(feature[node_id])    
        # check if value of the split feature for sample 0 is below threshold
        if (X_test.iloc[sample_id, feature[node_id]] <= threshold[node_id]):
            threshold_sign = "<="
        else:
            threshold_sign = ">"
        
        print("decision node {node} : (X_test[{sample}, {feature}] = {value}) "
              "{inequality} {threshold})".format(
                  node=node_id,
                  sample=sample_id,
                  feature=feature[node_id],
                  value=label_encoder.inverse_transform([X_test.iloc[sample_id, feature[node_id]]]),
                  inequality=threshold_sign,
                  threshold=threshold[node_id]))
    print("prediction", clf.predict(X_test.iloc[[sample_id]]))

In [37]:
# text_representation = tree.export_text(clf_rw)
# print(text_representation)

In [38]:
#decision_path(clf_l1, le_l1, X_val_l1, y_val_l1, 100)

In [39]:
#decision_path(clf_l1, le_l1, X_val_l1, y_val_l1, 180)

In [40]:
# text_representation = tree.export_text(clf_l1)
# print(text_representation)

In [41]:
def get_distr(X, y, label, indices):
    
    for idx in indices:
        X_label = X.iloc[[i for i, y_i in enumerate(y) if y_i==label], :]
        # X_neg = X.iloc[[i for i, y_i in enumerate(y) if y_i==0], :]
        print(X_label.shape)

        dict_label = Counter(X_label.iloc[:, idx])
        # dict_neg = Counter(X_neg.iloc[:, idx])

        top_pos = sorted([(cnt/X_label.shape[0], k) for k, cnt in dict_label.items()], reverse=True)[:10]
        plt.bar([i for i in range(len(top_pos))], [c for c, _ in top_pos], alpha=0.7)
        plt.xticks(np.arange(len(top_pos)), [n for cnt, n in top_pos], rotation='vertical')
        # plt.yscale('log')
        plt.ylim([0,0.28])
        plt.show()
    

In [42]:
# get_distr(X_rwalk, y_rwalk, label=1, indices=[2])

In [43]:
# get_distr(X_l2, y_l2, label=0, indices=[16, 17])

In [44]:
# get_distr(X_l2, y_l2, label=1, indices=[16, 17])

In [45]:
# text_representation = tree.export_text(clf_l2)
# print(text_representation)

In [46]:
# decision_path(clf_l2, le_l2, X_val_l2, y_val_l2, 1)

In [47]:
# decision_path(clf_l2, le_l2, X_val_l2, y_val_l2, 10000)

In [48]:
def get_mean(X, y, val):
    return np.mean(X.loc[[i for i in range(len(y)) if y[i]==val]].mean(axis=1))

In [49]:
# get_mean(X_rwalk, y_rwalk, 0), get_mean(X_rwalk, y_rwalk, 1)

In [50]:
# get_mean(X_minwise, y_minwise, 0), get_mean(X_minwise, y_minwise, 1)

In [51]:
# get_mean(X_val_minwise, y_val_minwise, 0), get_mean(X_val_minwise, y_val_minwise, 1)

In [52]:
# get_mean(X_ns, y_ns, 0), get_mean(X_ns, y_ns, 1)

In [53]:
# get_mean(X_l1, y_l1, 0), get_mean(X_l1, y_l1, 1)

In [54]:
# get_mean(X_l2, y_l2, 0), get_mean(X_l2, y_l2, 1)

In [55]:
def get_auc_scores(X, y, X_val, y_val, lgb_params, label_encoder, n_splits):
    X_local = copy.deepcopy(X)
    for c in X_local.columns: 
        X_c = [str(x) for x in X_local[c]]
        X_local.loc[:, c] = label_encoder.transform(X_c)
        
    X_val_local = copy.deepcopy(X_val)
    for c in X_val_local.columns: 
        X_c = [str(x) for x in X_val_local[c]]
        X_val_local.loc[:, c] = label_encoder.transform(X_c)
    
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)
        
    classifiers = []
    
    folds = StratifiedKFold(n_splits=n_splits, random_state=73, shuffle=True) 
    splits = folds.split(X_local, y)
    for fold_n, (train_index, test_index) in enumerate(splits):
        print('Fold', fold_n)
        X_train, X_test = X_local.iloc[train_index], X_local.iloc[test_index]
        y_train, y_test = np.array(y)[train_index], np.array(y)[test_index]
        print(X_train.shape, X_test.shape)
        dtrain = lgb.Dataset(X_train, label=y_train)
        dtest = lgb.Dataset(X_test, label=y_test)

        clf = lgb.train(lgb_params, dtrain, valid_sets=[dtrain, dtest], verbose_eval=1000)
        classifiers.append((clf, clf.best_iteration))

        y_pred = clf.predict(X_val_local, num_iteration=clf.best_iteration)
        fpr, tpr, thresholds = roc_curve(y_val, y_pred)
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        del X_train, X_test, y_train, y_test
        gc.collect()
        
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)


    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)

    return mean_auc, std_auc, mean_fpr, mean_tpr, tprs_lower, tprs_upper
    

In [56]:
def plot_roc_lgbm(X_rw, y_rw, X_rw_val, y_rw_val, \
                  X_mw, y_mw, X_mw_val, y_mw_val, \
                  X_l1, y_l1, X_l1_val, y_l1_val,\
                  features, name, exclude_pred = True, n_splits = 3):
    
    
    label_encoder = LabelEncoder()
    label_encoder.fit(features)
    nr_trees = 3000
    lgb_params = {'objective':'binary',
            'boosting_type':'gbdt',
            'metric':'auc',
            'n_jobs':4,
            'learning_rate':0.1,
            'tree_learner':'serial',
            'max_depth': -1,
            'n_estimators':nr_trees,
            'verbose':-1,
            'seed': 1}
    
    mean_auc_rw, std_auc_rw, mean_fpr_rw, mean_tpr_rw, tprs_lower_rw, tprs_upper_rw = \
                get_auc_scores(X_rw, y_rw, X_rw_val, y_rw_val, lgb_params, label_encoder, n_splits)
    
    lgb_params = {'objective':'binary',
            'boosting_type':'gbdt',
            'metric':'auc',
            'n_jobs':4,
            'learning_rate':0.1,
            'tree_learner':'serial',
            'max_depth': -1,
            'n_estimators':nr_trees,
            'verbose':-1,
            'seed': 1}
    
    mean_auc_mw, std_auc_mw, mean_fpr_mw, mean_tpr_mw, tprs_lower_mw, tprs_upper_mw = \
                get_auc_scores(X_mw, y_mw, X_mw_val, y_mw_val, lgb_params, label_encoder, n_splits)
    
    lgb_params = {'objective':'binary',
            'boosting_type':'gbdt',
            'metric':'auc',
            'n_jobs':4,
            'learning_rate':0.1,
            'tree_learner':'serial',
            'max_depth': -1,
            'n_estimators':nr_trees,
            'verbose':-1,
            'seed': 1}
    
    mean_auc_l1, std_auc_l1, mean_fpr_l1, mean_tpr_l1, tprs_lower_l1, tprs_upper_l1 = \
                get_auc_scores(X_l1, y_l1, X_l1_val, y_l1_val, lgb_params, label_encoder, n_splits)

    alpha = 0.7
    markersize=6
    markevery=5
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
             label='Chance', alpha=alpha)

    plt.plot(mean_fpr_rw, mean_tpr_rw, color='b',
             label=r'AUC RW = %0.3f $\pm$ %0.4f' % (mean_auc_rw, std_auc_rw),
             lw=1, marker='o', markersize=markersize,  markevery=markevery, alpha=alpha)
   

    plt.plot(mean_fpr_mw, mean_tpr_mw, color='green',
             label=r'AUC $L_0$ = %0.3f $\pm$ %0.4f' % (mean_auc_mw, std_auc_mw),
             lw=1, markevery=markevery, marker='x', markersize=markersize, alpha=alpha)

    
    plt.plot(mean_fpr_l1, mean_tpr_l1, color='orange',
             label=r'AUC $L_1$ = %0.3f $\pm$ %0.4f' % (mean_auc_l1, std_auc_l1),
             lw=1, marker='D', markersize=markersize, markevery=markevery, alpha=alpha)

    plt.xlim([-0.05, 1.2])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate', fontsize=14)
    plt.ylabel('True Positive Rate', fontsize=14)
    mytitle = name 
    plt.title(mytitle, fontsize=15)
    plt.legend(loc="lower right")
    plt.show()

In [57]:
# plot_roc_lgbm(X_rwalk, y_rwalk, X_val_rwalk, y_val_rwalk,
#               X_minwise, y_minwise, X_val_minwise, y_val_minwise,
#               X_l1, y_l1, X_val_l1, y_val_l1,
#               features, graphname +' ROC')

In [58]:
def lgbm_clf(X, y, X_test, y_test, label_encoder):
        
#         X_local = copy.deepcopy(X)
#         #encoders = {}
#         for c in X.columns: 
#             X_local.loc[:, c] = label_encoder.transform([str(x) for x in X_local[c]])
#             #encoders[c] = label_encoder
            
#         X_test_local = copy.deepcopy(X_test)
#         for c in X_test_local.columns: 
#             #label_encoder = encoders[c]
#             X_c = [str(x) for x in X_test_local[c]]
#             X_test_local.loc[:, c] = label_encoder.transform(X_c)
            
        lgb_params = {'objective':'binary',
            'boosting_type':'gbdt',
            'n_jobs':4,
            'learning_rate':0.1,
            'tree_learner':'serial',
            'max_depth': -1,
            'lambda_l1':10,
            'lambda_l2':10,
            'n_estimators':1000,
            'verbose':-1,
            'seed': 1}
        
        clf = lgb.LGBMClassifier( **lgb_params) 
        clf.fit(X, y, eval_set=[(X, y), (X_test, y_test)], early_stopping_rounds=100, verbose=1000)
        return clf