In [90]:
import pickle
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import pandas as pd
from tqdm.notebook import tqdm

The detailed data can be downloaded here: http://snap.stanford.edu/conflict/conflict_data.zip

In [3]:
DATA_DIR = './prediction'

In [56]:

# Parse the handcrafted-feature TSV into two dicts keyed by the first field of each line:
#   meta_features[key] -> float vector parsed from the comma-separated last field
#   meta_labels[key]   -> 1 when the second field is "burst", otherwise 0
meta_features, meta_labels = {}, {}
with open(DATA_DIR+"/detailed_data/handcrafted_features.tsv") as fp:
    for row in fp:
        fields = row.split()
        example_id = fields[0]
        meta_features[example_id] = np.array([float(value) for value in fields[-1].split(",")])
        meta_labels[example_id] = int(fields[1] == "burst")

In [9]:

# Map every id listed in full_ids.txt to its 0-based line number.
# That number is what later cells use as the row index into `all_embeds`.
with open(DATA_DIR + "/detailed_data/full_ids.txt") as fp:
    ids = {}
    for row_number, raw_line in enumerate(fp):
        ids[raw_line.strip()] = row_number


In [10]:

# loading the post embeddings from the LSTM 

# NOTE: pickle can execute arbitrary code while loading; only open files from a trusted source.
# The context manager closes the file handle, which the bare open() call never did.
with open(DATA_DIR + "/detailed_data/lstm_embeds-ids.pkl", 'rb') as fp:
    lstm_ids = pickle.load(fp)
# Map each id to its position in the pickled sequence (used as the row index into lstm_embeds).
lstm_ids = {post_id: row for row, post_id in enumerate(lstm_ids)}

In [21]:
# Read via DATA_DIR like the other loaders so the data location is configured in one place.
# NOTE(review): read_csv treats the first line as a header by default - confirm the file
# has one; otherwise every row would be shifted by one relative to the indices in `ids`.
all_embeds = pd.read_csv(DATA_DIR + '/detailed_data/all_embeds.csv').values

In [23]:
# Read via DATA_DIR like the other loaders so the data location is configured in one place.
# NOTE(review): read_csv treats the first line as a header by default - confirm the file
# has one; otherwise every row would be shifted by one relative to the indices in `lstm_ids`.
lstm_embeds = pd.read_csv(DATA_DIR + '/detailed_data/lstm_embeds.csv').values

In [25]:

# loading preprocessed lstm data to ensure identical train/val/test splits
# NOTE: pickle can execute arbitrary code while loading; only open files from a trusted source.
# Each file is opened in a `with` block so its handle is closed as soon as it has been read.
with open(DATA_DIR + "/preprocessed_train_data.pkl", 'rb') as fp:
    train_data = pickle.load(fp)
with open(DATA_DIR + "/preprocessed_val_data.pkl", 'rb') as fp:
    val_data = pickle.load(fp)
with open(DATA_DIR + "/preprocessed_test_data.pkl", 'rb') as fp:
    test_data = pickle.load(fp)

In [50]:
# flattening the preprocessed LSTM data (no need for minibatching here....)
def flatten(data):
    """Merge an iterable of minibatches into flat, per-example lists.

    Every batch must unpack into seven fields:
    ``(ids, text, users, subreddits, lengths, sfs, labels)``.
    ``ids`` holds byte strings that are decoded as UTF-8; ``text``, ``users`` and
    ``subreddits`` must offer ``.numpy().tolist()``; ``lengths`` and ``labels`` are
    plain iterables.

    Returns
    -------
    tuple
        ``(ids, text, users, subreddits, lengths, labels)`` - six lists, in batch
        order. The ``sfs`` field is not part of the result, so it is no longer
        accumulated.
    """
    flat_ids, text, users, subreddits, lengths, labels = [], [], [], [], [], []
    for bids, btext, busers, bsubreddits, blengths, _bsfs, blabels in data:
        flat_ids.extend(x.decode('utf-8') for x in bids)
        text.extend(btext.numpy().tolist())
        users.extend(busers.numpy().tolist())
        subreddits.extend(bsubreddits.numpy().tolist())
        lengths.extend(blengths)
        labels.extend(blabels)
    return (flat_ids, text, users, subreddits, lengths, labels)
flat_train_data = flatten(train_data)
flat_val_data = flatten(val_data)
flat_test_data = flatten(test_data)

In [39]:
# Decode byte-string keys of lstm_ids to str so they can be looked up with the decoded example ids.
# Keys that are already str are kept unchanged, so this cell can be re-run after `lstm_ids`
# has been replaced by the decoded dict (previously that raised AttributeError).
lst_ids_parse = [(x.decode('utf-8') if isinstance(x, bytes) else x, y) for x, y in lstm_ids.items()]

In [43]:
lstm_ids = dict(lst_ids_parse)

In [57]:

# Choose the example ids for each split once so the X and Y rows always line up.
# Previously train_X used every training id while train_Y skipped ids without
# handcrafted features; both now apply the same `in meta_features` filter.
train_ids = [i for i in flat_train_data[0] if i in meta_features]
val_ids = [i for i in flat_val_data[0] if i in meta_features]
test_ids = [i for i in flat_test_data[0] if i in meta_features]

# Feature rows: handcrafted features, then the all_embeds row, then the lstm_embeds row.
train_X = np.stack([np.concatenate([meta_features[i], all_embeds[ids[i]], lstm_embeds[lstm_ids[i]]]) for i in train_ids])
val_X = np.stack([np.concatenate([meta_features[i], all_embeds[ids[i]], lstm_embeds[lstm_ids[i]]]) for i in val_ids])
test_X = np.stack([np.concatenate([meta_features[i], all_embeds[ids[i]], lstm_embeds[lstm_ids[i]]]) for i in test_ids])

# Binary labels (1 = "burst") in the same row order as the feature matrices.
train_Y = np.stack([meta_labels[i] for i in train_ids])
val_Y = np.stack([meta_labels[i] for i in val_ids])
test_Y = np.stack([meta_labels[i] for i in test_ids])

In [58]:
# First we run the Random Forest with only the metadata/handcrafted features...
# n_jobs=-1 uses every available core instead of requesting a fixed 100 workers,
# matching the setting used by the later model cells.
baseline_mod = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=0)
# note that the first 263 features are the handcrafted ones... 
baseline_mod.fit(train_X[:, :263], train_Y)

RandomForestClassifier(n_estimators=500, n_jobs=100, random_state=0)

In [60]:
# For reference, on the authors server we get 0.682
print(roc_auc_score(val_Y, baseline_mod.predict_proba(val_X[:, :263])[:,1]))

0.6823914377654168


In [61]:
roc_auc_score(test_Y, baseline_mod.predict_proba(test_X[:, :263])[:,1])


0.6657555806104909

In [62]:

# Now fit a Random Forest on the full train_X matrix: every feature column is used,
# not just the handcrafted ones.
# n_jobs=-1 uses every available core instead of requesting a fixed 100 workers,
# matching the setting used by the later model cells.
ensemble_mod = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=0)
ensemble_mod.fit(train_X, train_Y)

RandomForestClassifier(n_estimators=500, n_jobs=100, random_state=0)

In [64]:
roc_auc_score(val_Y, ensemble_mod.predict_proba(val_X[:, :])[:,1])


0.7555938981870011

In [67]:
df = pd.read_csv('soc-redditHyperlinks-body.tsv',sep='\t')

In [162]:
df_emb = pd.read_csv('embeddings.csv')

In [163]:
df_emb = df_emb.set_index('subreddit')

In [164]:
# Derive the lookup key for each post: ids that are exactly 7 characters long lose
# their last character, every other id is kept unchanged.
df['post_id_cropped'] = [
    post_id[:-1] if len(post_id) == 7 else post_id
    for post_id in df['POST_ID']
]

In [135]:
df_post_idx = df.set_index('post_id_cropped')

In [153]:
df_emb

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
subreddit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
leagueoflegends,-2.581650,4.197357,-0.414158,-4.637616,-1.270464,0.721880,3.302603,3.852562,0.082770,0.855891,...,-1.309510,2.200328,-0.537800,-0.944857,-3.069158,3.882328,4.642304,-4.706831,-1.937072,6.749583
theredlion,-5.078587,3.187933,1.306380,3.390276,0.009027,2.208565,0.663524,-2.170339,-3.299234,4.855823,...,-0.704552,-1.988924,1.746807,1.682895,0.917564,-2.376134,1.840948,1.383271,-2.815420,-0.156633
inlandempire,1.787371,1.144207,-1.939735,-0.874147,-1.687357,-1.447701,-2.463367,0.047332,3.669208,-5.642140,...,1.381191,1.326390,1.959474,-0.830282,-8.217797,-0.690313,-2.515662,0.354737,-0.242708,-0.834380
nfl,1.324327,2.414587,2.463422,1.304439,5.792188,2.861849,4.123739,1.847484,-2.223504,0.842010,...,2.977067,-0.718431,-0.049687,-0.443616,2.430853,-0.791676,0.064785,-1.367642,0.383723,5.842690
playmygame,3.036528,0.657404,1.333009,-2.353914,-3.739832,-1.316537,0.128526,0.776541,-3.212834,-0.607011,...,-1.821251,-0.415504,-3.183492,3.704080,2.105642,-5.455093,1.901566,1.531182,-0.390700,4.803727
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
mcchallenges,1.088152,-1.052172,-0.081496,0.599227,0.587108,-0.421348,-0.328897,-0.444257,0.726301,-0.338954,...,-0.529689,1.102415,-0.329409,0.948866,-0.232821,-1.136177,0.209243,0.079921,1.184984,-0.558518
zoomies,0.745022,0.203817,-1.079956,-1.680853,1.577193,-0.841061,-0.916199,0.172354,0.699129,0.416368,...,0.186617,0.441517,1.469669,0.824215,1.754825,-0.493108,0.523124,0.050339,0.626424,-1.088371
asshole,-0.537883,-0.661029,0.098713,-0.808670,0.923758,-0.627248,-0.254959,0.508056,0.135096,-0.354623,...,-0.117812,-0.571725,-0.966365,0.951634,0.411589,-0.488988,0.083463,1.102945,-1.133330,-0.873870
dildohero,-0.249038,-0.996826,0.725330,-0.713229,-0.866005,-0.326092,-0.751363,-1.660205,1.693578,-0.749767,...,0.024560,0.254378,0.024955,0.659600,0.813747,-1.120681,-0.736590,-0.005955,0.421525,-1.350236


In [175]:
# Inline lookup of the source-subreddit embedding for every training post id.
# NOTE(review): the `get_gnn_embeddings` function defined further down repeats this
# logic, and `train_gnn_embs` / `not_found_ids_train` are reassigned from it later,
# so this cell looks like an earlier draft.
train_gnn_embs = list()
not_found_ids_train = list()
for x in tqdm(flat_train_data[0]):
    # NOTE(review): an id missing from df_post_idx would raise KeyError here, outside the try below.
    chunk = df_post_idx.loc[x]
    if not isinstance(chunk,pd.core.series.Series):
        # .loc did not return a single row (presumably a DataFrame for a repeated id): use the first row
        source = chunk.SOURCE_SUBREDDIT.iloc[0]

    else:
        source = chunk.SOURCE_SUBREDDIT
    try:
        train_gnn_embs.append(df_emb.loc[source].values.tolist())
    except KeyError:
        # no row for this subreddit in df_emb: remember the post id instead
        not_found_ids_train.append(x)
        
        

HBox(children=(FloatProgress(value=0.0, max=93696.0), HTML(value='')))




In [195]:
def get_gnn_embeddings(ids_list):
    """Look up the source-subreddit embedding for every post id in ``ids_list``.

    Each id is resolved to its ``SOURCE_SUBREDDIT`` through the notebook-level
    ``df_post_idx`` frame, and that subreddit is then looked up in the
    notebook-level ``df_emb`` frame.

    Parameters
    ----------
    ids_list : iterable
        Post ids, given as labels of the ``df_post_idx`` index.

    Returns
    -------
    (dict, list)
        ``{post_id: embedding row as a list}`` for every id that could be resolved,
        and the ids that could not be resolved, in input order. An id is unresolved
        when either lookup raises ``KeyError`` - previously an id missing from
        ``df_post_idx`` aborted the whole pass instead of being recorded.
    """
    gnn_embs = {}
    not_found_ids = []
    for post_id in tqdm(ids_list):
        try:
            chunk = df_post_idx.loc[post_id]
            if isinstance(chunk, pd.Series):
                source = chunk.SOURCE_SUBREDDIT
            else:
                # An id that labels several rows comes back as a frame; use its first row.
                source = chunk.SOURCE_SUBREDDIT.iloc[0]
            embedding = df_emb.loc[source].values.tolist()
        except KeyError:
            not_found_ids.append(post_id)
        else:
            gnn_embs[post_id] = embedding
    return gnn_embs, not_found_ids

In [196]:
train_gnn_embs,not_found_ids_train = get_gnn_embeddings(flat_train_data[0])

HBox(children=(FloatProgress(value=0.0, max=93696.0), HTML(value='')))




In [198]:
val_gnn_embs,not_found_ids_val = get_gnn_embeddings(flat_val_data[0])

HBox(children=(FloatProgress(value=0.0, max=11264.0), HTML(value='')))




In [199]:
test_gnn_embs,not_found_ids_test = get_gnn_embeddings(flat_test_data[0])

HBox(children=(FloatProgress(value=0.0, max=11264.0), HTML(value='')))




In [184]:

# Choose the example ids for each split once so the X and Y rows always line up.
# An id is kept only if it has handcrafted features and is not in the split's
# "not found" list. Previously train_X skipped the `in meta_features` test that
# train_Y applied. The lists are turned into sets so each membership test is O(1).
skip_train, skip_val, skip_test = set(not_found_ids_train), set(not_found_ids_val), set(not_found_ids_test)
train_ids = [i for i in flat_train_data[0] if i in meta_features and i not in skip_train]
val_ids = [i for i in flat_val_data[0] if i in meta_features and i not in skip_val]
test_ids = [i for i in flat_test_data[0] if i in meta_features and i not in skip_test]

# Feature rows: handcrafted features, then the all_embeds row, then the lstm_embeds row.
train_X = np.stack([np.concatenate([meta_features[i], all_embeds[ids[i]], lstm_embeds[lstm_ids[i]]]) for i in train_ids])
val_X = np.stack([np.concatenate([meta_features[i], all_embeds[ids[i]], lstm_embeds[lstm_ids[i]]]) for i in val_ids])
test_X = np.stack([np.concatenate([meta_features[i], all_embeds[ids[i]], lstm_embeds[lstm_ids[i]]]) for i in test_ids])

# Binary labels (1 = "burst") in the same row order as the feature matrices.
train_Y = np.stack([meta_labels[i] for i in train_ids])
val_Y = np.stack([meta_labels[i] for i in val_ids])
test_Y = np.stack([meta_labels[i] for i in test_ids])

In [185]:
# First we run the Random Forest with only the metadata/handcrafted features...
baseline_mod = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=0)
# note that the first 263 features are the handcrafted ones... 
baseline_mod.fit(train_X[:, :263], train_Y)

RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=0)

In [186]:
print(roc_auc_score(val_Y, baseline_mod.predict_proba(val_X[:, :263])[:,1]))

0.6800026501149354


In [187]:
roc_auc_score(test_Y, baseline_mod.predict_proba(test_X[:, :263])[:,1])


0.6669488484136848

In [188]:

# Fit a Random Forest on the full train_X matrix - every feature column is used, no slicing.
ensemble_mod = RandomForestClassifier(
    n_estimators=500,
    n_jobs=-1,
    random_state=0,
)
ensemble_mod.fit(train_X, train_Y)

RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=0)

In [190]:
roc_auc_score(val_Y, ensemble_mod.predict_proba(val_X[:, :])[:,1])


0.7500665926317096

In [200]:

# Choose the example ids for each split once so the X and Y rows always line up.
# An id is kept only if it has handcrafted features and is not in the split's
# "not found" list. Previously train_X skipped the `in meta_features` test that
# train_Y applied. The lists are turned into sets so each membership test is O(1).
skip_train, skip_val, skip_test = set(not_found_ids_train), set(not_found_ids_val), set(not_found_ids_test)
train_ids = [i for i in flat_train_data[0] if i in meta_features and i not in skip_train]
val_ids = [i for i in flat_val_data[0] if i in meta_features and i not in skip_val]
test_ids = [i for i in flat_test_data[0] if i in meta_features and i not in skip_test]

# Feature rows: handcrafted features, all_embeds row, lstm_embeds row, then the GNN embedding.
train_X = np.stack([np.concatenate([meta_features[i], all_embeds[ids[i]], lstm_embeds[lstm_ids[i]], train_gnn_embs[i]]) for i in train_ids])
val_X = np.stack([np.concatenate([meta_features[i], all_embeds[ids[i]], lstm_embeds[lstm_ids[i]], val_gnn_embs[i]]) for i in val_ids])
test_X = np.stack([np.concatenate([meta_features[i], all_embeds[ids[i]], lstm_embeds[lstm_ids[i]], test_gnn_embs[i]]) for i in test_ids])

# Binary labels (1 = "burst") in the same row order as the feature matrices.
train_Y = np.stack([meta_labels[i] for i in train_ids])
val_Y = np.stack([meta_labels[i] for i in val_ids])
test_Y = np.stack([meta_labels[i] for i in test_ids])

In [204]:

# Fit a Random Forest on the full train_X matrix - every feature column is used, no slicing.
ensemble_mod = RandomForestClassifier(
    n_estimators=500,
    n_jobs=-1,
    random_state=0,
)
ensemble_mod.fit(train_X, train_Y)

RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=0)

In [205]:
roc_auc_score(val_Y, ensemble_mod.predict_proba(val_X[:, :])[:,1])


0.7498650674080517

In [206]:

# Same fit on the full train_X matrix, this time with 700 trees instead of 500.
ensemble_mod = RandomForestClassifier(
    n_estimators=700,
    n_jobs=-1,
    random_state=0,
)
ensemble_mod.fit(train_X, train_Y)

RandomForestClassifier(n_estimators=700, n_jobs=-1, random_state=0)

In [207]:
roc_auc_score(val_Y, ensemble_mod.predict_proba(val_X[:, :])[:,1])


0.7512728317594433

In [208]:

# Choose the example ids for each split once so the X and Y rows always line up.
# An id is kept only if it has handcrafted features and is not in the split's
# "not found" list. Previously train_X skipped the `in meta_features` test that
# train_Y applied. The lists are turned into sets so each membership test is O(1).
skip_train, skip_val, skip_test = set(not_found_ids_train), set(not_found_ids_val), set(not_found_ids_test)
train_ids = [i for i in flat_train_data[0] if i in meta_features and i not in skip_train]
val_ids = [i for i in flat_val_data[0] if i in meta_features and i not in skip_val]
test_ids = [i for i in flat_test_data[0] if i in meta_features and i not in skip_test]

# Feature rows: handcrafted features, then the all_embeds row, then the GNN embedding.
train_X = np.stack([np.concatenate([meta_features[i], all_embeds[ids[i]], train_gnn_embs[i]]) for i in train_ids])
val_X = np.stack([np.concatenate([meta_features[i], all_embeds[ids[i]], val_gnn_embs[i]]) for i in val_ids])
test_X = np.stack([np.concatenate([meta_features[i], all_embeds[ids[i]], test_gnn_embs[i]]) for i in test_ids])

# Binary labels (1 = "burst") in the same row order as the feature matrices.
train_Y = np.stack([meta_labels[i] for i in train_ids])
val_Y = np.stack([meta_labels[i] for i in val_ids])
test_Y = np.stack([meta_labels[i] for i in test_ids])

In [209]:

# Fit a Random Forest on the full train_X matrix - every feature column is used, no slicing.
ensemble_mod = RandomForestClassifier(
    n_estimators=500,
    n_jobs=-1,
    random_state=0,
)
ensemble_mod.fit(train_X, train_Y)

RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=0)

In [210]:
roc_auc_score(val_Y, ensemble_mod.predict_proba(val_X[:, :])[:,1])


0.7597763031187873

In [211]:

# Choose the example ids for each split once so the X and Y rows always line up.
# An id is kept only if it has handcrafted features and is not in the split's
# "not found" list. Previously train_X skipped the `in meta_features` test that
# train_Y applied. The lists are turned into sets so each membership test is O(1).
skip_train, skip_val, skip_test = set(not_found_ids_train), set(not_found_ids_val), set(not_found_ids_test)
train_ids = [i for i in flat_train_data[0] if i in meta_features and i not in skip_train]
val_ids = [i for i in flat_val_data[0] if i in meta_features and i not in skip_val]
test_ids = [i for i in flat_test_data[0] if i in meta_features and i not in skip_test]

# Feature rows: handcrafted features followed by the GNN embedding.
train_X = np.stack([np.concatenate([meta_features[i], train_gnn_embs[i]]) for i in train_ids])
val_X = np.stack([np.concatenate([meta_features[i], val_gnn_embs[i]]) for i in val_ids])
test_X = np.stack([np.concatenate([meta_features[i], test_gnn_embs[i]]) for i in test_ids])

# Binary labels (1 = "burst") in the same row order as the feature matrices.
train_Y = np.stack([meta_labels[i] for i in train_ids])
val_Y = np.stack([meta_labels[i] for i in val_ids])
test_Y = np.stack([meta_labels[i] for i in test_ids])

In [212]:

# Fit a Random Forest on the full train_X matrix - every feature column is used, no slicing.
ensemble_mod = RandomForestClassifier(
    n_estimators=500,
    n_jobs=-1,
    random_state=0,
)
ensemble_mod.fit(train_X, train_Y)

RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=0)

In [213]:
roc_auc_score(val_Y, ensemble_mod.predict_proba(val_X[:, :])[:,1])


0.7523408863226888

In [214]:

# Choose the example ids for each split once so the X and Y rows always line up.
# An id is kept only if it has a handcrafted-feature entry (and therefore a label)
# and is not in the split's "not found" list. Previously train_X skipped the
# `in meta_features` test that train_Y applied, so the two could differ in length.
# The lists are turned into sets so each membership test is O(1).
skip_train, skip_val, skip_test = set(not_found_ids_train), set(not_found_ids_val), set(not_found_ids_test)
train_ids = [i for i in flat_train_data[0] if i in meta_features and i not in skip_train]
val_ids = [i for i in flat_val_data[0] if i in meta_features and i not in skip_val]
test_ids = [i for i in flat_test_data[0] if i in meta_features and i not in skip_test]

# Feature rows: the GNN embedding only.
train_X = np.stack([train_gnn_embs[i] for i in train_ids])
val_X = np.stack([val_gnn_embs[i] for i in val_ids])
test_X = np.stack([test_gnn_embs[i] for i in test_ids])

# Binary labels (1 = "burst") in the same row order as the feature matrices.
train_Y = np.stack([meta_labels[i] for i in train_ids])
val_Y = np.stack([meta_labels[i] for i in val_ids])
test_Y = np.stack([meta_labels[i] for i in test_ids])

In [215]:

# Fit a Random Forest on the full train_X matrix - every feature column is used, no slicing.
ensemble_mod = RandomForestClassifier(
    n_estimators=500,
    n_jobs=-1,
    random_state=0,
)
ensemble_mod.fit(train_X, train_Y)

RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=0)

In [216]:
roc_auc_score(val_Y, ensemble_mod.predict_proba(val_X[:, :])[:,1])


0.7169973324117793

In [218]:
lstm_embeds.shape

(256723, 900)

In [219]:
df_emb.values.shape

(34671, 64)

In [223]:

# Choose the example ids for each split once so the X and Y rows always line up.
# An id is kept only if it has a handcrafted-feature entry (and therefore a label)
# and is not in the split's "not found" list. Previously train_X skipped the
# `in meta_features` test that train_Y applied, so the two could differ in length.
# The lists are turned into sets so each membership test is O(1).
skip_train, skip_val, skip_test = set(not_found_ids_train), set(not_found_ids_val), set(not_found_ids_test)
train_ids = [i for i in flat_train_data[0] if i in meta_features and i not in skip_train]
val_ids = [i for i in flat_val_data[0] if i in meta_features and i not in skip_val]
test_ids = [i for i in flat_test_data[0] if i in meta_features and i not in skip_test]

# Feature rows: the lstm_embeds row only.
train_X = np.stack([lstm_embeds[lstm_ids[i]] for i in train_ids])
val_X = np.stack([lstm_embeds[lstm_ids[i]] for i in val_ids])
test_X = np.stack([lstm_embeds[lstm_ids[i]] for i in test_ids])

# Binary labels (1 = "burst") in the same row order as the feature matrices.
train_Y = np.stack([meta_labels[i] for i in train_ids])
val_Y = np.stack([meta_labels[i] for i in val_ids])
test_Y = np.stack([meta_labels[i] for i in test_ids])

In [224]:

# Fit a Random Forest on the full train_X matrix - every feature column is used, no slicing.
ensemble_mod = RandomForestClassifier(
    n_estimators=500,
    n_jobs=-1,
    random_state=0,
)
ensemble_mod.fit(train_X, train_Y)

RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=0)

In [225]:
roc_auc_score(val_Y, ensemble_mod.predict_proba(val_X[:, :])[:,1])


0.49259302582939857

In [226]:

# Choose the example ids for each split once so the X and Y rows always line up.
# An id is kept only if it has a handcrafted-feature entry (and therefore a label)
# and is not in the split's "not found" list. Previously train_X skipped the
# `in meta_features` test that train_Y applied, so the two could differ in length.
# The lists are turned into sets so each membership test is O(1).
skip_train, skip_val, skip_test = set(not_found_ids_train), set(not_found_ids_val), set(not_found_ids_test)
train_ids = [i for i in flat_train_data[0] if i in meta_features and i not in skip_train]
val_ids = [i for i in flat_val_data[0] if i in meta_features and i not in skip_val]
test_ids = [i for i in flat_test_data[0] if i in meta_features and i not in skip_test]

# Feature rows: the all_embeds row followed by the GNN embedding.
train_X = np.stack([np.concatenate([all_embeds[ids[i]], train_gnn_embs[i]]) for i in train_ids])
val_X = np.stack([np.concatenate([all_embeds[ids[i]], val_gnn_embs[i]]) for i in val_ids])
test_X = np.stack([np.concatenate([all_embeds[ids[i]], test_gnn_embs[i]]) for i in test_ids])

# Binary labels (1 = "burst") in the same row order as the feature matrices.
train_Y = np.stack([meta_labels[i] for i in train_ids])
val_Y = np.stack([meta_labels[i] for i in val_ids])
test_Y = np.stack([meta_labels[i] for i in test_ids])

In [227]:

# Fit a Random Forest on the full train_X matrix - every feature column is used, no slicing.
ensemble_mod = RandomForestClassifier(
    n_estimators=500,
    n_jobs=-1,
    random_state=0,
)
ensemble_mod.fit(train_X, train_Y)

RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=0)

In [228]:
roc_auc_score(val_Y, ensemble_mod.predict_proba(val_X[:, :])[:,1])


0.7449116337133449

In [None]:
# NOTE(review): scratch expression listing the four per-example feature sources.
# The cell has no execution count (In [None]) and `i` is not assigned at notebook
# scope in the visible cells, so it would presumably raise NameError under
# Restart & Run All - consider removing it.
[meta_features[i], all_embeds[ids[i]], lstm_embeds[lstm_ids[i]], train_gnn_embs[i]]


In [229]:

# Choose the example ids for each split once so the X and Y rows always line up.
# An id is kept only if it has a handcrafted-feature entry (and therefore a label)
# and is not in the split's "not found" list. Previously train_X skipped the
# `in meta_features` test that train_Y applied, so the two could differ in length.
# The lists are turned into sets so each membership test is O(1).
skip_train, skip_val, skip_test = set(not_found_ids_train), set(not_found_ids_val), set(not_found_ids_test)
train_ids = [i for i in flat_train_data[0] if i in meta_features and i not in skip_train]
val_ids = [i for i in flat_val_data[0] if i in meta_features and i not in skip_val]
test_ids = [i for i in flat_test_data[0] if i in meta_features and i not in skip_test]

# Feature rows: the all_embeds row only.
train_X = np.stack([all_embeds[ids[i]] for i in train_ids])
val_X = np.stack([all_embeds[ids[i]] for i in val_ids])
test_X = np.stack([all_embeds[ids[i]] for i in test_ids])

# Binary labels (1 = "burst") in the same row order as the feature matrices.
train_Y = np.stack([meta_labels[i] for i in train_ids])
val_Y = np.stack([meta_labels[i] for i in val_ids])
test_Y = np.stack([meta_labels[i] for i in test_ids])


# Fit a Random Forest on the full train_X matrix and report the validation ROC AUC
# from the second predict_proba column.
ensemble_mod = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=0)
ensemble_mod.fit(train_X, train_Y)
roc_auc_score(val_Y, ensemble_mod.predict_proba(val_X)[:, 1])


0.744973129970179

In [233]:
# Choose the example ids for each split once so the X and Y rows always line up.
# An id is kept only if it has a handcrafted-feature entry (and therefore a label)
# and is not in the split's "not found" list. Previously train_X skipped the
# `in meta_features` test that train_Y applied, so the two could differ in length.
# The lists are turned into sets so each membership test is O(1).
skip_train, skip_val, skip_test = set(not_found_ids_train), set(not_found_ids_val), set(not_found_ids_test)
train_ids = [i for i in flat_train_data[0] if i in meta_features and i not in skip_train]
val_ids = [i for i in flat_val_data[0] if i in meta_features and i not in skip_val]
test_ids = [i for i in flat_test_data[0] if i in meta_features and i not in skip_test]

# Feature rows: the all_embeds row followed by the lstm_embeds row.
train_X = np.stack([np.concatenate([all_embeds[ids[i]], lstm_embeds[lstm_ids[i]]]) for i in train_ids])
val_X = np.stack([np.concatenate([all_embeds[ids[i]], lstm_embeds[lstm_ids[i]]]) for i in val_ids])
test_X = np.stack([np.concatenate([all_embeds[ids[i]], lstm_embeds[lstm_ids[i]]]) for i in test_ids])

# Binary labels (1 = "burst") in the same row order as the feature matrices.
train_Y = np.stack([meta_labels[i] for i in train_ids])
val_Y = np.stack([meta_labels[i] for i in val_ids])
test_Y = np.stack([meta_labels[i] for i in test_ids])


# Fit a Random Forest on the full train_X matrix and report the validation ROC AUC
# from the second predict_proba column.
ensemble_mod = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=0)
ensemble_mod.fit(train_X, train_Y)
roc_auc_score(val_Y, ensemble_mod.predict_proba(val_X)[:, 1])


0.7440297667122266

## Level 1 Datasets
Dataset-1 D<sub>1,1</sub> = Only Meta-Features

Dataset-2 D<sub>1,2</sub> = All embeddings

Dataset-3 D<sub>1,3</sub> = LSTM-Embeddings

Dataset-4 D<sub>1,4</sub> = Node2Vec-Embeddings

## Level 2 Datasets
Dataset-5 D<sub>2,3</sub> = All embeddings + LSTM-Embeddings

Dataset-6 D<sub>2,4</sub> = All embeddings + Node2Vec-Embeddings

## Level 3 Datasets
Dataset-7 D<sub>3,3</sub> = All embeddings + Meta-Features + LSTM-Embeddings

Dataset-8 D<sub>3,4</sub> = All embeddings + Meta-Features + Node2Vec-Embeddings

## Complete
Dataset-9 D<sub>4,3</sub> = All embeddings + Meta-Features + Node2Vec-Embeddings + LSTM-Embeddings


| Dataset | ROC_AUC | Contains Graph Embedding |
|---|---|---|
|Level 1|||
|Meta-Features| 0.6800 | No |
|All embeddings| **0.7450** | No |
|LSTM-Embeddings| 0.4926 | No |
|Node2Vec-Embeddings| 0.7170 | Yes |
|Level 2|||
|All embeddings + LSTM-Embeddings| 0.7440 | No |
|All embeddings + Node2Vec-Embeddings| 0.7449 | Yes |
|Meta-Features + Node2Vec-Embeddings| **0.7523** | Yes |
|Level 3|||
|All embeddings + Meta-Features + LSTM-Embeddings| 0.7501 | No |
|All embeddings + Meta-Features + Node2Vec-Embeddings| **0.7598** | Yes |
|Level 4|||
|All available datasets| 0.7499 | Yes |