In [3]:
import pickle
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import pandas as pd
from tqdm.notebook import tqdm

In [4]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
cd drive/MyDrive/gnn

The detailed data can be downloaded here: http://snap.stanford.edu/conflict/conflict_data.zip

In [15]:
DATA_DIR = './prediction'

In [16]:
# Parse the handcrafted-feature file into two dicts keyed by the first field of
# each line: the comma-separated last field becomes a float vector, and the
# second field becomes a binary label (1 for "burst", 0 otherwise).
meta_features, meta_labels = {}, {}
with open(DATA_DIR+"/detailed_data/handcrafted_features.tsv") as fp:
    for line in fp:
        info = line.split()
        key = info[0]
        meta_features[key] = np.array([float(v) for v in info[-1].split(",")])
        meta_labels[key] = int(info[1] == "burst")

In [17]:
# Map every id listed in full_ids.txt to its 0-based line number; later cells
# use that number as the row index into the full embedding matrix.
with open(DATA_DIR + "/detailed_data/full_ids.txt") as fp:
    ids = {}
    for row, raw_id in enumerate(fp):
        ids[raw_id.strip()] = row


In [18]:
# loading the ids of the post embeddings from the LSTM
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(DATA_DIR + "/detailed_data/lstm_embeds-ids.pkl", 'rb') as fp:
    lstm_id_list = pickle.load(fp)
# id -> position of that id in the pickled list
lstm_ids = {id_: i for i, id_ in enumerate(lstm_id_list)}

In [19]:
# Use DATA_DIR like the other loaders so the data location is configured in one place.
full_embeds_array = np.load(DATA_DIR + '/detailed_data/full_embeds.npy')
full_embeds = pd.DataFrame(full_embeds_array)

In [20]:
# Use DATA_DIR like the other loaders so the data location is configured in one place.
lstm_embeds_array = np.load(DATA_DIR + '/detailed_data/lstm_embeds.npy')
lstm_embeds = pd.DataFrame(lstm_embeds_array)

In [21]:

# loading preprocessed lstm data to ensure identical train/val/test splits
# (context managers close each file; the bare open() calls leaked the handles)
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(DATA_DIR + "/preprocessed_train_data.pkl", 'rb') as fp:
    train_data = pickle.load(fp)
with open(DATA_DIR + "/preprocessed_val_data.pkl", 'rb') as fp:
    val_data = pickle.load(fp)
with open(DATA_DIR + "/preprocessed_test_data.pkl", 'rb') as fp:
    test_data = pickle.load(fp)

In [22]:
# flattening the preprocessed LSTM data (no need for minibatching here....)
def flatten(data):
    """Concatenate minibatches into flat per-example lists.

    Args:
        data: iterable of 7-field batches
            ``(ids, text, users, subreddits, lengths, sfs, labels)``. The ids
            are bytes objects; text, users and subreddits expose ``.numpy()``.

    Returns:
        Tuple ``(ids, text, users, subreddits, lengths, labels)`` of lists.
        The ``sfs`` field of each batch is not part of the result.
    """
    ids, text, users, subreddits, lengths, labels = [], [], [], [], [], []

    for batch in data:
        # the sixth field (sfs) is unpacked but unused: it was collected before
        # and then never returned
        bids, btext, busers, bsubreddits, blengths, _bsfs, blabels = batch
        ids.extend(x.decode('utf-8') for x in bids)
        text.extend(btext.numpy().tolist())
        users.extend(busers.numpy().tolist())
        subreddits.extend(bsubreddits.numpy().tolist())
        lengths.extend(blengths)
        labels.extend(blabels)
    return (ids, text, users, subreddits, lengths, labels)
flat_train_data = flatten(train_data)
flat_val_data = flatten(val_data)
flat_test_data = flatten(test_data)


In [23]:
# Decode bytes keys to str. Keys that are already str are kept, so re-running
# this cell after lstm_ids has been rebuilt no longer raises AttributeError.
lst_ids_parse = [(x.decode('utf-8') if isinstance(x, bytes) else x, y) for x, y in lstm_ids.items()]

In [24]:
lstm_ids = dict(lst_ids_parse)

In [25]:
df = pd.read_csv('soc-redditHyperlinks-body.tsv',sep='\t')


In [26]:
df['post_id_cropped'] = df['POST_ID'].apply(lambda x: x[:-1] if len(x) == 7 else x)

In [27]:
# Index the hyperlink table by the cropped post id and drop the raw id column.
# `columns=` replaces the positional axis argument (`drop('POST_ID', 1)`),
# which pandas 2.0 no longer accepts.
df_post_idx = df.set_index('post_id_cropped').drop(columns='POST_ID')

In [19]:
####RUN THE CELLS UNTIL HERE

In [20]:
df_emb = pd.read_csv('./node2vec-embeddings.csv')

In [21]:
# Build each split's id list once so feature rows and labels always line up
# (train_X previously skipped the `in meta_features` check that train_Y applied).
train_keep = [i for i in flat_train_data[0] if i in meta_features]
val_keep = [i for i in flat_val_data[0] if i in meta_features]
test_keep = [i for i in flat_test_data[0] if i in meta_features]

# Column layout: handcrafted features, full_embeds row, LSTM row.
# Rows are read from the raw ndarrays; per-row DataFrame.iloc returns the same
# values but is much slower.
train_X = np.stack([
    np.concatenate([meta_features[i], full_embeds_array[ids[i]], lstm_embeds_array[lstm_ids[i]]])
    for i in train_keep
])
val_X = np.stack([
    np.concatenate([meta_features[i], full_embeds_array[ids[i]], lstm_embeds_array[lstm_ids[i]]])
    for i in val_keep
])
test_X = np.stack([
    np.concatenate([meta_features[i], full_embeds_array[ids[i]], lstm_embeds_array[lstm_ids[i]]])
    for i in test_keep
])

train_Y = np.stack([meta_labels[i] for i in train_keep])
val_Y = np.stack([meta_labels[i] for i in val_keep])
test_Y = np.stack([meta_labels[i] for i in test_keep])

KeyboardInterrupt: ignored

In [None]:
# First we run the Random Forest with only the metadata/handcrafted features...
# n_jobs=-1 (all available cores) matches the later cells; a fixed 100 workers
# oversubscribes smaller machines.
baseline_mod = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=0)
# note that the first 263 features are the handcrafted ones...
baseline_mod.fit(train_X[:, :263], train_Y)

In [None]:
# For reference, on the authors server we get 0.682
print(roc_auc_score(val_Y, baseline_mod.predict_proba(val_X[:, :263])[:,1]))

In [None]:
roc_auc_score(test_Y, baseline_mod.predict_proba(test_X[:, :263])[:,1])


In [None]:
# hand-crafted+full_embeds+lstm_embeds
# n_jobs=-1 (all available cores) matches the later cells; a fixed 100 workers
# oversubscribes smaller machines.
ensemble_mod = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=0)

ensemble_mod.fit(train_X[:, :], train_Y)

In [None]:
roc_auc_score(val_Y, ensemble_mod.predict_proba(val_X[:, :])[:,1])


In [None]:
""""with open(DATA_DIR+'/embeddings/sub_vecs.vocab') as fh:
      contents = fh.read()
      subreddit_names=contents.split(" ")
      print(len(subreddit_names))""""

In [None]:
""""
x = pd.DataFrame(subreddit_names)
y = pd.DataFrame(df_emb)

x.columns=['subreddit']
df_emb = pd.concat([x, y], axis=1, join="inner")
print(df_emb)"""

In [None]:
# Source-subreddit embedding for every training post.
# df_emb is read with a default integer index, so `df_emb.loc[source]` with a
# subreddit name could never match and every post ended up in
# not_found_ids_train. Rows are now looked up through the 'subreddit' column,
# the same way get_gnn_embeddings does (first matching row, last column dropped).
emb_by_subreddit = {}
for name, row in zip(df_emb['subreddit'].tolist(), df_emb.values.tolist()):
    emb_by_subreddit.setdefault(name, row[:-1])

train_gnn_embs = list()
not_found_ids_train = list()
for x in tqdm(flat_train_data[0]):
    chunk = df_post_idx.loc[x]
    if isinstance(chunk, pd.Series):
        source = chunk.SOURCE_SUBREDDIT
    else:
        # several rows share this post id; use the first one
        source = chunk.SOURCE_SUBREDDIT.iloc[0]

    if source in emb_by_subreddit:
        train_gnn_embs.append(emb_by_subreddit[source])
    else:
        not_found_ids_train.append(x)
        
        

In [None]:
def get_gnn_embeddings(ids_list):
    """Look up the embeddings of each post's source and target subreddit.

    Reads the module-level ``df_post_idx`` (post id -> SOURCE_SUBREDDIT /
    TARGET_SUBREDDIT) and ``df_emb`` (one row per subreddit with a
    ``'subreddit'`` column).

    Args:
        ids_list: iterable of post ids present in ``df_post_idx``'s index.

    Returns:
        ``(gnn_embs_source, gnn_embs_target, not_found_ids)``: two dicts mapping
        post id -> embedding (a ``df_emb`` row without its last column) and the
        list of post ids for which the source or the target subreddit has no row
        in ``df_emb``. A post id appears either in both dicts or in the list.

    Raises:
        KeyError: if a post id is missing from ``df_post_idx``.
    """
    # Build the subreddit -> embedding table once instead of scanning df_emb
    # twice per post. setdefault keeps the first row per subreddit, like the
    # former `[0]`; `[:-1]` drops the last column exactly as before.
    emb_by_subreddit = {}
    for name, row in zip(df_emb['subreddit'].tolist(), df_emb.values.tolist()):
        emb_by_subreddit.setdefault(name, row[:-1])

    gnn_embs_source = {}
    gnn_embs_target = {}
    not_found_ids = list()

    for x in tqdm(ids_list):
        chunk = df_post_idx.loc[x]
        if isinstance(chunk, pd.Series):
            source = chunk.SOURCE_SUBREDDIT
            target = chunk.TARGET_SUBREDDIT
        else:
            # several rows share this post id; use the first one
            source = chunk.SOURCE_SUBREDDIT.iloc[0]
            target = chunk.TARGET_SUBREDDIT.iloc[0]

        # Store nothing unless both lookups succeed; the source embedding used to
        # be written even when the target lookup failed.
        if source in emb_by_subreddit and target in emb_by_subreddit:
            gnn_embs_source[x] = emb_by_subreddit[source]
            gnn_embs_target[x] = emb_by_subreddit[target]
        else:
            not_found_ids.append(x)

    return gnn_embs_source, gnn_embs_target, not_found_ids

In [None]:
train_gnn_embs_source,train_gnn_embs_target,not_found_ids_train = get_gnn_embeddings(flat_train_data[0])

HBox(children=(FloatProgress(value=0.0, max=93696.0), HTML(value='')))

dddd



In [None]:
val_gnn_embs_source,val_gnn_embs_target,not_found_ids_val = get_gnn_embeddings(flat_val_data[0])

In [None]:
test_gnn_embs_source,test_gnn_embs_target,not_found_ids_test = get_gnn_embeddings(flat_test_data[0])


HBox(children=(FloatProgress(value=0.0, max=11264.0), HTML(value='')))




In [None]:
# One frame per split and direction: the dict keys become the row index after
# the transpose, so each row holds the embedding stored under that key.
dftrain_gnn_embs_source = pd.DataFrame(train_gnn_embs_source).transpose()
dfval_gnn_embs_source = pd.DataFrame(val_gnn_embs_source).transpose()
dftest_gnn_embs_source = pd.DataFrame(test_gnn_embs_source).transpose()

dftrain_gnn_embs_target = pd.DataFrame(train_gnn_embs_target).transpose()
dfval_gnn_embs_target = pd.DataFrame(val_gnn_embs_target).transpose()
dftest_gnn_embs_target = pd.DataFrame(test_gnn_embs_target).transpose()


In [None]:
# Sets make the "not found" membership test O(1); the lists were scanned once per id.
skip_train, skip_val, skip_test = set(not_found_ids_train), set(not_found_ids_val), set(not_found_ids_test)

# One id list per split, shared by X and Y so rows and labels cannot drift apart
# (train_X previously skipped the `in meta_features` check that train_Y applied).
train_keep = [i for i in flat_train_data[0] if i in meta_features and i not in skip_train]
val_keep = [i for i in flat_val_data[0] if i in meta_features and i not in skip_val]
test_keep = [i for i in flat_test_data[0] if i in meta_features and i not in skip_test]

# Column layout: handcrafted features, full_embeds row, LSTM row.
train_X = np.stack([
    np.concatenate([meta_features[i], full_embeds_array[ids[i]], lstm_embeds_array[lstm_ids[i]]])
    for i in train_keep
])
val_X = np.stack([
    np.concatenate([meta_features[i], full_embeds_array[ids[i]], lstm_embeds_array[lstm_ids[i]]])
    for i in val_keep
])
test_X = np.stack([
    np.concatenate([meta_features[i], full_embeds_array[ids[i]], lstm_embeds_array[lstm_ids[i]]])
    for i in test_keep
])

train_Y = np.stack([meta_labels[i] for i in train_keep])
val_Y = np.stack([meta_labels[i] for i in val_keep])
test_Y = np.stack([meta_labels[i] for i in test_keep])


In [None]:
# Baseline: Random Forest fitted on the handcrafted columns only
# (the first 263 columns of train_X).
forest_params = dict(n_estimators=500, n_jobs=-1, random_state=0)
baseline_mod = RandomForestClassifier(**forest_params)
handcrafted_train = train_X[:, :263]
baseline_mod.fit(handcrafted_train, train_Y)

In [None]:
print(roc_auc_score(val_Y, baseline_mod.predict_proba(val_X[:, :263])[:,1]))

In [None]:
roc_auc_score(test_Y, baseline_mod.predict_proba(test_X[:, :263])[:,1])


In [None]:
# Ensemble: Random Forest fitted on every column of the current train_X.
forest_params = dict(n_estimators=500, n_jobs=-1, random_state=0)
ensemble_mod = RandomForestClassifier(**forest_params)
ensemble_mod.fit(train_X, train_Y)

In [None]:
roc_auc_score(val_Y, ensemble_mod.predict_proba(val_X[:, :])[:,1])


In [None]:

# Sets make the "not found" membership test O(1); the lists were scanned once per id.
skip_train, skip_val, skip_test = set(not_found_ids_train), set(not_found_ids_val), set(not_found_ids_test)

# One id list per split, shared by X and Y so rows and labels cannot drift apart
# (train_X previously skipped the `in meta_features` check that train_Y applied).
train_keep = [i for i in flat_train_data[0] if i in meta_features and i not in skip_train]
val_keep = [i for i in flat_val_data[0] if i in meta_features and i not in skip_val]
test_keep = [i for i in flat_test_data[0] if i in meta_features and i not in skip_test]

# Column layout: handcrafted features, full_embeds row, LSTM row,
# source-subreddit embedding, target-subreddit embedding.
train_X = np.stack([
    np.concatenate([meta_features[i], full_embeds_array[ids[i]], lstm_embeds_array[lstm_ids[i]],
                    dftrain_gnn_embs_source.loc[i], dftrain_gnn_embs_target.loc[i]])
    for i in train_keep
])
val_X = np.stack([
    np.concatenate([meta_features[i], full_embeds_array[ids[i]], lstm_embeds_array[lstm_ids[i]],
                    dfval_gnn_embs_source.loc[i], dfval_gnn_embs_target.loc[i]])
    for i in val_keep
])
test_X = np.stack([
    np.concatenate([meta_features[i], full_embeds_array[ids[i]], lstm_embeds_array[lstm_ids[i]],
                    dftest_gnn_embs_source.loc[i], dftest_gnn_embs_target.loc[i]])
    for i in test_keep
])

train_Y = np.stack([meta_labels[i] for i in train_keep])
val_Y = np.stack([meta_labels[i] for i in val_keep])
test_Y = np.stack([meta_labels[i] for i in test_keep])

In [None]:
 if '3c4rod' in not_found_ids_train:
   print("44")

44


In [None]:

# Ensemble: Random Forest fitted on every column of the current train_X.
forest_params = dict(n_estimators=500, n_jobs=-1, random_state=0)
ensemble_mod = RandomForestClassifier(**forest_params)
ensemble_mod.fit(train_X, train_Y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [None]:
roc_auc_score(val_Y, ensemble_mod.predict_proba(val_X[:, :])[:,1])


0.764818510810139

In [None]:

# Sets make the "not found" membership test O(1); the lists were scanned once per id.
skip_train, skip_val, skip_test = set(not_found_ids_train), set(not_found_ids_val), set(not_found_ids_test)

# One id list per split, shared by X and Y so rows and labels cannot drift apart
# (train_X2 previously skipped the `in meta_features` check that train_Y2 applied).
train_keep = [i for i in flat_train_data[0] if i in meta_features and i not in skip_train]
val_keep = [i for i in flat_val_data[0] if i in meta_features and i not in skip_val]
test_keep = [i for i in flat_test_data[0] if i in meta_features and i not in skip_test]

# Column layout: handcrafted features, full_embeds row,
# source-subreddit embedding, target-subreddit embedding (no LSTM columns).
train_X2 = np.stack([
    np.concatenate([meta_features[i], full_embeds_array[ids[i]],
                    dftrain_gnn_embs_source.loc[i], dftrain_gnn_embs_target.loc[i]])
    for i in train_keep
])
val_X2 = np.stack([
    np.concatenate([meta_features[i], full_embeds_array[ids[i]],
                    dfval_gnn_embs_source.loc[i], dfval_gnn_embs_target.loc[i]])
    for i in val_keep
])
test_X2 = np.stack([
    np.concatenate([meta_features[i], full_embeds_array[ids[i]],
                    dftest_gnn_embs_source.loc[i], dftest_gnn_embs_target.loc[i]])
    for i in test_keep
])

train_Y2 = np.stack([meta_labels[i] for i in train_keep])
val_Y2 = np.stack([meta_labels[i] for i in val_keep])
test_Y2 = np.stack([meta_labels[i] for i in test_keep])

NameError: ignored

In [None]:

# Random Forest fitted on every column of train_X2.
forest_params = dict(n_estimators=500, n_jobs=-1, random_state=0)
ensemble_mod = RandomForestClassifier(**forest_params)
ensemble_mod.fit(train_X2, train_Y2)

In [None]:
roc_auc_score(val_Y2, ensemble_mod.predict_proba(val_X2[:, :])[:,1])
#without LSTM embeddings

In [None]:

# Sets make the "not found" membership test O(1); the lists were scanned once per id.
skip_train, skip_val, skip_test = set(not_found_ids_train), set(not_found_ids_val), set(not_found_ids_test)

# One id list per split, shared by X and Y so rows and labels cannot drift apart
# (train_X3 previously skipped the `in meta_features` check that train_Y3 applied).
train_keep = [i for i in flat_train_data[0] if i in meta_features and i not in skip_train]
val_keep = [i for i in flat_val_data[0] if i in meta_features and i not in skip_val]
test_keep = [i for i in flat_test_data[0] if i in meta_features and i not in skip_test]

# Column layout: handcrafted features, source-subreddit embedding,
# target-subreddit embedding.
train_X3 = np.stack([
    np.concatenate([meta_features[i], dftrain_gnn_embs_source.loc[i], dftrain_gnn_embs_target.loc[i]])
    for i in train_keep
])
val_X3 = np.stack([
    np.concatenate([meta_features[i], dfval_gnn_embs_source.loc[i], dfval_gnn_embs_target.loc[i]])
    for i in val_keep
])
test_X3 = np.stack([
    np.concatenate([meta_features[i], dftest_gnn_embs_source.loc[i], dftest_gnn_embs_target.loc[i]])
    for i in test_keep
])

train_Y3 = np.stack([meta_labels[i] for i in train_keep])
val_Y3 = np.stack([meta_labels[i] for i in val_keep])
test_Y3 = np.stack([meta_labels[i] for i in test_keep])

In [None]:

# First we run the Random Forest with only the metadata/handcrafted features...
ensemble_mod = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=0)
# note that the first 263 features are the handcrafted ones... 
ensemble_mod.fit(train_X[:, :], train_Y)

In [None]:
roc_auc_score(val_Y, ensemble_mod.predict_proba(val_X[:, :])[:,1])


In [None]:

# Sets make the "not found" membership test O(1); the lists were scanned once per id.
skip_train, skip_val, skip_test = set(not_found_ids_train), set(not_found_ids_val), set(not_found_ids_test)

# One id list per split, shared by X and Y. train_X4 previously skipped the
# `in meta_features` check that train_Y4 applied, so the two could differ in length.
train_keep = [i for i in flat_train_data[0] if i in meta_features and i not in skip_train]
val_keep = [i for i in flat_val_data[0] if i in meta_features and i not in skip_val]
test_keep = [i for i in flat_test_data[0] if i in meta_features and i not in skip_test]

# Column layout: source-subreddit embedding, target-subreddit embedding.
train_X4 = np.stack([
    np.concatenate([dftrain_gnn_embs_source.loc[i], dftrain_gnn_embs_target.loc[i]])
    for i in train_keep
])
val_X4 = np.stack([
    np.concatenate([dfval_gnn_embs_source.loc[i], dfval_gnn_embs_target.loc[i]])
    for i in val_keep
])
test_X4 = np.stack([
    np.concatenate([dftest_gnn_embs_source.loc[i], dftest_gnn_embs_target.loc[i]])
    for i in test_keep
])

train_Y4 = np.stack([meta_labels[i] for i in train_keep])
val_Y4 = np.stack([meta_labels[i] for i in val_keep])
test_Y4 = np.stack([meta_labels[i] for i in test_keep])

In [None]:

# First we run the Random Forest with only the metadata/handcrafted features...
ensemble_mod = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=0)
# note that the first 263 features are the handcrafted ones... 
ensemble_mod.fit(train_X[:, :], train_Y)

In [None]:
roc_auc_score(val_Y, ensemble_mod.predict_proba(val_X[:, :])[:,1])


In [None]:

# LSTM embeddings only.
# `lstm_embeds[k]` on the DataFrame selects COLUMN k, not row k; read row k from
# the underlying ndarray instead (equivalent to lstm_embeds.iloc[k]).
skip_train, skip_val, skip_test = set(not_found_ids_train), set(not_found_ids_val), set(not_found_ids_test)

# One id list per split, shared by X and Y so rows and labels cannot drift apart.
train_keep = [i for i in flat_train_data[0] if i in meta_features and i not in skip_train]
val_keep = [i for i in flat_val_data[0] if i in meta_features and i not in skip_val]
test_keep = [i for i in flat_test_data[0] if i in meta_features and i not in skip_test]

train_X = np.stack([lstm_embeds_array[lstm_ids[i]] for i in train_keep])
val_X = np.stack([lstm_embeds_array[lstm_ids[i]] for i in val_keep])
test_X = np.stack([lstm_embeds_array[lstm_ids[i]] for i in test_keep])

train_Y = np.stack([meta_labels[i] for i in train_keep])
val_Y = np.stack([meta_labels[i] for i in val_keep])
test_Y = np.stack([meta_labels[i] for i in test_keep])

In [None]:

# Random Forest fitted on every column of the current train_X.
forest_params = dict(n_estimators=500, n_jobs=-1, random_state=0)
ensemble_mod = RandomForestClassifier(**forest_params)
ensemble_mod.fit(train_X, train_Y)

In [None]:
roc_auc_score(val_Y, ensemble_mod.predict_proba(val_X[:, :])[:,1])


In [None]:

# full_embeds row + source/target subreddit embeddings.
# The previous version referenced names that do not exist in this notebook
# (all_embeds, val_gnn_embs, test_gnn_embs) and indexed the train_gnn_embs list
# with a string id. NOTE(review): assumes `all_embeds` meant the full_embeds
# matrix and that both source and target embeddings were intended - confirm.
skip_train, skip_val, skip_test = set(not_found_ids_train), set(not_found_ids_val), set(not_found_ids_test)

# One id list per split, shared by X and Y so rows and labels cannot drift apart.
train_keep = [i for i in flat_train_data[0] if i in meta_features and i not in skip_train]
val_keep = [i for i in flat_val_data[0] if i in meta_features and i not in skip_val]
test_keep = [i for i in flat_test_data[0] if i in meta_features and i not in skip_test]

train_X = np.stack([
    np.concatenate([full_embeds_array[ids[i]], dftrain_gnn_embs_source.loc[i], dftrain_gnn_embs_target.loc[i]])
    for i in train_keep
])
val_X = np.stack([
    np.concatenate([full_embeds_array[ids[i]], dfval_gnn_embs_source.loc[i], dfval_gnn_embs_target.loc[i]])
    for i in val_keep
])
test_X = np.stack([
    np.concatenate([full_embeds_array[ids[i]], dftest_gnn_embs_source.loc[i], dftest_gnn_embs_target.loc[i]])
    for i in test_keep
])

train_Y = np.stack([meta_labels[i] for i in train_keep])
val_Y = np.stack([meta_labels[i] for i in val_keep])
test_Y = np.stack([meta_labels[i] for i in test_keep])

In [None]:

# Random Forest fitted on every column of the current train_X.
forest_params = dict(n_estimators=500, n_jobs=-1, random_state=0)
ensemble_mod = RandomForestClassifier(**forest_params)
ensemble_mod.fit(train_X, train_Y)

In [None]:
roc_auc_score(val_Y, ensemble_mod.predict_proba(val_X[:, :])[:,1])


In [None]:
[meta_features[i], all_embeds[ids[i]], lstm_embeds[lstm_ids[i]], train_gnn_embs[i]]


In [None]:

# full_embeds rows only.
# `all_embeds` is not defined anywhere in this notebook. NOTE(review): assumes
# it meant the full_embeds matrix loaded from full_embeds.npy - confirm.
skip_train, skip_val, skip_test = set(not_found_ids_train), set(not_found_ids_val), set(not_found_ids_test)

# One id list per split, shared by X and Y so rows and labels cannot drift apart.
train_keep = [i for i in flat_train_data[0] if i in meta_features and i not in skip_train]
val_keep = [i for i in flat_val_data[0] if i in meta_features and i not in skip_val]
test_keep = [i for i in flat_test_data[0] if i in meta_features and i not in skip_test]

train_X = np.stack([full_embeds_array[ids[i]] for i in train_keep])
val_X = np.stack([full_embeds_array[ids[i]] for i in val_keep])
test_X = np.stack([full_embeds_array[ids[i]] for i in test_keep])

train_Y = np.stack([meta_labels[i] for i in train_keep])
val_Y = np.stack([meta_labels[i] for i in val_keep])
test_Y = np.stack([meta_labels[i] for i in test_keep])


# Random Forest on all columns of train_X, scored by validation ROC AUC.
ensemble_mod = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=0)
ensemble_mod.fit(train_X, train_Y)
roc_auc_score(val_Y, ensemble_mod.predict_proba(val_X)[:,1])


In [None]:
# full_embeds row + LSTM row.
# `all_embeds` is not defined anywhere in this notebook (NOTE(review): assumed to
# mean the full_embeds matrix - confirm), and `lstm_embeds[k]` selected COLUMN k
# of the DataFrame rather than row k. Both are read as ndarray rows here.
skip_train, skip_val, skip_test = set(not_found_ids_train), set(not_found_ids_val), set(not_found_ids_test)

# One id list per split, shared by X and Y so rows and labels cannot drift apart.
train_keep = [i for i in flat_train_data[0] if i in meta_features and i not in skip_train]
val_keep = [i for i in flat_val_data[0] if i in meta_features and i not in skip_val]
test_keep = [i for i in flat_test_data[0] if i in meta_features and i not in skip_test]

train_X = np.stack([
    np.concatenate([full_embeds_array[ids[i]], lstm_embeds_array[lstm_ids[i]]]) for i in train_keep
])
val_X = np.stack([
    np.concatenate([full_embeds_array[ids[i]], lstm_embeds_array[lstm_ids[i]]]) for i in val_keep
])
test_X = np.stack([
    np.concatenate([full_embeds_array[ids[i]], lstm_embeds_array[lstm_ids[i]]]) for i in test_keep
])

train_Y = np.stack([meta_labels[i] for i in train_keep])
val_Y = np.stack([meta_labels[i] for i in val_keep])
test_Y = np.stack([meta_labels[i] for i in test_keep])


# Random Forest on all columns of train_X, scored by validation ROC AUC.
ensemble_mod = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=0)
ensemble_mod.fit(train_X, train_Y)
roc_auc_score(val_Y, ensemble_mod.predict_proba(val_X)[:,1])


In [None]:
###RUN THE FOLLOWING CELLS FROM HERE

In [28]:
# Parse reddit_embed300.txt: each line is "<key> <v1> <v2> ...".
embeddingTable = {}
with open("reddit_embed300.txt") as f:
    for line_no, line in enumerate(f):
        tokens = line.split()
        if not tokens:
            continue  # blank line: nothing to unpack
        # A first line made of exactly two integers is skipped here instead of
        # being stored and then removed with `del embeddingTable['35776']`,
        # which only worked for one specific file.
        # NOTE(review): presumably a "<num_vectors> <dim>" header - confirm.
        if line_no == 0 and len(tokens) == 2 and all(t.isdigit() for t in tokens):
            continue
        key, *values = tokens
        embeddingTable[key] = [float(value) for value in values]

# rows = keys, columns = embedding dimensions
edge2vec_embdf = pd.DataFrame.from_dict(embeddingTable, orient='index')
print(edge2vec_embdf)


                       0         1         2    ...       297       298       299
askreddit        -0.086266  0.062471 -0.013276  ...  0.112404  0.011235 -0.046906
iama             -0.075592  0.054094 -0.008563  ...  0.096936  0.010037 -0.041703
writingprompts   -0.081293  0.060407 -0.013268  ...  0.108694  0.008750 -0.047382
subredditdrama   -0.061043  0.040176 -0.009589  ...  0.078844  0.007312 -0.036665
tipofmypenis     -0.072520  0.053353 -0.011294  ...  0.099864  0.014381 -0.042081
...                    ...       ...       ...  ...       ...       ...       ...
redditsenddays    0.001452  0.001376 -0.001050  ... -0.000471  0.001173  0.000569
modelexchange    -0.000335 -0.000692 -0.001549  ...  0.001015  0.000228 -0.000401
gggg              0.000189 -0.000952  0.001454  ...  0.000060  0.000182 -0.000578
drawingsrevisted -0.001751  0.000403 -0.001609  ...  0.000678 -0.001433  0.000248
thisiswar        -0.001547  0.001518  0.000161  ...  0.000540  0.000097 -0.000574

[35776 rows x 3

In [29]:
only_user_emb=full_embeds.iloc[:,:300]

In [30]:
def get_gnn_embeddings2(ids_list,subreddit_embdf):
    """Look up the embeddings of each post's source and target subreddit.

    Reads the module-level ``df_post_idx`` (post id -> SOURCE_SUBREDDIT /
    TARGET_SUBREDDIT).

    Args:
        ids_list: iterable of post ids present in ``df_post_idx``'s index.
        subreddit_embdf: DataFrame indexed by subreddit name, one embedding per row.

    Returns:
        ``(gnn_embs_source, gnn_embs_target, not_found_ids)``: two dicts mapping
        post id -> embedding (list of row values) and the list of post ids whose
        source or target subreddit is missing from ``subreddit_embdf``. A post
        id appears either in both dicts or in the list.

    Raises:
        KeyError: if a post id is missing from ``df_post_idx``.
    """
    gnn_embs_source = {}
    gnn_embs_target = {}
    not_found_ids = list()

    for x in tqdm(ids_list):
        chunk = df_post_idx.loc[x]
        if isinstance(chunk, pd.Series):
            source = chunk.SOURCE_SUBREDDIT
            target = chunk.TARGET_SUBREDDIT
        else:
            # several rows share this post id; use the first one
            source = chunk.SOURCE_SUBREDDIT.iloc[0]
            target = chunk.TARGET_SUBREDDIT.iloc[0]

        try:
            # Resolve both before storing either, so a post never ends up with
            # a source embedding but no target embedding.
            source_emb = subreddit_embdf.loc[source].values.tolist()
            target_emb = subreddit_embdf.loc[target].values.tolist()
        except (KeyError, IndexError):
            # .loc raises KeyError for an unknown label; only IndexError was
            # caught before, so a missing subreddit aborted the whole loop.
            not_found_ids.append(x)
            continue

        gnn_embs_source[x] = source_emb
        gnn_embs_target[x] = target_emb

    return gnn_embs_source,gnn_embs_target,not_found_ids

In [31]:
train_edge2vec_embs_source,train_edge2vec_embs_target,unfound_ids_train = get_gnn_embeddings2(flat_train_data[0],edge2vec_embdf)


HBox(children=(FloatProgress(value=0.0, max=93696.0), HTML(value='')))




In [32]:
val_edge2vec_embs_source,val_edge2vec_embs_target,unfound_ids_val = get_gnn_embeddings2(flat_val_data[0],edge2vec_embdf)


HBox(children=(FloatProgress(value=0.0, max=11264.0), HTML(value='')))




In [35]:
test_edge2vec_embs_source,test_edge2vec_embs_target,unfound_ids_test = get_gnn_embeddings2(flat_test_data[0],edge2vec_embdf)

HBox(children=(FloatProgress(value=0.0, max=11264.0), HTML(value='')))




In [36]:
# One frame per split and direction: the dict keys become the row index after
# the transpose, so each row holds the embedding stored under that key.
dftrain_edge2vec_embs_source = pd.DataFrame(train_edge2vec_embs_source).transpose()
dfval_edge2vec_embs_source = pd.DataFrame(val_edge2vec_embs_source).transpose()
dftest_edge2vec_embs_source = pd.DataFrame(test_edge2vec_embs_source).transpose()

dftrain_edge2vec_embs_target = pd.DataFrame(train_edge2vec_embs_target).transpose()
dfval_edge2vec_embs_target = pd.DataFrame(val_edge2vec_embs_target).transpose()
dftest_edge2vec_embs_target = pd.DataFrame(test_edge2vec_embs_target).transpose()

In [37]:
# Sets make the "not found" membership test O(1); the lists were scanned once per id.
skip_train, skip_val, skip_test = set(unfound_ids_train), set(unfound_ids_val), set(unfound_ids_test)

# One id list per split, shared by X and Y so rows and labels cannot drift apart
# (train_X6 previously skipped the `in meta_features` check that train_Y6 applied).
train_keep = [i for i in flat_train_data[0] if i in meta_features and i not in skip_train]
val_keep = [i for i in flat_val_data[0] if i in meta_features and i not in skip_val]
test_keep = [i for i in flat_test_data[0] if i in meta_features and i not in skip_test]

# Column layout: handcrafted features, full_embeds row,
# edge2vec source embedding, edge2vec target embedding.
train_X6 = np.stack([
    np.concatenate([meta_features[i], full_embeds_array[ids[i]],
                    dftrain_edge2vec_embs_source.loc[i], dftrain_edge2vec_embs_target.loc[i]])
    for i in train_keep
])
val_X6 = np.stack([
    np.concatenate([meta_features[i], full_embeds_array[ids[i]],
                    dfval_edge2vec_embs_source.loc[i], dfval_edge2vec_embs_target.loc[i]])
    for i in val_keep
])
test_X6 = np.stack([
    np.concatenate([meta_features[i], full_embeds_array[ids[i]],
                    dftest_edge2vec_embs_source.loc[i], dftest_edge2vec_embs_target.loc[i]])
    for i in test_keep
])

train_Y6 = np.stack([meta_labels[i] for i in train_keep])
val_Y6 = np.stack([meta_labels[i] for i in val_keep])
test_Y6 = np.stack([meta_labels[i] for i in test_keep])

In [38]:
# Random Forest fitted on every column of train_X6.
forest_params = dict(n_estimators=500, n_jobs=-1, random_state=0)
ensemble_mod = RandomForestClassifier(**forest_params)
ensemble_mod.fit(train_X6, train_Y6)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [39]:
##Sensitivity Analysis
!pip install pytolemaic



In [42]:
import numpy
import sklearn.datasets
import sklearn.model_selection

# pytolemaic entry points used for the sensitivity analysis
from pytolemaic.pytrust import PyTrust
from pytolemaic.utils.dmd import DMD

# Hand the current ensemble_mod and the X6 train/test split to PyTrust.
pytrust_kwargs = dict(
    model=ensemble_mod,
    xtrain=train_X6,
    ytrain=train_Y6,
    xtest=test_X6,
    ytest=test_Y6,
    splitter='stratified',
    metric='recall',
)
pytrust = PyTrust(**pytrust_kwargs)

In [1]:
# pprint was never imported in this notebook, so the call below raised NameError.
from pprint import pprint

sensitivity_report = pytrust.sensitivity_report()

# use pprint for pretty print.
pprint(sensitivity_report.to_dict(), width=120)

NameError: ignored

In [None]:
####

In [None]:
roc_auc_score(val_Y6, ensemble_mod.predict_proba(val_X6[:, :])[:,1])


0.756972858122227

In [None]:
## meta_features + only_user_emb + edgevec source-target embs

In [None]:
# Sets make the "not found" membership test O(1); the lists were scanned once per id.
skip_train, skip_val, skip_test = set(unfound_ids_train), set(unfound_ids_val), set(unfound_ids_test)

# One id list per split, shared by X and Y so rows and labels cannot drift apart
# (train_X7 previously skipped the `in meta_features` check that train_Y7 applied).
train_keep = [i for i in flat_train_data[0] if i in meta_features and i not in skip_train]
val_keep = [i for i in flat_val_data[0] if i in meta_features and i not in skip_val]
test_keep = [i for i in flat_test_data[0] if i in meta_features and i not in skip_test]

# ndarray view of only_user_emb, converted once so rows are read without per-row .iloc
user_emb_rows = only_user_emb.to_numpy()

# Column layout: handcrafted features, only_user_emb row,
# edge2vec source embedding, edge2vec target embedding.
train_X7 = np.stack([
    np.concatenate([meta_features[i], user_emb_rows[ids[i]],
                    dftrain_edge2vec_embs_source.loc[i], dftrain_edge2vec_embs_target.loc[i]])
    for i in train_keep
])
val_X7 = np.stack([
    np.concatenate([meta_features[i], user_emb_rows[ids[i]],
                    dfval_edge2vec_embs_source.loc[i], dfval_edge2vec_embs_target.loc[i]])
    for i in val_keep
])
test_X7 = np.stack([
    np.concatenate([meta_features[i], user_emb_rows[ids[i]],
                    dftest_edge2vec_embs_source.loc[i], dftest_edge2vec_embs_target.loc[i]])
    for i in test_keep
])

train_Y7 = np.stack([meta_labels[i] for i in train_keep])
val_Y7 = np.stack([meta_labels[i] for i in val_keep])
test_Y7 = np.stack([meta_labels[i] for i in test_keep])

In [None]:
# Random Forest fitted on every column of train_X7.
forest_params = dict(n_estimators=500, n_jobs=-1, random_state=0)
ensemble_mod = RandomForestClassifier(**forest_params)
ensemble_mod.fit(train_X7, train_Y7)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [None]:
roc_auc_score(val_Y7, ensemble_mod.predict_proba(val_X7[:, :])[:,1])


0.7341682706769684

In [None]:
## only_user_emb + edgevec source-target embs

In [None]:
# Sets make the "not found" membership test O(1); the lists were scanned once per id.
skip_train, skip_val, skip_test = set(unfound_ids_train), set(unfound_ids_val), set(unfound_ids_test)

# One id list per split, shared by X and Y. train_X8 previously skipped the
# `in meta_features` check that train_Y8 applied, so the two could differ in length.
train_keep = [i for i in flat_train_data[0] if i in meta_features and i not in skip_train]
val_keep = [i for i in flat_val_data[0] if i in meta_features and i not in skip_val]
test_keep = [i for i in flat_test_data[0] if i in meta_features and i not in skip_test]

# ndarray view of only_user_emb, converted once so rows are read without per-row .iloc
user_emb_rows = only_user_emb.to_numpy()

# Column layout: only_user_emb row, edge2vec source embedding, edge2vec target embedding.
train_X8 = np.stack([
    np.concatenate([user_emb_rows[ids[i]],
                    dftrain_edge2vec_embs_source.loc[i], dftrain_edge2vec_embs_target.loc[i]])
    for i in train_keep
])
val_X8 = np.stack([
    np.concatenate([user_emb_rows[ids[i]],
                    dfval_edge2vec_embs_source.loc[i], dfval_edge2vec_embs_target.loc[i]])
    for i in val_keep
])
test_X8 = np.stack([
    np.concatenate([user_emb_rows[ids[i]],
                    dftest_edge2vec_embs_source.loc[i], dftest_edge2vec_embs_target.loc[i]])
    for i in test_keep
])

train_Y8 = np.stack([meta_labels[i] for i in train_keep])
val_Y8 = np.stack([meta_labels[i] for i in val_keep])
test_Y8 = np.stack([meta_labels[i] for i in test_keep])

In [None]:
# Random Forest fitted on every column of train_X8.
forest_params = dict(n_estimators=500, n_jobs=-1, random_state=0)
ensemble_mod = RandomForestClassifier(**forest_params)
ensemble_mod.fit(train_X8, train_Y8)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [None]:
roc_auc_score(val_Y8, ensemble_mod.predict_proba(val_X8[:, :])[:,1])


0.70841416315682

In [None]:
##   edgevec source-target embs

In [None]:
# Sets make the "not found" membership test O(1); the lists were scanned once per id.
skip_train, skip_val, skip_test = set(unfound_ids_train), set(unfound_ids_val), set(unfound_ids_test)

# One id list per split, shared by X and Y. train_X9 previously skipped the
# `in meta_features` check that train_Y9 applied, so the two could differ in length.
train_keep = [i for i in flat_train_data[0] if i in meta_features and i not in skip_train]
val_keep = [i for i in flat_val_data[0] if i in meta_features and i not in skip_val]
test_keep = [i for i in flat_test_data[0] if i in meta_features and i not in skip_test]

# Column layout: edge2vec source embedding, edge2vec target embedding.
train_X9 = np.stack([
    np.concatenate([dftrain_edge2vec_embs_source.loc[i], dftrain_edge2vec_embs_target.loc[i]])
    for i in train_keep
])
val_X9 = np.stack([
    np.concatenate([dfval_edge2vec_embs_source.loc[i], dfval_edge2vec_embs_target.loc[i]])
    for i in val_keep
])
test_X9 = np.stack([
    np.concatenate([dftest_edge2vec_embs_source.loc[i], dftest_edge2vec_embs_target.loc[i]])
    for i in test_keep
])

train_Y9 = np.stack([meta_labels[i] for i in train_keep])
val_Y9 = np.stack([meta_labels[i] for i in val_keep])
test_Y9 = np.stack([meta_labels[i] for i in test_keep])

In [None]:
# Random Forest fitted on every column of train_X9.
forest_params = dict(n_estimators=500, n_jobs=-1, random_state=0)
ensemble_mod = RandomForestClassifier(**forest_params)
ensemble_mod.fit(train_X9, train_Y9)

In [None]:
roc_auc_score(val_Y9, ensemble_mod.predict_proba(val_X9[:, :])[:,1])


## Level 1 Datasets
Dataset-1 D<sub>1,1</sub> = Only Meta-Features

Dataset-2 D<sub>1,2</sub> = All embeddings

Dataset-3 D<sub>1,3</sub> = LSTM-Embeddings

Dataset-4 D<sub>1,4</sub> = Node2Vec-Embeddings

## Level 2 Datasets
Dataset-6 D<sub>2,3</sub> = All embeddings + LSTM-Embeddings

Dataset-7 D<sub>2,4</sub> = All embeddings + Node2Vec-Embeddings

## Level 3 Datasets
Dataset-8 D<sub>3,3</sub> = All embeddings + Meta-Features + LSTM-Embeddings

Dataset-9 D<sub>3,4</sub> = All embeddings + Meta-Features + Node2Vec-Embeddings

## Complete
Dataset-10 D<sub>4,3</sub> = All embeddings + Meta-Features + Node2Vec-Embeddings + LSTM-Embeddings


| Dataset | ROC_AUC | Contains Graph Embedding |
|---|---|---|
|Level 1|||
|Meta-Features| 0.6800 | No |
|All embeddings| **0.7450** | No |
|LSTM-Embeddings| 0.4926 | No |
|Node2Vec-Embeddings| 0.7170 | Yes |
|Level 2|||
|All embeddings + LSTM-Embeddings| 0.7440 | No |
|All embeddings + Node2Vec-Embeddings| 0.7450 | Yes |
|Meta-Features + Node2Vec-Embeddings| **0.7523** | Yes |
|Level 3|||
|All embeddings + Meta-Features + LSTM-Embeddings| 0.7501 | No |
|All embeddings + Meta-Features + Node2Vec-Embeddings| **0.7598** | Yes |
|Level 4|||
|All available datasets| 0.7499 | Yes |