In [1]:
import os
import subprocess
import json
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import random

import networkx as nx
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR
from sklearn.metrics import roc_auc_score, precision_score, roc_curve, accuracy_score
import matplotlib.pyplot as plt

from gensim.models import Word2Vec


In [2]:
# same function from HW3 to download files
group_num = 9
def download_files(group_num, folders = None, redownload = False):
    node_names = {}
    root_folder = f"Group{group_num}"
    root_folder_exist = os.path.exists(root_folder)
    if (not root_folder_exist) or redownload:
        os.system(f"rm -rf {root_folder}")
        os.mkdir(root_folder)
        if folders is None:
            folders = ['Facebook-Ego', 'Twitter-Ego']
        for folder in folders:
            os.mkdir(os.path.join(root_folder, folder))
            res = subprocess.run(["curl", "-s", f"https://api.github.com/repos/1250326/exercise_complex_network/contents/Datasets/Group{group_num}/{folder}"], stdout=subprocess.PIPE)
            for file_info in json.loads(res.stdout):
                os.system(f"wget -O {os.path.join(root_folder, folder, (fname:=file_info['name']))} {file_info['download_url']} -q")
                print(f"Downloaded file: {fname}")
                # if '.' in fname:
                    # node_names[folder.split('-')[0]] = fname.split('.')[0]
            print(f"Downloaded folder: {folder}")


    for folder in os.listdir(root_folder):
        for file in os.listdir(os.path.join(root_folder, folder)):
            if '.' in file:
                node_names[folder.split('-')[0]] = file.split('.')[0]
    return node_names

node_names = download_files(group_num, redownload=False)
node_names

{'Twitter': '6408382', 'Facebook': '3437_2'}

In [3]:
graph = nx.read_edgelist(f"Group{group_num}/Facebook-Ego/{node_names['Facebook']}.edges")


# Randomly delete 10% of edges


In [4]:
edges = list(graph.edges)
random.shuffle(edges)
num_test_edges = int(0.1 * len(edges))
test_edges = edges[num_test_edges]
train_graph = graph.copy()
train_graph.remove_edges_from(test_edges)

# Using DeepWalk to learn node embeddings

The following cell is taken from [here](https://github.com/prateekjoshi565/DeepWalk/blob/045eb56a3619b66795243f5aa0d152b28400f0e6/deepwalk_space.ipynb#L183)

In [5]:
def get_randomwalk(G, node, path_length):

    random_walk = [node]

    for i in range(path_length-1):
        temp = list(G.neighbors(node))
        temp = list(set(temp) - set(random_walk))
        if len(temp) == 0:
            break

        random_node = random.choice(temp)
        random_walk.append(random_node)
        node = random_node

    return random_walk

In [6]:
list(graph.nodes)[0]

'3710'

In [7]:
get_randomwalk(graph, list(graph.nodes)[0], 10)

['3710', '3713', '3663', '3728', '3636', '3590', '3629', '3599', '3666']

In [8]:
rnd_walk_len = 10
num_walks = 5

walks = []

for node in graph.nodes:
    for _ in range(num_walks):
        walks.append(get_randomwalk(graph, node, rnd_walk_len))

print(*walks[:5], sep='\n')

['3710', '3728', '3608', '3705', '3590', '3629', '3615', '3599', '3715', '3713']
['3710', '3599', '3722', '3728', '3636', '3590', '3713', '3629', '3698', '3598']
['3710', '3715', '3599', '3663', '3590', '3615', '3608', '3694']
['3710', '3636', '3713', '3663', '3728', '3698', '3629', '3705', '3599', '3615']
['3710', '3663', '3713', '3629', '3722', '3599', '3705', '3714', '3620', '3711']


# Train word2vec model


In [9]:
model = Word2Vec(window = 4, sg = 1, hs = 0,
                 negative = 10, # for negative sampling
                 alpha=0.03, min_alpha=0.0007,
                 seed = 14)

model.build_vocab(walks, progress_per=2)


In [10]:
model.train(walks, total_examples = model.corpus_count,
           epochs=20, report_delay=1)
model

<gensim.models.word2vec.Word2Vec at 0x7f8fd1ee7910>

# Find Top n Similar Nodes


In [11]:
model.wv.most_similar(list(graph.nodes)[0], topn=10)

[('3636', 0.9990555047988892),
 ('3713', 0.9985188841819763),
 ('3599', 0.9981085658073425),
 ('3722', 0.9967767000198364),
 ('3663', 0.9967679381370544),
 ('3590', 0.9963958859443665),
 ('3694', 0.9962424635887146),
 ('3715', 0.9947144985198975),
 ('3608', 0.9929696321487427),
 ('3629', 0.9918157458305359)]

In [12]:
model.wv[list(graph.nodes)[0]]

array([-1.69675708e-01,  1.64440572e-01, -3.70632619e-01, -2.99932688e-01,
        1.70151852e-02,  2.00876907e-01,  3.45358968e-01,  1.38574829e-02,
       -8.08339659e-03, -2.78697573e-02, -3.10589015e-01,  2.18853578e-01,
        6.52714807e-05, -1.46724001e-01, -1.58039585e-01,  4.10364389e-01,
       -5.40898666e-02,  3.29193532e-01, -6.42260388e-02, -3.07154536e-01,
        1.56725887e-02, -3.47159952e-01,  1.29870266e-01,  2.22610578e-01,
       -1.48977354e-01,  3.06456715e-01, -5.95184088e-01, -5.11458814e-01,
        9.86867905e-01, -5.25815785e-01, -1.84402987e-01, -7.41667300e-02,
        2.62872636e-01, -3.32830578e-01,  1.38979331e-01,  3.01736057e-01,
       -9.72141847e-02, -3.51948470e-01,  1.20284505e-01, -1.41472727e-01,
        5.37167013e-01, -4.90915000e-01, -2.78567020e-02, -2.98805267e-01,
        1.87011007e-02,  1.08337753e-01,  3.49218935e-01, -7.42919266e-01,
        3.77833322e-02,  5.11924684e-01,  7.49481171e-02, -4.31472838e-01,
       -7.99782932e-01, -

# Extract features for each edge

In [21]:
def extract_features(graph, edge):
    u, v = edge
    features = []
    features.append(np.sum(model.wv[str(u)] - model.wv[str(v)]))
    features.append(np.linalg.norm(model.wv[str(u)] - model.wv[str(v)]))
    return features
    # return np.concatenate([model.wv[str(u)], model.wv[str(v)]])


extract_features(graph, list(graph.edges)[0])

[0.20144765, 0.17744918]

In [22]:
# Positive samples: existing edges
X_pos = [extract_features(train_graph, edge) for edge in train_graph.edges]
y_pos = [1] * len(X_pos)

# Negative samples: non-edges
non_edges = list(nx.non_edges(train_graph))
random.shuffle(non_edges)
X_neg = [extract_features(train_graph, edge) for edge in non_edges[:len(X_pos)]]
y_neg = [0] * len(X_neg)

X = np.array(X_pos + X_neg)
y = np.array(y_pos + y_neg)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a logistic regression model

In [23]:
model_lr = LogisticRegression()
model_lr.fit(X_train, y_train)


## Evaluate the model

In [24]:
y_pred = model_lr.predict(X_test)
y_pred_prob = model_lr.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
auroc = roc_auc_score(y_test, y_pred_prob)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"AUROC: {auroc:.2f}")

Accuracy: 0.95
Precision: 0.95
AUROC: 0.99
