In [1]:
group_num = 9
import os
import json
import subprocess
from collections import defaultdict
import random

In [31]:
# for demonstration:
!curl -s https://api.github.com/repos/1250326/exercise_complex_network/contents/Datasets/Group9/Facebook-Ego | head -n 20
# !curl -s https://api.github.com/repos/1250326/exercise_complex_network/contents/Datasets/Group9/Twitter-Ego

[
  {
    "name": "3437_2.edges",
    "path": "Datasets/Group9/Facebook-Ego/3437_2.edges",
    "sha": "dfa336a00b692fc39b3d553f577bfb41f68e95b3",
    "size": 7340,
    "url": "https://api.github.com/repos/1250326/exercise_complex_network/contents/Datasets/Group9/Facebook-Ego/3437_2.edges?ref=master",
    "html_url": "https://github.com/1250326/exercise_complex_network/blob/master/Datasets/Group9/Facebook-Ego/3437_2.edges",
    "git_url": "https://api.github.com/repos/1250326/exercise_complex_network/git/blobs/dfa336a00b692fc39b3d553f577bfb41f68e95b3",
    "download_url": "https://raw.githubusercontent.com/1250326/exercise_complex_network/master/Datasets/Group9/Facebook-Ego/3437_2.edges",
    "type": "file",
    "_links": {
      "self": "https://api.github.com/repos/1250326/exercise_complex_network/contents/Datasets/Group9/Facebook-Ego/3437_2.edges?ref=master",
      "git": "https://api.github.com/repos/1250326/exercise_complex_network/git/blobs/dfa336a00b692fc39b3d553f577bfb41f68e95b3

In [3]:
def download_files(group_num, folders = None):
    node_names = {}
    root_folder = f"Group{group_num}"
    if not os.path.exists(root_folder):
        os.mkdir(root_folder)
    else:
        print(f"Folder {root_folder} already exists")
        return
    if folders is None:
        folders = ['Facebook-Ego', 'Twitter-Ego']
    for folder in folders:
        os.mkdir(os.path.join(root_folder, folder))
        res = subprocess.run(["curl", "-s", f"https://api.github.com/repos/1250326/exercise_complex_network/contents/Datasets/Group{group_num}/{folder}"], stdout=subprocess.PIPE)
        for file_info in json.loads(res.stdout):
            os.system(f"wget -O {os.path.join(root_folder, folder, (fname:=file_info['name']))} {file_info['download_url']} -q")
            print(f"Downloaded file: {fname}")
            if '.' in fname:
                node_names[folder.split('-')[0]] = fname.split('.')[0]
        print(f"Downloaded folder: {folder}")
    return node_names

In [4]:
!rm -rf Group$group_num
node_names = download_files(group_num)
node_names

Downloaded file: 3437_2.edges
Downloaded file: 3437_2.egofeat
Downloaded file: 3437_2.feat
Downloaded file: 3437_2.featnames
Downloaded file: Description
Downloaded folder: Facebook-Ego
Downloaded file: 6408382.circles
Downloaded file: 6408382.edges
Downloaded file: 6408382.egofeat
Downloaded file: 6408382.feat
Downloaded file: 6408382.featnames
Downloaded file: Description
Downloaded folder: Twitter-Ego


{'Facebook': '3437_2', 'Twitter': '6408382'}

A
=

How many nodes and edges are there in the networks?

Manual calculation
------------------

### Facebook

Since facebook is an undirected graph, we use the `sorted` function to
avoid counting the same edge twice.

for instance, if we have an edge `(1, 2)` and `(2, 1)`, we will count it
as one edge. because after sorting, they will be `(1, 2)`.

In [5]:
facebook_edges = set()
with open(f"Group{group_num}/Facebook-Ego/{node_names['Facebook']}.edges", 'r') as f:
    for i in f:
        a,b = i.strip().split()
        facebook_edges.add(tuple(sorted((int(a), int(b)))))

list(facebook_edges)[:20]

[(3596, 3627),
 (3643, 3711),
 (3625, 3672),
 (3644, 3685),
 (3684, 3721),
 (3584, 3643),
 (3640, 3642),
 (3593, 3719),
 (3605, 3702),
 (3596, 3721),
 (3680, 3714),
 (3596, 3730),
 (3608, 3713),
 (3608, 3722),
 (3651, 3674),
 (3651, 3683),
 (3620, 3714),
 (3596, 3604),
 (3653, 3720),
 (3617, 3645)]

In [6]:
print(f"Number of edges in Facebook dataset: {len(facebook_edges)}")

Number of edges in Facebook dataset: 367


In [7]:
# number of nodes:
fnodes = set()
for edge in facebook_edges:
    fnodes.add(edge[0])
    fnodes.add(edge[1])
print(f"Number of nodes in Facebook dataset: {len(fnodes)}")

Number of nodes in Facebook dataset: 132


### Twitter

Twitter is a directed graph, so we don't need to sort the edges.

In [8]:
twiiter_edges = set()
with open(f"Group{group_num}/Twitter-Ego/{node_names['Twitter']}.edges", 'r') as f:
    for i in f:
        a,b = i.strip().split()
        twiiter_edges.add(tuple((int(a), int(b))))

list(twiiter_edges)[:20]

[(37665322, 177161911),
 (21808052, 809864),
 (80245884, 91585368),
 (23595873, 23739721),
 (21808052, 22139698),
 (16890327, 46422814),
 (89186018, 142450248),
 (16434310, 124444856),
 (462993829, 23739721),
 (16913834, 124444856),
 (17621204, 8362812),
 (15290966, 9224902),
 (35718360, 21755787),
 (21603325, 14844734),
 (16593859, 124886495),
 (14955344, 46422814),
 (14856594, 24844163),
 (37665322, 4620451),
 (14856594, 15290966),
 (177161911, 22139698)]

In [9]:
print(f"Number of edges in Twitter dataset: {len(twiiter_edges)}")

Number of edges in Twitter dataset: 3379


In [10]:
tnodes = set()
for edge in twiiter_edges:
    tnodes.add(edge[0])
    tnodes.add(edge[1])
print(f"Number of nodes in Twitter dataset: {len(tnodes)}")

Number of nodes in Twitter dataset: 151


Using NetworkX
--------------

### Facebook

In [11]:
import networkx as nx

In [12]:
facebook_graph = nx.read_edgelist(f"Group{group_num}/Facebook-Ego/{node_names['Facebook']}.edges", nodetype=int)
print("number of edges in facebook graph:", facebook_graph.number_of_edges())
print("number of nodes in facebook graph:", facebook_graph.number_of_nodes())

number of edges in facebook graph: 367
number of nodes in facebook graph: 132


### Twitter

In [13]:
twitter_graph = nx.read_edgelist(f"Group{group_num}/Twitter-Ego/{node_names['Twitter']}.edges", nodetype=int, create_using=nx.DiGraph)
print("number of edges in twitter graph:", twitter_graph.number_of_edges())
print("number of nodes in twitter graph:", twitter_graph.number_of_nodes())

number of edges in twitter graph: 3379
number of nodes in twitter graph: 151


B
=

What are the maximum degree and the average degree of the networks?

Manual calculation
------------------

### Facebook

In [14]:
facebook_degrees = defaultdict(int)
for a,b in facebook_edges:
    facebook_degrees[a] += 1
    facebook_degrees[b] += 1
facebook_degrees

defaultdict(int,
            {3596: 25,
             3627: 8,
             3643: 7,
             3711: 10,
             3625: 10,
             3672: 9,
             3644: 3,
             3685: 3,
             3684: 15,
             3721: 10,
             3584: 10,
             3640: 14,
             3642: 6,
             3593: 14,
             3719: 8,
             3605: 12,
             3702: 11,
             3680: 10,
             3714: 6,
             3730: 10,
             3608: 9,
             3713: 11,
             3722: 9,
             3651: 5,
             3674: 10,
             3683: 4,
             3620: 6,
             3604: 19,
             3653: 1,
             3720: 2,
             3617: 8,
             3645: 5,
             3687: 9,
             3731: 13,
             3635: 11,
             3667: 6,
             3670: 8,
             3692: 15,
             3693: 7,
             3586: 11,
             3606: 2,
             3662: 3,
             3669: 1,
             3700:

In [15]:
max_node, max_deg = max(facebook_degrees.items(), key=lambda x: x[1])
print(f"Node {max_node} has the highest degree of {max_deg} in Facebook dataset")

Node 3596 has the highest degree of 25 in Facebook dataset


In [16]:
print(f"Mean degree of nodes in Facebook dataset: {sum(facebook_degrees.values())/len(facebook_degrees):.4f}")

Mean degree of nodes in Facebook dataset: 5.5606


Also we know to find the the mean degree we have: $$
\bar{k} = \frac{\sum_{i=1}^{n} k_i}{N} = \frac{2E}{N}
$$

In [17]:
print(f"Median degree of nodes in Facebook dataset: {2*len(facebook_edges)/len(fnodes):.4f}")

Median degree of nodes in Facebook dataset: 5.5606


### Twitter

since twitter is a directed graph, we need to calculate the in-degree
and out-degree separately.

In [18]:
twiiter_in_degrees = defaultdict(int)
twiiter_out_degrees = defaultdict(int)

for a,b in twiiter_edges:
    twiiter_out_degrees[a] += 1
    twiiter_in_degrees[b] += 1

max_in_node, max_in_deg = max(twiiter_in_degrees.items(), key=lambda x: x[1])
max_out_node, max_out_deg = max(twiiter_out_degrees.items(), key=lambda x: x[1])

print(f"Node {max_in_node} has the highest in-degree of {max_in_deg} in Twitter dataset")
print(f"Node {max_out_node} has the highest out-degree of {max_out_deg} in Twitter dataset")
print(f"Mean in-degree of nodes in Twitter dataset: {sum(twiiter_in_degrees.values())/len(tnodes):.4f}")
print(f"Mean out-degree of nodes in Twitter dataset: {sum(twiiter_out_degrees.values())/len(tnodes):.4f}")
print(f"Mean degree of nodes in Twitter dataset: {(sum(twiiter_in_degrees.values())+sum(twiiter_out_degrees.values()))/len(tnodes):.4f}")
print(f"Median in-degree of nodes in Twitter dataset: {2*len(twiiter_edges)/len(tnodes):.4f}")

Node 16434310 has the highest in-degree of 83 in Twitter dataset
Node 24844163 has the highest out-degree of 72 in Twitter dataset
Mean in-degree of nodes in Twitter dataset: 22.3775
Mean out-degree of nodes in Twitter dataset: 22.3775
Mean degree of nodes in Twitter dataset: 44.7550
Median in-degree of nodes in Twitter dataset: 44.7550


### Using NetworkX

### Facebook

In [19]:
max_node, max_deg = max(dict(facebook_graph.degree()).items(), key=lambda x: x[1])
print(f"Node {max_node} has the highest degree of {max_deg} in Facebook dataset")
print(f"Mean degree of nodes in Facebook dataset: {sum(dict(facebook_graph.degree()).values())/len(dict(facebook_graph.degree())):.4f}")

Node 3596 has the highest degree of 25 in Facebook dataset
Mean degree of nodes in Facebook dataset: 5.5606


### Twitter

In [20]:
max_in_node, max_in_deg = max(dict(twitter_graph.in_degree()).items(), key=lambda x: x[1])
max_out_node, max_out_deg = max(dict(twitter_graph.out_degree).items(), key=lambda x: x[1])

print(f"Node {max_in_node} has the highest in-degree of {max_in_deg} in Twitter dataset")
print(f"Node {max_out_node} has the highest out-degree of {max_out_deg} in Twitter dataset")
print(f"Mean in-degree of nodes in Twitter dataset: {sum(dict(twitter_graph.in_degree()).values())/len(tnodes):.4f}")
print(f"Mean out-degree of nodes in Twitter dataset: {sum(dict(twitter_graph.out_degree()).values())/len(tnodes):.4f}")
print(f"Mean degree of nodes in Twitter dataset: {sum(dict(twitter_graph.degree()).values())/len(tnodes):.4f}")
print(f"Median in-degree of nodes in Twitter dataset: {2*len(twiiter_edges)/len(tnodes):.4f}")

Node 16434310 has the highest in-degree of 83 in Twitter dataset
Node 24844163 has the highest out-degree of 72 in Twitter dataset
Mean in-degree of nodes in Twitter dataset: 22.3775
Mean out-degree of nodes in Twitter dataset: 22.3775
Mean degree of nodes in Twitter dataset: 44.7550
Median in-degree of nodes in Twitter dataset: 44.7550


We can see all answers are consistent with each other.

C
=

Extract 5 - 8 nodes from the network and state them as a partial
network. What is the adjacency matrix of the partial network? Why do we
need adjacency matrix to describe the structure of the network?

In [21]:
n_nodes = 8

In [22]:
random.seed(0)
selected_facebook_nodes = random.sample(list(fnodes), n_nodes)
selected_facebook_nodes

[3692, 3702, 3595, 3654, 3731, 3722, 3697, 3666]

In [23]:
selected_twitter_nodes = random.sample(list(tnodes), n_nodes)
selected_twitter_nodes

[462993829,
 7623482,
 93006320,
 41385649,
 21808052,
 8904302,
 246691076,
 415643219]

Manual calculation
------------------

### Facebook

In [24]:
mini_facebook_adj = [[0]*n_nodes for _ in range(n_nodes)]
for i, node1 in enumerate(selected_facebook_nodes):
    for j, node2 in enumerate(selected_facebook_nodes):
        if (node1, node2) in facebook_edges or (node2, node1) in facebook_edges:
            mini_facebook_adj[i][j] = 1

print("Adjacency matrix of selected nodes in Facebook dataset:")
print("\n".join([" ".join(map(str, row)) for row in mini_facebook_adj]))

Adjacency matrix of selected nodes in Facebook dataset:
0 0 0 0 0 0 0 0
0 0 0 0 1 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 0 0 1 0
0 1 0 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 1 0 0 0 0
0 0 0 0 0 0 0 0


### Twitter

In [25]:
mini_twitter_adj = [[0]*n_nodes for _ in range(n_nodes)]
for i, node1 in enumerate(selected_twitter_nodes):
    for j, node2 in enumerate(selected_twitter_nodes):
        if (node1, node2) in twiiter_edges:
            mini_twitter_adj[i][j] = 1

print("Adjacency matrix of selected nodes in Twitter dataset:")
print("\n".join([" ".join(map(str, row)) for row in mini_twitter_adj]))

Adjacency matrix of selected nodes in Twitter dataset:
0 0 1 0 0 0 0 0
0 0 0 0 0 0 0 0
0 0 0 0 1 1 0 0
0 0 1 0 0 1 0 0
0 0 1 0 0 1 1 0
0 0 1 1 0 0 0 0
0 0 0 0 1 0 0 0
0 0 0 0 0 0 0 0


Using NetworkX
--------------

In [26]:
mini_facebook_graph = nx.subgraph(facebook_graph, selected_facebook_nodes)
print("Adjacency matrix of selected nodes in Facebook dataset:")
print(nx.adjacency_matrix(mini_facebook_graph).todense())

Adjacency matrix of selected nodes in Facebook dataset:


[[0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 1 0]]


In [27]:
mini_twitter_graph = nx.subgraph(twitter_graph, selected_twitter_nodes)
print("Adjacency matrix of selected nodes in Twitter dataset:")
print(nx.adjacency_matrix(mini_twitter_graph).todense())

Adjacency matrix of selected nodes in Twitter dataset:
[[0 0 0 0 0 0 1 0]
 [0 0 0 1 0 0 0 0]
 [0 0 0 1 1 0 0 0]
 [0 0 1 0 0 0 1 0]
 [0 0 1 1 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [1 0 1 1 0 0 0 0]
 [0 0 0 0 0 0 0 0]]


-   Why do we need adjacency matrix to describe the structure of the
    network?

    > The adjacency matrix is a simple way to represent the structure of
    > a graph. It is easy to understand and implement. It is also easy
    > to perform matrix operations on the adjacency matrix to analyze
    > the graph.