### Pre-process datasets  
Goal : make input files suitable for the algorithm.

In [1]:
import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
# import wget

In [2]:
%matplotlib notebook

In [3]:
def reciprocal_edges(G):
    N=G.number_of_nodes()
    nodes=list(G.nodes())
    edges=G.edges()
    unique_edges=[]
    reciprocal_list=[]
    for idxi in range(N):
        i=nodes[idxi]
        for idxj in range(idxi+1,N):
            j=nodes[idxj]
            if (i,j) in edges or (j,i) in edges: 
                unique_edges.append((i,j))
                if (i,j) in edges and (j,i) in edges: 
                    reciprocal_list.append((i,j))

    unique_edges = list(set(unique_edges))
    reciprocal = len(reciprocal_list)
 
    print("reciprocal:", reciprocal,len(unique_edges),"reciprocity:",reciprocal/float(len(unique_edges)),reciprocal/float(len(unique_edges)+reciprocal)) 
    return  unique_edges, reciprocal_list


In [4]:
!pwd

/Users/hsafdari/Documents/Dr_Caterina_de_Bacco/anomaly_detection/data/input


In [5]:
!ls

[34mRD[m[m
[34mWikipedia_vote_network [m[m
[34m__pycache__[m[m
adjacency_highschool_T1_1.dat
adjacency_highschool_T1_1.png
adjacency_highschool_T1_2.dat
adjacency_highschool_T2.dat
airport.csv
airports.dat
data_description.md
dutch2.dat
dutch6.dat
facebook-wosn-wall.dat
pok_month0.dat
[34mpolblogs[m[m
[34mpolbooks[m[m
pre-process_WikiTalkht.ipynb
pre-process_dataset-POK0.ipynb
pre-process_dataset-airports.ipynb
pre-process_dataset-polblogs.ipynb
pre-process_dataset-polbooks.ipynb
pre-process_dataset-rewiring_polbooks.ipynb
pre-process_dataset.ipynb
pre-process_dataset_pok_month0.ipynb
pre-process_facebookwaall.ipynb
syn_500_3_20.0_0.8_True_0.dat
[34msynthetic[m[m
test.ipynb
theta_500_3_20.0_0.8_True_0.npz
tools.py
twitter.dat
u.dat
u_syn.dat
v.dat
wiki_talk_ht.dat


In [6]:
outnet=''
input_data = pd.read_csv(outnet+'adjacency_highschool_T1_1.dat', header=None, skiprows=0, sep='\s+')

In [7]:
df = pd.DataFrame(input_data)
df.head(n=2)

Unnamed: 0,0,1,2
0,3,4,1
1,3,17,1


### Remove 'E' from first column

In [8]:
df[2].unique()

array([1])

In [9]:
len(df)

100

### Keep only some of the edges, based on the third column.  
For POK dataset, keep only one month of data.

In [10]:
# onemonth = 60*60*24*30
# df=input_data[input_data[2]<onemonth]
# len(df)

### Keep only nodes that have at least one incoming and one outgoing edge

In [11]:
# ego=input_data[0].unique()
# alter=input_data[1].unique()
# df=input_data[input_data[0].isin(nodes)]

ego=df[0].unique()
alter=df[1].unique()
nodes = set(ego).intersection(set(alter))
print(len(nodes))

df=df[df[0].isin(nodes)]
df=df[df[1].isin(nodes)]
len(df)

31


100

### Build multigraph

In [12]:
edge_list=[(n[0],n[1],len(g)) for n,g in df.groupby(by=[0,1])]

In [13]:
G = nx.DiGraph()
G.add_weighted_edges_from(edge_list) # or G.add_edges_from(edge_list)

In [14]:
list(G.edges(data=True))[:5]

[(3, 4, {'weight': 1}),
 (3, 17, {'weight': 1}),
 (3, 18, {'weight': 1}),
 (3, 42, {'weight': 1}),
 (4, 18, {'weight': 1})]

### Remove self loops

In [15]:
G.remove_edges_from(list(nx.selfloop_edges(G)))
G.number_of_nodes(),G.number_of_edges()

(31, 100)

In [16]:
G.out_degree()

OutDegreeView({3: 4, 4: 2, 17: 3, 18: 5, 42: 3, 10: 4, 49: 4, 51: 5, 52: 2, 15: 3, 40: 4, 26: 3, 25: 6, 31: 4, 34: 2, 35: 4, 39: 2, 41: 4, 36: 2, 29: 2, 47: 4, 30: 2, 32: 4, 61: 2, 67: 4, 43: 1, 46: 4, 57: 2, 60: 2, 64: 3, 59: 4})

### Remove nodes that have few out- or in-coming edges

In [17]:
outdegree = [ d[1] for d in list(G.out_degree())]
indegree = [ d[1] for d in list(G.in_degree())]

np.percentile(outdegree,50)

3.0

In [18]:
G1=nx.DiGraph(G)

In [19]:
threshold=0

Gnodes = list(G.nodes())
nodes_to_remove=[n for n in Gnodes if G.out_degree(n)<threshold]
nodes_to_remove.extend([n for n in Gnodes if G.in_degree(n)<threshold])
nodes_to_remove=list(set(nodes_to_remove))
len(nodes_to_remove),len(Gnodes)

G1.remove_nodes_from(list(nodes_to_remove))
print(len(nodes_to_remove))

0


In [20]:
G1.number_of_nodes(),G1.number_of_edges()

(31, 100)

In [33]:
ncc = nx.number_weakly_connected_components(G1)
cc = list(nx.weakly_connected_components(G))
for c in range(ncc):
    print(c,len(cc[c]))

0 3430
1 2
2 2
3 2
4 2
5 2
6 4
7 2
8 2
9 2
10 2
11 2
12 2
13 2
14 2
15 2
16 2
17 2
18 2
19 2
20 2
21 2
22 2
23 2
24 2
25 2
26 2
27 2
28 3
29 2
30 2
31 2
32 2
33 2
34 2


In [34]:
Gc = max(nx.weakly_connected_components(G1), key=len)
Gc = cc[3]
nodes_to_remove=set(G1.nodes()).difference(Gc)
print("removed" ,len(nodes_to_remove),' nodes')
G1.remove_nodes_from(list(nodes_to_remove))
print(G1.number_of_nodes(),G1.number_of_edges())

removed 3499  nodes
2 2


In [18]:
unique_edges, reciprocal_list = reciprocal_edges(G1)

reciprocal: 26 74 reciprocity: 0.35135135135135137 0.26


In [81]:
# G1nodes = list(G1.nodes())
# pos = nx.spring_layout(G1)
# subG = np.random.choice(G1nodes,400)
# k = G1.subgraph(subG)  
# k =G1

# plt.figure()
# nx.draw_networkx(k, pos=pos,with_labels=False,node_size=10)

In [20]:

# nx.draw(G1)

### Output on file

In [22]:
outfile = 'RD/adjacency_highschool_T1_1.dat'
# nx.write_weighted_edgelist(G, '../data/input/'+outnet+'.dat')
nx.write_weighted_edgelist(G1, outfile)
print(outfile)

RD/adjacency_highschool_T1_1.dat


In [20]:
G2=nx.DiGraph(G1)

In [21]:
# print([val for (node, val) in G2.degree()])

In [22]:
degree_sort = sorted(G2.degree, key=lambda x: x[1], reverse=False)

In [23]:
degree_sort

[(43, 2),
 (4, 3),
 (26, 4),
 (29, 4),
 (30, 4),
 (57, 4),
 (3, 5),
 (34, 5),
 (36, 5),
 (60, 5),
 (10, 6),
 (15, 6),
 (31, 6),
 (39, 6),
 (32, 6),
 (61, 6),
 (59, 6),
 (17, 7),
 (52, 7),
 (25, 7),
 (35, 7),
 (46, 7),
 (49, 8),
 (47, 8),
 (64, 8),
 (18, 9),
 (40, 9),
 (41, 9),
 (51, 10),
 (67, 10),
 (42, 11)]

In [25]:
list_num  = int(len(G2.nodes())*0.15)
print(list_num)
print('='*30)
node_list = []
for i in range(list_num): 
    node_list.append(degree_sort[i][0])
print(node_list)

4
[43, 4, 26, 29]


In [24]:
node_list = [43,4]

In [26]:
###  add edges randomely
# import random
# num_edge = 1000
# for rn_node in node_list:
#     set_difference = set(G2.nodes()) - set([rn_node])
#     list_difference = list(set_difference)
#     rn_list = random.sample(set_difference, num_edge)
#     l = [rn_node] * len(rn_list)
#     rn_edge  = list(zip(l,rn_list))
#     rn_edger = list(zip(rn_list,l))
#     G2.add_edges_from(rn_edge)
#     G2.add_edges_from(rn_edger)
# #     print(len(G2.edges()))

In [25]:
### add edges between the selected nodes
import random
added_edges = {}
num_edge = 10
for rn_node in node_list:
    set_difference = set(G2.nodes()) - set([rn_node])
    list_difference = list(set_difference)
    rn_list = random.sample(set_difference, num_edge)
    l = [rn_node] * len(rn_list)
    rn_edge  = list(zip(l,rn_list))
    rn_edger = list(zip(rn_list,l))
    G2.add_edges_from(rn_edge, weight=1)
#     G2.add_edges_from(rn_edger)
    added_edges[rn_node] = rn_edge
#     print(len(G2.edges()))

In [41]:
rn_edge

[(4, 10),
 (4, 64),
 (4, 31),
 (4, 3),
 (4, 41),
 (4, 59),
 (4, 17),
 (4, 26),
 (4, 60),
 (4, 51)]

In [26]:
len(G2.edges())-len(G1.edges())

20

In [27]:
added_edges

{43: [(43, 40),
  (43, 25),
  (43, 26),
  (43, 49),
  (43, 15),
  (43, 35),
  (43, 59),
  (43, 17),
  (43, 51),
  (43, 3)],
 4: [(4, 10),
  (4, 64),
  (4, 31),
  (4, 3),
  (4, 41),
  (4, 59),
  (4, 17),
  (4, 26),
  (4, 60),
  (4, 51)]}

In [29]:
G2.number_of_nodes(),G2.number_of_edges()

(31, 120)

In [30]:
unique_edges, reciprocal_list = reciprocal_edges(G2)

reciprocal: 27 93 reciprocity: 0.2903225806451613 0.225


In [31]:
unique_edges, reciprocal_list = reciprocal_edges(G1)

reciprocal: 26 74 reciprocity: 0.35135135135135137 0.26


In [32]:
import tools as tl
import importlib

In [33]:
importlib.reload(tl) 
network = 'RD/adjacency_highschool_T1_1.dat' 

A, B, B_T, data_T_vals = tl.import_data(network,header=0)
nodes = A[0].nodes()
pos = nx.spring_layout(A[0]) 
N = len(nodes)
L = B.shape[0] 
print(N)

RD/adjacency_highschool_T1_1.dat shape: (100, 3)
Number of nodes = 31
Number of layers = 1
Number of edges and average degree in each layer:
E[0] = 100 - <k> = 6.452
M[0] = 100 - <k_weighted> = 6.452
Reciprocity (networkX) = 0.52
Reciprocity (intended as the proportion of bi-directional edges over the unordered pairs) = 0.351
Reciprocity (considering the weights of the edges) = 0.52
31


In [34]:
outfile = 'RD/adjacency_highschool_T1_1_injected.dat'
# nx.write_weighted_edgelist(G, '../data/input/'+outnet+'.dat')
nx.write_weighted_edgelist(G2, 'RD/adjacency_highschool_T1_1_injected.dat') 
print(outfile)

RD/adjacency_highschool_T1_1_injected.dat


In [37]:
importlib.reload(tl) 
network = 'RD/adjacency_highschool_T1_1_injected.dat' 
print(network)
A0, B, B_T, data_T_vals = tl.import_data(network,header=0)
nodes = A0[0].nodes()
pos = nx.spring_layout(A0[0]) 
N = len(nodes)
L = B.shape[0] 
print(N)

RD/adjacency_highschool_T1_1_injected.dat
RD/adjacency_highschool_T1_1_injected.dat shape: (120, 3)
Number of nodes = 31
Number of layers = 1
Number of edges and average degree in each layer:
E[0] = 120 - <k> = 7.742
M[0] = 120 - <k_weighted> = 7.742
Reciprocity (networkX) = 0.45
Reciprocity (intended as the proportion of bi-directional edges over the unordered pairs) = 0.29
Reciprocity (considering the weights of the edges) = 0.45
31


In [38]:
# G1nodes = list(A0[0].nodes())
# pos = nx.spring_layout(A0[0])
# k =A0[0]

# plt.figure()
# nx.draw_networkx(k, pos=pos,with_labels=True,node_size=10)

In [39]:
# G1nodes = list(A[0].nodes())
# pos = nx.spring_layout(A[0])
# k =A[0]

# plt.figure()
# nx.draw_networkx(k, pos=pos,with_labels=True,node_size=10)

In [40]:
G1nodes = list(G1.nodes())
pos = nx.spring_layout(G1)
k =G1

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(18, 12))
# ax1.subplot(211)
nx.draw_networkx(k, pos=pos,with_labels=True,node_size=100, ax=ax1)

G1nodes = list(G2.nodes())
pos = nx.spring_layout(G2)
k2 =G2
# ax2.subplot(212)
nx.draw_networkx(k2, pos=pos,with_labels=True,node_size=100, ax=ax2)
fig.savefig('adjacency_highschool_T1_1.png', dpi=300)

<IPython.core.display.Javascript object>

In [84]:
G2.edges(43)

OutEdgeDataView([(43, 46), (43, 40), (43, 42), (43, 30), (43, 36), (43, 15), (43, 32), (43, 34), (43, 4), (43, 60), (43, 64), (43, 35), (43, 57), (43, 59), (43, 67)])

In [54]:
G1nodes = list(G2.nodes())
pos = nx.spring_layout(G2)

In [55]:
# subG = np.random.choice(G1nodes,400)
# k = G1.subgraph(subG)  
k =G2

plt.figure()
nx.draw_networkx(k, pos=pos,with_labels=False,node_size=10)

<IPython.core.display.Javascript object>

[(4, 22),
 (30, 22),
 (57, 21),
 (26, 20),
 (29, 19),
 (43, 19),
 (42, 14),
 (51, 13),
 (67, 13),
 (49, 12),
 (40, 12),
 (41, 12),
 (64, 12),
 (18, 11),
 (32, 11),
 (17, 10),
 (52, 10),
 (25, 10),
 (35, 10),
 (31, 9),
 (47, 9),
 (46, 9),
 (60, 9),
 (59, 9),
 (3, 8),
 (10, 8),
 (15, 8),
 (36, 8),
 (61, 8),
 (34, 6),
 (39, 6)]

In [44]:
len(G2.edges())

362497

In [45]:
len(G1.edges())

18022