### Pre-process datasets  
Goal : make input files suitable for the algorithm.

In [1]:
import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
# import wget

In [2]:
%matplotlib notebook

In [3]:
def reciprocal_edges(G):
    N=G.number_of_nodes()
    nodes=list(G.nodes())
    edges=G.edges()
    unique_edges=[]
    reciprocal_list=[]
    for idxi in range(N):
        i=nodes[idxi]
        for idxj in range(idxi+1,N):
            j=nodes[idxj]
            if (i,j) in edges or (j,i) in edges: 
                unique_edges.append((i,j))
                if (i,j) in edges and (j,i) in edges: 
                    reciprocal_list.append((i,j))

    unique_edges = list(set(unique_edges))
    reciprocal = len(reciprocal_list)
 
    print("reciprocal:", reciprocal,len(unique_edges),"reciprocity:",reciprocal/float(len(unique_edges)),reciprocal/float(len(unique_edges)+reciprocal)) 
    return  unique_edges, reciprocal_list


In [4]:
!pwd

/Users/hsafdari/Dropbox/Dr_Caterina_de_Bacco/anomaly detection/data/input


In [5]:
!ls

[34mRD[m[m                              pre-process_facebookwaall.ipynb
data_description.md             syn_500_3_20.0_0.8_True_0.dat
dutch2.dat                      [34msynthetic[m[m
dutch6.dat                      theta_500_3_20.0_0.8_True_0.npz
facebook-wosn-wall.dat          twitter.dat
pre-process_WikiTalkht.ipynb    wiki_talk_ht.dat
pre-process_dataset.ipynb


In [6]:
outnet='facebook-wosn-wall'
input_data = pd.read_csv(outnet+'.dat', header=None, sep='\s+')

In [7]:
df = pd.DataFrame(input_data)
df.head(n=2)

Unnamed: 0,0,1,2,3
0,1,1,1,1095135831
1,2,3,1,1097725406


In [8]:
# df[2] = 1
# df.head(2)

### Remove 'E' from first column

In [9]:
# df=df.drop(0,axis=1)
# df.columns=[0,1,2]


In [10]:
df.head(n=2)

Unnamed: 0,0,1,2,3
0,1,1,1,1095135831
1,2,3,1,1097725406


In [11]:
df.drop([3], axis=1, inplace=True)

In [12]:
df[2].unique()

array([1])

In [13]:
len(df)

876993

### Keep only some of the edges, based on the third column.  
For POK dataset, keep only one month of data.

In [14]:
# onemonth = 60*60*24*30
# df=input_data[input_data[2]<onemonth]
# len(df)

### Keep only nodes that have at least one incoming and one outgoing edge

In [15]:
# ego=input_data[0].unique()
# alter=input_data[1].unique()
# df=input_data[input_data[0].isin(nodes)]

ego=df[0].unique()
alter=df[1].unique()
nodes = set(ego).intersection(set(alter))
print(len(nodes))

df=df[df[0].isin(nodes)]
df=df[df[1].isin(nodes)]
len(df)

35424


768246

In [16]:
df.head()

Unnamed: 0,0,1,2
0,1,1,1
1,2,3,1
2,4,4,1
3,5,5,1
4,2,3,1


### Build multigraph

In [17]:
edge_list=[(n[0],n[1],len(g)) for n,g in df.groupby(by=[0,1])]

In [18]:
G = nx.MultiDiGraph()
G.add_weighted_edges_from(edge_list) # or G.add_edges_from(edge_list)

In [19]:
G = nx.MultiDiGraph()
G.add_weighted_edges_from(df.values)

In [20]:
list(G.edges(data=True))[:5]

[(1, 1, {'weight': 1}),
 (2, 3, {'weight': 1}),
 (2, 3, {'weight': 1}),
 (2, 3, {'weight': 1}),
 (2, 3, {'weight': 1})]

### Remove self loops

In [21]:
G.remove_edges_from(list(nx.selfloop_edges(G)))
G.number_of_nodes(),G.number_of_edges()

(35358, 746795)

In [43]:
G.out_degree()

OutMultiDegreeView({1: 0, 2: 105, 3: 8, 4: 0, 5: 417, 6: 10, 9: 54, 10: 92, 11: 0, 12: 1, 13: 104, 14: 61, 15: 42, 16: 102, 19: 49, 20: 25, 17: 141, 21: 178, 22: 35, 23: 118, 24: 65, 27: 172, 28: 139, 29: 98, 30: 18, 31: 28, 32: 30, 33: 11, 34: 16, 35: 22, 36: 52, 37: 32, 38: 24, 39: 22, 40: 83, 43: 3, 44: 106, 45: 81, 46: 65, 47: 31, 48: 34, 49: 56, 50: 35, 51: 9, 52: 13, 53: 7, 54: 74, 55: 3, 56: 15, 57: 26, 58: 47, 59: 78, 60: 10, 61: 22, 62: 81, 63: 13, 64: 69, 65: 126, 66: 17, 68: 79, 69: 9, 70: 191, 71: 73, 72: 69, 73: 11, 74: 16, 75: 67, 76: 70, 77: 25, 78: 17, 79: 7, 80: 44, 81: 41, 82: 73, 83: 7, 84: 75, 85: 25, 86: 40, 87: 47, 88: 38, 89: 102, 90: 24, 91: 37, 92: 62, 93: 19, 94: 0, 95: 148, 96: 5, 97: 10, 98: 69, 99: 33, 100: 12, 101: 13, 102: 3, 103: 9, 104: 15, 105: 4, 106: 11, 107: 37, 108: 13, 109: 42, 110: 90, 111: 10, 112: 5, 113: 6, 114: 8, 115: 40, 116: 20, 117: 3, 118: 19, 119: 27, 120: 0, 41: 15, 121: 17, 122: 2, 123: 102, 124: 66, 127: 4, 128: 13, 129: 43, 130: 20,

### Remove nodes that have few out- or in-coming edges

In [22]:
outdegree = [ d[1] for d in list(G.out_degree())]
indegree = [ d[1] for d in list(G.in_degree())]

np.percentile(outdegree,50)

7.0

In [23]:
G1=nx.MultiDiGraph(G)

In [24]:
threshold=2

Gnodes = list(G.nodes())
nodes_to_remove=[n for n in Gnodes if G.out_degree(n)<threshold]
nodes_to_remove.extend([n for n in Gnodes if G.in_degree(n)<threshold])
nodes_to_remove=list(set(nodes_to_remove))
len(nodes_to_remove),len(Gnodes)

G1.remove_nodes_from(list(nodes_to_remove))
print(len(nodes_to_remove))

9076


In [25]:
G1.number_of_nodes(),G1.number_of_edges()

(26282, 717339)

In [26]:
ncc = nx.number_weakly_connected_components(G1)
cc = list(nx.weakly_connected_components(G))
for c in range(ncc):
    print(c,len(cc[c]))

0 1
1 33148
2 1
3 1
4 1
5 1
6 3
7 1
8 1
9 1
10 2
11 1
12 1
13 1
14 1
15 1
16 1
17 1
18 1
19 1
20 1
21 2
22 1
23 1
24 1
25 1
26 1
27 1
28 1
29 1
30 1
31 1
32 1
33 1
34 1
35 1
36 2
37 1
38 1
39 1
40 1
41 2
42 1
43 1
44 1
45 1
46 1
47 1
48 1
49 1
50 1
51 3
52 1
53 1
54 1
55 1
56 1
57 1
58 3
59 1
60 1
61 1
62 1
63 1
64 1
65 2
66 2
67 1
68 1
69 1
70 1
71 2
72 1
73 1
74 1
75 1
76 1
77 1
78 1
79 3
80 1
81 1
82 2
83 1
84 3
85 1
86 1
87 1
88 1
89 1
90 1
91 1
92 1
93 1
94 1
95 1
96 1
97 1
98 1
99 1
100 4
101 2
102 1
103 1
104 1
105 1
106 1
107 1
108 1
109 1
110 1
111 1
112 1
113 1
114 1
115 1
116 1
117 1
118 1
119 1
120 1
121 1
122 1
123 1
124 1
125 2
126 1
127 1
128 1
129 1
130 1
131 1
132 1
133 1
134 3
135 1
136 1
137 1
138 2
139 1
140 1
141 1
142 1
143 1
144 1
145 2
146 1
147 1
148 2
149 1
150 1
151 1
152 1
153 1
154 1
155 1
156 1
157 1
158 1
159 2
160 1
161 1
162 1
163 1
164 2
165 1
166 1
167 1
168 1
169 2
170 1
171 1
172 1
173 1
174 1
175 1
176 1
177 1
178 2
179 2
180 1
181 1
182 1
183 1
18

In [27]:
Gc = max(nx.weakly_connected_components(G1), key=len)
Gc = cc[3]
nodes_to_remove=set(G1.nodes()).difference(Gc)
print("removed" ,len(nodes_to_remove),' nodes')
G1.remove_nodes_from(list(nodes_to_remove))
print(G1.number_of_nodes(),G1.number_of_edges())

removed 26282  nodes
0 0


In [28]:
unique_edges, reciprocal_list = reciprocal_edges(G1)

ZeroDivisionError: float division by zero

In [29]:
G1nodes = list(G1.nodes())
pos = nx.spring_layout(G1)

In [30]:
# subG = np.random.choice(G1nodes,400)
# k = G1.subgraph(subG)  
k =G1

plt.figure()
nx.draw_networkx(k, pos=pos,with_labels=False,node_size=10)

<IPython.core.display.Javascript object>

In [31]:

# nx.draw(G1)

### Output on file

In [32]:
outfile = 'RD/'+outnet+'.dat'
# nx.write_weighted_edgelist(G, '../data/input/'+outnet+'.dat')
nx.write_weighted_edgelist(G, outfile)
print(outfile)

RD/facebook-wosn-wall.dat
