### Pre-process datasets  
Goal : make input files suitable for the algorithm.

In [1]:
import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
# import wget

In [2]:
%matplotlib notebook

In [3]:
def reciprocal_edges(G):
    N=G.number_of_nodes()
    nodes=list(G.nodes())
    edges=G.edges()
    unique_edges=[]
    reciprocal_list=[]
    for idxi in range(N):
        i=nodes[idxi]
        for idxj in range(idxi+1,N):
            j=nodes[idxj]
            if (i,j) in edges or (j,i) in edges: 
                unique_edges.append((i,j))
                if (i,j) in edges and (j,i) in edges: 
                    reciprocal_list.append((i,j))

    unique_edges = list(set(unique_edges))
    reciprocal = len(reciprocal_list)
 
    print("reciprocal:", reciprocal,len(unique_edges),"reciprocity:",reciprocal/float(len(unique_edges)),reciprocal/float(len(unique_edges)+reciprocal)) 
    return  unique_edges, reciprocal_list


In [4]:
!pwd

/Users/hsafdari/Dropbox/Dr_Caterina_de_Bacco/anomaly detection/data/input


In [5]:
!ls

[34mRD[m[m                              pre-process_dataset.ipynb
data_description.md             pre-process_facebookwaall.ipynb
dutch2.dat                      syn_500_3_20.0_0.8_True_0.dat
dutch6.dat                      [34msynthetic[m[m
facebook-wosn-wall.dat          theta_500_3_20.0_0.8_True_0.npz
pok_month0.dat                  twitter.dat
pre-process_WikiTalkht.ipynb    wiki_talk_ht.dat
pre-process_dataset-POK0.ipynb


In [32]:
outnet='pok_month0'
input_data = pd.read_csv(outnet+'.dat', skiprows=[0], header=None, sep='\s+')

In [33]:
df = pd.DataFrame(input_data)
df.head(n=2)

Unnamed: 0,0,1,2
0,0,1,4
1,0,2,1


In [34]:
df[2] = 1
df.head(2)

Unnamed: 0,0,1,2
0,0,1,1
1,0,2,1


### Remove 'E' from first column

In [35]:
# df=df.drop(0,axis=1)
# df.columns=[0,1,2]


In [36]:
df.head(n=2)

Unnamed: 0,0,1,2
0,0,1,1
1,0,2,1


In [37]:
df[2].unique()

array([1])

In [38]:
len(df)

18098

### Keep only some of the edges, based on the third column.  
For POK dataset, keep only one month of data.

In [39]:
# onemonth = 60*60*24*30
# df=input_data[input_data[2]<onemonth]
# len(df)

### Keep only nodes that have at least one incoming and one outgoing edge

In [40]:
# ego=input_data[0].unique()
# alter=input_data[1].unique()
# df=input_data[input_data[0].isin(nodes)]

ego=df[0].unique()
alter=df[1].unique()
nodes = set(ego).intersection(set(alter))
print(len(nodes))

df=df[df[0].isin(nodes)]
df=df[df[1].isin(nodes)]
len(df)

3504


18026

### Build multigraph

In [41]:
edge_list=[(n[0],n[1],len(g)) for n,g in df.groupby(by=[0,1])]

In [42]:
G = nx.MultiDiGraph()
G.add_weighted_edges_from(edge_list) # or G.add_edges_from(edge_list)

In [43]:
G = nx.MultiDiGraph()
G.add_weighted_edges_from(df.values)

In [44]:
list(G.edges(data=True))[:5]

[(0, 1, {'weight': 1}),
 (0, 2, {'weight': 1}),
 (0, 3, {'weight': 1}),
 (0, 6, {'weight': 1}),
 (0, 7, {'weight': 1})]

### Remove self loops

In [45]:
G.remove_edges_from(list(nx.selfloop_edges(G)))
G.number_of_nodes(),G.number_of_edges()

(3504, 18026)

In [22]:
G.out_degree()

OutMultiDegreeView({0: 10, 1: 39, 2: 47, 3: 63, 6: 31, 7: 6, 9: 77, 10: 4, 14: 2, 72: 3, 492: 11, 5: 20, 27: 6, 34: 4, 47: 11, 65: 16, 77: 8, 78: 10, 84: 5, 94: 13, 104: 5, 117: 3, 127: 25, 130: 4, 134: 8, 144: 27, 174: 3, 181: 4, 184: 3, 395: 14, 566: 14, 567: 9, 702: 9, 829: 30, 960: 21, 990: 5, 1797: 6, 2020: 1, 2271: 15, 2282: 2, 2296: 2, 2308: 2, 2581: 4, 3256: 9, 4195: 2, 4: 402, 8: 9, 13: 6, 15: 13, 17: 18, 19: 3, 22: 4, 40: 10, 52: 7, 54: 10, 55: 23, 56: 6, 57: 11, 58: 11, 62: 61, 66: 9, 75: 10, 76: 3, 91: 2, 99: 1, 123: 10, 128: 6, 143: 30, 183: 16, 342: 31, 374: 17, 453: 2, 497: 7, 498: 7, 862: 9, 965: 19, 973: 27, 980: 5, 1890: 4, 2139: 15, 2785: 10, 3747: 71, 16: 8, 87: 2, 115: 1, 131: 9, 177: 23, 199: 4, 230: 2, 451: 3, 525: 4, 591: 11, 601: 7, 682: 9, 773: 15, 840: 2, 971: 5, 988: 41, 1045: 2, 1070: 8, 1658: 13, 1997: 3, 2021: 2, 2052: 3, 2630: 4, 2660: 29, 2689: 2, 2887: 6, 3836: 4, 5279: 12, 59: 3, 64: 7, 100: 14, 135: 3, 293: 6, 454: 1, 1459: 23, 2379: 1, 2407: 1, 2663

### Remove nodes that have few out- or in-coming edges

In [46]:
outdegree = [ d[1] for d in list(G.out_degree())]
indegree = [ d[1] for d in list(G.in_degree())]

np.percentile(outdegree,50)

2.0

In [47]:
G1=nx.MultiDiGraph(G)

In [48]:
threshold=0

Gnodes = list(G.nodes())
nodes_to_remove=[n for n in Gnodes if G.out_degree(n)<threshold]
nodes_to_remove.extend([n for n in Gnodes if G.in_degree(n)<threshold])
nodes_to_remove=list(set(nodes_to_remove))
len(nodes_to_remove),len(Gnodes)

G1.remove_nodes_from(list(nodes_to_remove))
print(len(nodes_to_remove))

0


In [49]:
G1.number_of_nodes(),G1.number_of_edges()

(3504, 18026)

In [27]:
ncc = nx.number_weakly_connected_components(G1)
cc = list(nx.weakly_connected_components(G))
for c in range(ncc):
    print(c,len(cc[c]))

0 3433
1 2
2 2
3 2
4 2
5 2
6 4
7 2
8 2
9 2
10 2
11 2
12 2
13 2
14 2
15 2
16 2
17 2
18 2
19 2
20 2
21 2
22 2
23 2
24 2
25 2
26 2
27 2
28 3
29 2
30 2
31 2
32 2
33 2
34 2


In [28]:
Gc = max(nx.weakly_connected_components(G1), key=len)
Gc = cc[3]
nodes_to_remove=set(G1.nodes()).difference(Gc)
print("removed" ,len(nodes_to_remove),' nodes')
G1.remove_nodes_from(list(nodes_to_remove))
print(G1.number_of_nodes(),G1.number_of_edges())

removed 3502  nodes
2 2


In [50]:
unique_edges, reciprocal_list = reciprocal_edges(G1)

reciprocal: 7052 10974 reciprocity: 0.6426098049936213 0.39121269277709975


In [51]:
G1nodes = list(G1.nodes())
pos = nx.spring_layout(G1)

In [52]:
# subG = np.random.choice(G1nodes,400)
# k = G1.subgraph(subG)  
k =G1

plt.figure()
nx.draw_networkx(k, pos=pos,with_labels=False,node_size=10)

<IPython.core.display.Javascript object>

In [594]:

# nx.draw(G1)

### Output on file

In [53]:
outfile = 'RD/'+outnet+'.dat'
# nx.write_weighted_edgelist(G, '../data/input/'+outnet+'.dat')
nx.write_weighted_edgelist(G, outfile)
print(outfile)

RD/pok_month0.dat
