### Pre-process datasets  
Goal : make input files suitable for the algorithm.

In [1]:
import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
# import wget

In [2]:
%matplotlib notebook

In [3]:
def reciprocal_edges(G):
    N=G.number_of_nodes()
    nodes=list(G.nodes())
    edges=G.edges()
    unique_edges=[]
    reciprocal_list=[]
    for idxi in range(N):
        i=nodes[idxi]
        for idxj in range(idxi+1,N):
            j=nodes[idxj]
            if (i,j) in edges or (j,i) in edges: 
                unique_edges.append((i,j))
                if (i,j) in edges and (j,i) in edges: 
                    reciprocal_list.append((i,j))

    unique_edges = list(set(unique_edges))
    reciprocal = len(reciprocal_list)
 
    print("reciprocal:", reciprocal,len(unique_edges),"reciprocity:",reciprocal/float(len(unique_edges)),reciprocal/float(len(unique_edges)+reciprocal)) 
    return  unique_edges, reciprocal_list


In [4]:
!pwd

/Users/hsafdari/Dropbox/Dr_Caterina_de_Bacco/anomaly detection/data/input


In [5]:
!ls

[34mRD[m[m
airport.csv
airports.dat
data_description.md
dutch2.dat
dutch6.dat
facebook-wosn-wall.dat
pok_month0.dat
[34mpolblogs[m[m
[34mpolbooks[m[m
pre-process_WikiTalkht.ipynb
pre-process_dataset-POK0.ipynb
pre-process_dataset-airports.ipynb
pre-process_dataset-polblogs.ipynb
pre-process_dataset-polbooks.ipynb
pre-process_dataset-rewiring_polbooks.ipynb
pre-process_dataset.ipynb
pre-process_facebookwaall.ipynb
syn_500_3_20.0_0.8_True_0.dat
[34msynthetic[m[m
test.ipynb
theta_500_3_20.0_0.8_True_0.npz
twitter.dat
u.dat
u_syn.dat
v.dat
wiki_talk_ht.dat


In [8]:
outnet='airports'
input_data = pd.read_csv(outnet+'.dat', header=None, sep='\s+')

In [9]:
df = pd.DataFrame(input_data)
df.head(n=2)

Unnamed: 0,source,target,w
0,0,1,1
1,0,37,1


In [10]:
df[2] = 1
df.head(2)

Unnamed: 0,0,1,2
0,1,2,1
1,1,3,1


### Remove 'E' from first column

In [11]:
# df=df.drop(0,axis=1)
# df.columns=[0,1,2]


In [10]:
df.head(n=2)

Unnamed: 0,source,target,w
0,0,1,1
1,0,37,1


In [13]:
df['w'].unique()

array([1, 0])

In [14]:
len(df)

3586

### Keep only some of the edges, based on the third column.  
For POK dataset, keep only one month of data.

In [15]:
# onemonth = 60*60*24*30
# df=input_data[input_data[2]<onemonth]
# len(df)

### Keep only nodes that have at least one incoming and one outgoing edge

In [15]:
# ego=input_data[0].unique()
# alter=input_data[1].unique()
# df=input_data[input_data[0].isin(nodes)]

ego=df['0'].unique()
alter=df[1].unique()
nodes = set(ego).intersection(set(alter))
print(len(nodes))

df=df[df[0].isin(nodes)]
df=df[df[1].isin(nodes)]
len(df)

219


2904

### Build multigraph

In [17]:
edge_list=[(n[0],n[1],len(g)) for n,g in df.groupby(by=[0,1])]

In [33]:
G = nx.DiGraph()
G.add_weighted_edges_from(edge_list) # or G.add_edges_from(edge_list)

In [34]:
G = nx.DiGraph()
G.add_weighted_edges_from(df.values)

In [35]:
list(G.edges(data=True))[:5]

[(1, 6, {'weight': 1}),
 (1, 7, {'weight': 1}),
 (1, 9, {'weight': 1}),
 (1, 13, {'weight': 1}),
 (1, 14, {'weight': 1})]

### Remove self loops

In [36]:
G.remove_edges_from(list(nx.selfloop_edges(G)))
G.number_of_nodes(),G.number_of_edges()

(219, 2325)

In [37]:
G.out_degree()

OutDegreeView({1: 86, 6: 67, 7: 21, 9: 32, 13: 41, 14: 85, 16: 25, 17: 33, 19: 31, 21: 60, 23: 45, 25: 42, 26: 46, 27: 29, 29: 15, 30: 42, 31: 6, 33: 37, 34: 14, 37: 63, 39: 60, 40: 12, 41: 31, 42: 6, 44: 13, 47: 11, 49: 57, 50: 11, 53: 12, 54: 26, 56: 47, 57: 13, 58: 10, 60: 19, 61: 32, 63: 34, 65: 28, 66: 20, 67: 7, 70: 22, 74: 9, 76: 16, 77: 18, 78: 3, 79: 14, 81: 3, 82: 30, 83: 3, 95: 22, 98: 1, 100: 20, 106: 34, 110: 4, 119: 14, 137: 8, 145: 1, 154: 1, 156: 19, 165: 1, 169: 6, 171: 6, 174: 10, 183: 3, 186: 13, 189: 4, 195: 3, 202: 2, 212: 3, 214: 1, 228: 1, 259: 0, 261: 0, 266: 1, 267: 1, 5: 21, 8: 16, 12: 3, 28: 6, 105: 7, 208: 5, 88: 1, 104: 0, 107: 1, 179: 2, 191: 1, 240: 3, 250: 0, 99: 1, 198: 8, 10: 52, 11: 80, 32: 29, 36: 21, 64: 10, 75: 25, 115: 17, 129: 4, 131: 14, 133: 5, 192: 15, 193: 5, 196: 9, 197: 5, 200: 6, 203: 3, 207: 1, 211: 2, 213: 7, 220: 4, 224: 6, 235: 2, 239: 4, 244: 0, 248: 1, 251: 0, 276: 1, 292: 0, 45: 8, 48: 6, 136: 4, 153: 3, 163: 12, 170: 2, 181: 2, 194

### Remove nodes that have few out- or in-coming edges

In [38]:
outdegree = [ d[1] for d in list(G.out_degree())]
indegree = [ d[1] for d in list(G.in_degree())]

np.percentile(outdegree,50)

3.0

In [39]:
G1=nx.DiGraph(G)

In [25]:
threshold=0

Gnodes = list(G.nodes())
nodes_to_remove=[n for n in Gnodes if G.out_degree(n)<threshold]
nodes_to_remove.extend([n for n in Gnodes if G.in_degree(n)<threshold])
nodes_to_remove=list(set(nodes_to_remove))
len(nodes_to_remove),len(Gnodes)

G1.remove_nodes_from(list(nodes_to_remove))
print(len(nodes_to_remove))

0


In [26]:
G1.number_of_nodes(),G1.number_of_edges()

(219, 2904)

In [27]:
ncc = nx.number_weakly_connected_components(G1)
cc = list(nx.weakly_connected_components(G))
for c in range(ncc):
    print(c,len(cc[c]))

0 219


In [28]:
Gc = max(nx.weakly_connected_components(G1), key=len)
Gc = cc[3]
nodes_to_remove=set(G1.nodes()).difference(Gc)
print("removed" ,len(nodes_to_remove),' nodes')
G1.remove_nodes_from(list(nodes_to_remove))
print(G1.number_of_nodes(),G1.number_of_edges())

IndexError: list index out of range

In [40]:
unique_edges, reciprocal_list = reciprocal_edges(G1)

reciprocal: 0 2325 reciprocity: 0.0 0.0


In [41]:
G1nodes = list(G1.nodes())
pos = nx.spring_layout(G1)

In [42]:
# subG = np.random.choice(G1nodes,400)
# k = G1.subgraph(subG)  
k =G1

plt.figure()
nx.draw_networkx(k, pos=pos,with_labels=False,node_size=10)

<IPython.core.display.Javascript object>

In [594]:

# nx.draw(G1)

### Output on file

In [43]:
outfile = 'RD/'+outnet+'.dat'
# nx.write_weighted_edgelist(G, '../data/input/'+outnet+'.dat')
nx.write_weighted_edgelist(G, outfile)
print(outfile)

RD/airports.dat
