# FB-WOSN-FRIENDS

Reference: http://networkrepository.com/fb-wosn-friends.php

**Describe**: 
> Facebook friendship graph where nodes are users and edges between the users represent friendship relations/edges. The fourth column represents the time of an edge (unix timestamp). Note that some edge timestamps are missing and are assigned 0.

# Library

In [63]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import requests
import zipfile
from datetime import datetime
import os

# Download

In [64]:
link_dts = 'http://nrvis.com/download/data/dynamic/fb-wosn-friends.zip'
dts_zip = 'fb-wosn-friends.zip'
dts_name = 'fb-wosn-friends.edges'

In [65]:
r1 = requests.get(link_dts, allow_redirects=True)
open(dts_zip, 'wb').write(r1.content)

7481438

In [66]:
with zipfile.ZipFile(dts_zip, 'r') as zip_ref:
    zip_ref.extractall()

# Handle data

In [67]:
df = None
with open(dts_name, 'r') as fi:
    lines = fi.readlines() 
    print(lines[:6])
    lines = lines[4:]
    lines_ = [list(map(int, line.strip().split())) for line in lines ]
    print(lines_[:4])
    df = pd.DataFrame(data=lines_, columns=['node_1', 'node_2', 'group', 'timestamp'])

print()
print(df.dtypes)


['% sym unweighted\n', '% 1269502 63731 63731\n', '1 2 1 0\n', '1 2 1 1202446730\n', '1 3 1 0\n', '1 4 1 0\n']
[[1, 3, 1, 0], [1, 4, 1, 0], [1, 4, 1, 1213591678], [1, 5, 1, 0]]

node_1       int64
node_2       int64
group        int64
timestamp    int64
dtype: object


In [68]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1269500 entries, 0 to 1269499
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   node_1     1269500 non-null  int64
 1   node_2     1269500 non-null  int64
 2   group      1269500 non-null  int64
 3   timestamp  1269500 non-null  int64
dtypes: int64(4)
memory usage: 38.7 MB


In [69]:
df.describe()

Unnamed: 0,node_1,node_2,group,timestamp
count,1269500.0,1269500.0,1269500.0,1269500.0
mean,13732.85,24524.45,1.0,750726800.0
std,11300.7,13828.4,0.0,586897300.0
min,1.0,3.0,1.0,0.0
25%,4743.0,13426.0,1.0,0.0
50%,11041.0,23626.0,1.0,1190734000.0
75%,20247.0,34407.0,1.0,1219291000.0
max,63697.0,63731.0,1.0,1232576000.0


We will drop `group` column and which row has value `timestamp = 0`. We can not create a temporal network without getting time stamp

In [70]:
df.drop(columns='group', inplace=True)

In [71]:
df = df[df.timestamp != 0]

In [72]:
df.describe()

Unnamed: 0,node_1,node_2,timestamp
count,788134.0,788134.0,788134.0
mean,14852.746297,26157.133782,1209246000.0
std,11659.723044,13860.294103,19069040.0
min,1.0,3.0,1157455000.0
25%,5494.0,15213.0,1196929000.0
50%,12017.0,25490.0,1215498000.0
75%,21975.0,35869.0,1224544000.0
max,63697.0,63731.0,1232576000.0


# Creating dynamic graph
Divide timestamp to `k` bin means `k` graph. Afterthat, we have 1 dynamic graph with `k` snapshot (static graph)

In [73]:
k = 25

In [74]:
timestamp_range = (df.timestamp.max() - df.timestamp.min() + 1)//k 
timestamp_range

3004847

In [75]:
graphs_df = []
print("Start time: ", datetime.fromtimestamp(df.timestamp.min()) )
for i in range(k):
    upper_time = df.timestamp.min() + timestamp_range*(i+1)
    print(f"[{i}|\tUpper_time= {datetime.fromtimestamp(upper_time)}\t |Row|= {len(df[df.timestamp<upper_time])}")
    if i == k-1:
        graph_df = df.copy()
    else:
        graph_df = df[df.timestamp<upper_time].copy()
    graphs_df.append(graph_df)

Start time:  2006-09-05 11:15:29
[0|	Upper_time= 2006-10-10 05:56:16	 |Row|= 8861
[1|	Upper_time= 2006-11-14 00:37:03	 |Row|= 19356
[2|	Upper_time= 2006-12-18 19:17:50	 |Row|= 29801
[3|	Upper_time= 2007-01-22 13:58:37	 |Row|= 40598
[4|	Upper_time= 2007-02-26 08:39:24	 |Row|= 52674
[5|	Upper_time= 2007-04-02 03:20:11	 |Row|= 65483
[6|	Upper_time= 2007-05-06 22:00:58	 |Row|= 78461
[7|	Upper_time= 2007-06-10 16:41:45	 |Row|= 93314
[8|	Upper_time= 2007-07-15 11:22:32	 |Row|= 110335
[9|	Upper_time= 2007-08-19 06:03:19	 |Row|= 131464
[10|	Upper_time= 2007-09-23 00:44:06	 |Row|= 151754
[11|	Upper_time= 2007-10-27 19:24:53	 |Row|= 173696
[12|	Upper_time= 2007-12-01 14:05:40	 |Row|= 194268
[13|	Upper_time= 2008-01-05 08:46:27	 |Row|= 210541
[14|	Upper_time= 2008-02-09 03:27:14	 |Row|= 227698
[15|	Upper_time= 2008-03-14 22:08:01	 |Row|= 248294
[16|	Upper_time= 2008-04-18 16:48:48	 |Row|= 288180
[17|	Upper_time= 2008-05-23 11:29:35	 |Row|= 332550
[18|	Upper_time= 2008-06-27 06:10:22	 |Row|= 37874

In [76]:
graphs = []
for i in range(k):
    g = nx.from_pandas_edgelist(graphs_df[i], "node_1", "node_2", create_using=nx.Graph())
    graphs.append(g)
    print(f"Graph {i+1}:\t|V|={g.number_of_nodes()}\t|E|={g.number_of_edges()}")

Graph 1:	|V|=4664	|E|=7230
Graph 2:	|V|=7005	|E|=15753
Graph 3:	|V|=8624	|E|=24250
Graph 4:	|V|=9944	|E|=33001
Graph 5:	|V|=11210	|E|=42760
Graph 6:	|V|=12385	|E|=53116
Graph 7:	|V|=13535	|E|=63574
Graph 8:	|V|=14816	|E|=75523
Graph 9:	|V|=16203	|E|=89113
Graph 10:	|V|=17770	|E|=105774
Graph 11:	|V|=19568	|E|=121876
Graph 12:	|V|=21692	|E|=139376
Graph 13:	|V|=23795	|E|=155790
Graph 14:	|V|=25510	|E|=168976
Graph 15:	|V|=27582	|E|=182773
Graph 16:	|V|=29890	|E|=199180
Graph 17:	|V|=32247	|E|=230491
Graph 18:	|V|=34656	|E|=265211
Graph 19:	|V|=37452	|E|=301329
Graph 20:	|V|=40925	|E|=343844
Graph 21:	|V|=44756	|E|=391987
Graph 22:	|V|=48700	|E|=445780
Graph 23:	|V|=53127	|E|=507331
Graph 24:	|V|=58502	|E|=580808
Graph 25:	|V|=61096	|E|=614796


# Save dynamic graph

In [77]:
NUMBER_SAVE_GRAPH = 10

In [78]:
folder = "../data/fb"
if not os.path.exists(folder):
    os.makedirs(folder)

In [79]:
for i in range(min(NUMBER_SAVE_GRAPH, k)):
    nx.write_edgelist(graphs[i],f'{folder}/graph_{str(i//10)+str(i%10)}.edgelist',data=False)