In [32]:
import networkx as nx
import pandas as pd
import tarfile
from io import BytesIO

# Replace 'facebook.tar.gz' with the actual file name
tar_filename = 'facebook.tar.gz'

# Extract the files from the tar.gz archive
with tarfile.open(tar_filename, 'r:gz') as tar:
    edge_file = None
    feat_files,circ_files,ego_files,egof_files,fn_files = [],[],[],[],[]
    # Iterate through all files in the tar archive
    for member in tar.getmembers():
        # Check for edge file
        if member.name.endswith('.edges'):
            edge_file = member.name
        # Check for feat files
        elif member.name.endswith('.feat'):
            feat_files.append(member.name)
        elif member.name.endswith('.circles'):
            circ_files.append(member.name)
        elif member.name.endswith('.featnames'):
            fn_files.append(member.name)
        elif member.name.endswith('.egofeat'):
            egof_files.append(member.name)

    # Read the edge list
    edges = pd.read_csv(tar.extractfile(edge_file), delimiter=' ', header=None, names=['node1', 'node2'])

    # Read the node features
    feature_data = []
    for feat_file in feat_files:
        features = pd.read_csv(tar.extractfile(feat_file), delimiter=' ', header=None, names=['node'] + [f'feature_{i}' for i in range(1, 10)])
        feature_data.append(features)
    # Read the node features
    circles_data = []
    for circ_file in circ_files:
        circ = pd.read_csv(tar.extractfile(circ_file), delimiter=' ', header=None, names=['node'] + [f'circ_{i}' for i in range(1, 10)])
        circles_data.append(circ)
    # Read the node features
    fn_data = []
    for fn_file in fn_files:
        fn = pd.read_csv(tar.extractfile(fn_file), delimiter=' ', header=None, names=['node'] + [f'feat_name_{i}' for i in range(1, 10)])
        fn_data.append(fn)
    # Read the node features
    egof_data = []
    for egof_file in egof_files:
        egof = pd.read_csv(tar.extractfile(egof_file), delimiter=' ', header=None, names=['node'] + [f'ego_feature_{i}' for i in range(1, 10)])
        egof_data.append(egof)

# Concatenate the feature DataFrames and create a graph
features = pd.concat(feature_data, ignore_index=True)
circles = pd.concat(circles_data, ignore_index=True)
featnames = pd.concat(fn_data, ignore_index=True)
egonames = pd.concat(egof_data, ignore_index=True)
G = nx.from_pandas_edgelist(edges, source='node1', target='node2')
features.set_index('node', inplace=True)

In [37]:
edges.head(3)

Unnamed: 0,node1,node2
0,827,819
1,830,826
2,828,752


In [38]:
features.head(3)

Unnamed: 0_level_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0


In [39]:
circles.head(3)

Unnamed: 0,node,circ_1,circ_2,circ_3,circ_4,circ_5,circ_6,circ_7,circ_8,circ_9
0,circle0\t3518\t3860\t3605\t3488\t3938\t3574\t3...,,,,,,,,,
1,circle1\t3786\t3623\t3947\t3782,,,,,,,,,
2,circle2\t3963\t3771\t3939\t3682\t3624\t3894\t3949,,,,,,,,,


In [40]:
featnames.head(3)

Unnamed: 0,node,feat_name_1,feat_name_2,feat_name_3,feat_name_4,feat_name_5,feat_name_6,feat_name_7,feat_name_8,feat_name_9
0,0,birthday;anonymized,feature,0,,,,,,
1,1,birthday;anonymized,feature,1,,,,,,
2,2,birthday;anonymized,feature,2,,,,,,
