In [None]:
import networkx as nx
import os
import pickle

# CONFIGURATION
# Directory scanned for the per-ego "<id>.edges", "<id>.feat" and
# "<id>.featnames" files.
DATA_DIR = 'Data/twitter' 
# Destination of the pickled graph written by save_graph().
OUTPUT_GRAPH_PATH = 'models/twitter_graph.gpickle'
# Limit graph size for laptop performance. The limit is only checked before
# each ego network is loaded, so the finished graph can exceed it by up to one
# ego network's worth of nodes.
MAX_NODES = 10000

def _add_follow_edges(G, edge_file):
    """Add the user-user edges listed in *edge_file* to *G*.

    Every non-blank line must contain exactly two whitespace-separated user
    IDs. Both are added as ``type='user'`` nodes and joined by an edge with
    ``relation='follows'``. A non-blank line with any other number of fields
    raises ``ValueError``.
    """
    with open(edge_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a stray newline at end of file)
                # instead of crashing on the tuple unpack below.
                continue
            u, v = line.split()
            G.add_node(u, type='user')
            G.add_node(v, type='user')
            G.add_edge(u, v, relation='follows')


def _load_feature_names(feat_names_path):
    """Return ``{feature_index: feature_name}`` parsed from a .featnames file.

    Each line is "<index> <name>"; everything after the first space is kept
    verbatim as the name. Blank lines are skipped.
    """
    feature_map = {}
    with open(feat_names_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            feat_id, _, feat_name = line.strip().partition(' ')
            if not feat_id:
                continue
            feature_map[int(feat_id)] = feat_name
    return feature_map


def _add_interest_edges(G, feat_path, feature_map):
    """Link users already present in *G* to the features flagged in a .feat file.

    Each line is "<user_id> <flag> <flag> ..." where the i-th flag ('0'/'1')
    refers to ``feature_map[i]``. Rows for users that are not yet nodes of *G*
    are ignored, as are flags whose index has no name in *feature_map*.
    """
    with open(feat_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            # split() (not split(' ')) so repeated spaces cannot shift the
            # positional feature indices.
            parts = line.split()
            if not parts:
                continue
            user_id, flags = parts[0], parts[1:]
            if not G.has_node(user_id):
                continue
            for idx, val in enumerate(flags):
                if val == '1' and idx in feature_map:
                    feat_node_name = f"Feat: {feature_map[idx]}"
                    G.add_node(feat_node_name, type='feature')
                    G.add_edge(user_id, feat_node_name, relation='has_interest')


def build_twitter_graph(data_dir, max_nodes=None):
    """Build an undirected user/feature graph from ego-network files.

    For every "<id>.edges" file in *data_dir* (processed in sorted order so
    that runs are reproducible) the user-user edges are added, and, when both
    "<id>.feat" and "<id>.featnames" exist, each known user is linked to a
    "Feat: <name>" node for every feature flagged '1'.

    Args:
        data_dir: Directory containing the ego-network files.
        max_nodes: Soft cap on the number of nodes. It is checked only before
            each ego network is loaded, so the result may exceed it by up to
            one ego network. Defaults to the module-level ``MAX_NODES``.

    Returns:
        The ``networkx.Graph`` with isolated nodes removed. User nodes carry
        ``type='user'`` and feature nodes ``type='feature'``.

    Raises:
        ValueError: If a non-blank .edges line does not have exactly two
            fields, or a .featnames line does not start with an integer.
    """
    if max_nodes is None:
        max_nodes = MAX_NODES

    G = nx.Graph()

    # The "<id>.edges" filenames identify the ego networks. Strip only the
    # suffix so an ID that itself contains a dot is not truncated, and sort
    # because os.listdir() returns entries in arbitrary order.
    suffix = '.edges'
    ego_ids = sorted(
        name[:-len(suffix)]
        for name in os.listdir(data_dir)
        if name.endswith(suffix)
    )

    print(f"--- Found {len(ego_ids)} Ego Networks ---")

    for ego_id in ego_ids:
        if G.number_of_nodes() >= max_nodes:
            break

        print(f"Processing Ego Network: {ego_id}...")

        # 1. Add User-User Edges (The "Follow" Graph)
        _add_follow_edges(G, os.path.join(data_dir, f"{ego_id}.edges"))

        # 2. Add User-Feature Edges (The "Interest" Graph)
        # .feat holds one binary vector per user; .featnames says what each
        # position in that vector means.
        feat_path = os.path.join(data_dir, f"{ego_id}.feat")
        feat_names_path = os.path.join(data_dir, f"{ego_id}.featnames")
        if os.path.exists(feat_path) and os.path.exists(feat_names_path):
            feature_map = _load_feature_names(feat_names_path)
            _add_interest_edges(G, feat_path, feature_map)

    print("--- Graph Built ---")
    print(f"Total Nodes: {G.number_of_nodes()}")
    print(f"Total Edges: {G.number_of_edges()}")

    # Filter: Remove isolated nodes to clean the graph. The isolates iterator
    # is materialised first because the graph is mutated while removing.
    print("Removing isolated nodes...")
    G.remove_nodes_from(list(nx.isolates(G)))
    print(f"Final Node Count: {G.number_of_nodes()}")

    return G

def save_graph(G, output_path):
    """Pickle *G* to *output_path*, creating the parent directory if needed.

    Args:
        G: The object to serialise (any picklable object).
        output_path: Destination file. May be a bare filename, in which case
            the file is written to the current working directory.

    Note:
        The file is a plain pickle; only load it back from a trusted source.
    """
    parent_dir = os.path.dirname(output_path)
    # dirname() is '' for a bare filename and os.makedirs('') raises
    # FileNotFoundError, so only create a directory when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(output_path, 'wb') as f:
        pickle.dump(G, f)
    print(f"--- Graph saved to {output_path} ---")

if __name__ == "__main__":
    # Script entry point: build the graph from DATA_DIR and persist it,
    # or tell the user how to obtain the data when the directory is missing.
    data_available = os.path.exists(DATA_DIR)
    if data_available:
        twitter_graph = build_twitter_graph(DATA_DIR)
        save_graph(twitter_graph, OUTPUT_GRAPH_PATH)
    else:
        print(f"Error: Directory {DATA_DIR} not found. Please extract twitter.tar.gz there.")

--- Found 973 Ego Networks ---
Processing Ego Network: 100318079...
Processing Ego Network: 10146102...
Processing Ego Network: 101859065...
Processing Ego Network: 101903164...
Processing Ego Network: 102765423...
Processing Ego Network: 102903198...
Processing Ego Network: 103431502...
Processing Ego Network: 103865085...
Processing Ego Network: 103991905...
Processing Ego Network: 104324908...
Processing Ego Network: 104615636...
Processing Ego Network: 1046661...
Processing Ego Network: 104991493...
Processing Ego Network: 105150583...
Processing Ego Network: 105398724...
Processing Ego Network: 105918870...
--- Graph Built ---
Total Nodes: 10160
Total Edges: 53548
Removing isolated nodes...
Final Node Count: 10160
--- Graph saved to ./twitter_graph.gpickle ---
