# Train Test Split

In [14]:
# 📦 Import Required Libraries
import os
import random
import pandas as pd

# 📍 Set base paths
facebook_edges_path = 'data/facebook/edges.txt'
pokec_relationships_path = 'data/pokec/relationships.txt'

# Make sure every dataset directory exists (no error if it is already there)
for _dataset_dir in ('data/facebook/', 'data/pokec/'):
    os.makedirs(_dataset_dir, exist_ok=True)

# 📚 Split Function (Reusable for both Facebook and Pokec)
def split_edges(input_path, output_folder, train_filename, test_filename, test_size=0.2, seed=42):
    """
    Splits the friendship/relationship edges into train and test sets.

    Reads a whitespace-separated edge list (one ``u v`` pair of integer node
    ids per line), shuffles it reproducibly, and writes the first
    ``1 - test_size`` fraction to the train file and the remainder to the
    test file, one ``u v`` pair per line.

    Args:
        input_path: Path of the edge-list text file to read (UTF-8).
        output_folder: Directory that receives both output files; it is
            created if it does not exist yet.
        train_filename: File name of the train split inside ``output_folder``.
        test_filename: File name of the test split inside ``output_folder``.
        test_size: Fraction of the edges, between 0 and 1, put in the test set.
        seed: Seed of the shuffle, so the same input gives the same split.

    Raises:
        ValueError: If ``test_size`` is outside [0, 1], or if a data line
            does not consist of exactly two integers.
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}")

    # A private generator produces the same shuffle as seeding the
    # module-level one, without resetting random state for the whole process.
    rng = random.Random(seed)

    # Step 1: Load edges
    edges = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            fields = line.split()
            # Blank lines and '#' comment/header lines carry no edge.
            if not fields or fields[0].startswith('#'):
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"{input_path}:{line_no}: expected 2 columns, got {len(fields)}"
                )
            try:
                edges.append((int(fields[0]), int(fields[1])))
            except ValueError as err:
                raise ValueError(
                    f"{input_path}:{line_no}: node ids must be integers: {line.strip()!r}"
                ) from err

    print(f"Total edges loaded from {input_path}: {len(edges)}")

    # Step 2: Shuffle and split
    rng.shuffle(edges)
    split_idx = int((1 - test_size) * len(edges))
    train_edges = edges[:split_idx]
    test_edges = edges[split_idx:]

    print(f"Train edges: {len(train_edges)}, Test edges: {len(test_edges)}")

    # Step 3: Save
    # An empty output_folder means "current directory"; makedirs('') would fail.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    for filename, subset in ((train_filename, train_edges), (test_filename, test_edges)):
        with open(os.path.join(output_folder, filename), 'w', encoding='utf-8') as f:
            f.writelines(f"{u} {v}\n" for u, v in subset)

    print(f"✅ Train/Test files saved successfully inside {output_folder}!\n")

# Split both datasets (Facebook first, then Pokec) with the same ratio and seed.
# Each entry: (input file, output folder, train file name, test file name)
_split_jobs = (
    (facebook_edges_path, 'data/facebook/', 'train_edges.txt', 'test_edges.txt'),
    (pokec_relationships_path, 'data/pokec/', 'train_relationships.txt', 'test_relationships.txt'),
)

for _source, _folder, _train_name, _test_name in _split_jobs:
    split_edges(
        input_path=_source,
        output_folder=_folder,
        train_filename=_train_name,
        test_filename=_test_name,
        test_size=0.2,
        seed=42,
    )

print("🎯 Done! Now both Facebook and Pokec datasets have Train/Test splits ready!")

Total edges loaded from data/facebook/edges.txt: 170174
Train edges: 136139, Test edges: 34035
✅ Train/Test files saved successfully inside data/facebook/!

Total edges loaded from data/pokec/relationships.txt: 30622564
Train edges: 24498051, Test edges: 6124513
✅ Train/Test files saved successfully inside data/pokec/!

🎯 Done! Now both Facebook and Pokec datasets have Train/Test splits ready!
