In [None]:
#Run this code to get node_day_recipients_synthetic.csv

import pandas as pd
import numpy as np
import re
from datetime import datetime
from collections import defaultdict
from tqdm import tqdm  # ✅ progress bar

# === Load mapping (SYNTHETIC VERSION) ===
# Read without a header row; the columns are named node_id and email.
id_email_df = pd.read_csv(
    'id-email-synthetic.csv',
    header=None,
    names=['node_id', 'email'],
)
# Lookup table email -> node id (the last row wins if an email repeats).
email_to_node = {
    email: node_id
    for email, node_id in zip(id_email_df['email'], id_email_df['node_id'])
}
# One output row is later produced per mapping row.
total_nodes = len(id_email_df)

# === Load emails (SYNTHETIC VERSION) ===
# NOTE(review): another cell of this notebook reads 'emails_synthetic.csv'
# (underscore instead of hyphen) - confirm which file name is the right one.
emails = pd.read_csv('emails-synthetic.csv', names=['message'])

# === Helper: extract possibly multiline header field ===
def extract_header_field(header_name, message):
    """
    Extract a header field (e.g., To, Cc, Bcc) together with its folded
    continuation lines.

    Parameters
    ----------
    header_name : str
        Header name without the trailing colon. It is interpolated into a
        regular expression as-is and matched case-insensitively at the start
        of a line.
    message : str
        Raw email text.

    Returns
    -------
    str or None
        The header value with its continuation lines joined by single spaces,
        or None when no line starts with ``header_name:``. A header that is
        present but empty yields ``""``.
    """
    # Use [ \t]* rather than \s*: \s also matches newlines, so an empty header
    # such as "Cc: " would otherwise capture the *next* line as its value.
    # NOTE(review): the search covers the whole message, so when the real
    # header is absent a "To:" line quoted in the body is matched instead.
    pattern = rf'^{header_name}:[ \t]*(.*)'
    match = re.search(pattern, message, re.MULTILINE | re.IGNORECASE)
    if not match:
        return None

    parts = [match.group(1).strip()]

    # Only the lines *directly* following the header that start with a space
    # or tab are continuations. Stop at the first line that is not, so that
    # indented lines further down (e.g. in the body) are never pulled in.
    # The first split element is the remainder of the header line itself.
    for line in message[match.end():].split('\n')[1:]:
        if not line.startswith((' ', '\t')):
            break
        text = line.strip()
        if not text:
            # A whitespace-only line is treated as the end of the header.
            break
        parts.append(text)

    return " ".join(parts).strip()

# === Extract fields ===
def extract_info(message):
    """Extract Date, From, To, Cc, and Bcc fields from raw email text.

    Parameters
    ----------
    message : str
        Raw email text.

    Returns
    -------
    tuple
        ``(date, senders, recipients)`` where ``date`` is the parsed Date
        header converted to UTC (NaT when it cannot be parsed, None when no
        Date line is found), ``senders`` is the list of addresses found on
        the From line, and ``recipients`` is the list of addresses found in
        To, Cc and Bcc, in that order and without de-duplication.
    """
    email_pattern = r'[\w\.-]+@[\w\.-]+\.\w+'
    flags = re.MULTILINE | re.IGNORECASE

    # Use [ \t]* rather than \s*: \s also matches newlines, so an empty
    # "From:" / "Date:" line would otherwise capture the following header
    # line (e.g. turn the To addresses into senders).
    date_match = re.search(r'^Date:[ \t]*(.*)', message, flags)
    from_match = re.search(r'^From:[ \t]*([^\n\r]*)', message, flags)

    # Extract multiline To, Cc, Bcc
    to_text = extract_header_field('To', message)
    cc_text = extract_header_field('Cc', message)
    bcc_text = extract_header_field('Bcc', message)

    # Parse date and addresses; unparseable dates are coerced to NaT
    if date_match:
        date = pd.to_datetime(date_match.group(1).strip(), errors='coerce', utc=True)
    else:
        date = None
    senders = re.findall(email_pattern, from_match.group(1)) if from_match else []

    # Combine recipients from To, Cc, Bcc
    recipients = []
    for field_text in (to_text, cc_text, bcc_text):
        if field_text:
            # Find emails (works with commas, semicolons, quoted names)
            recipients += re.findall(email_pattern, field_text)

    return date, senders, recipients

# === Process all emails ===
print("Extracting email info (To, Cc, Bcc with multi-line support)...")
email_data = []
for msg in tqdm(emails['message'], desc="Parsing emails"):
    email_data.append(extract_info(msg))

# === Build node-day structure ===
# Each sender node gets total_days lists; slot i holds the recipient nodes
# of messages dated i days after base_date.
total_days = 1448
base_date = pd.Timestamp('1999-01-01', tz='UTC')
node_day_structure = defaultdict(lambda: [[] for _ in range(total_days)])

print("Building node-day structure...")
for date, senders, recipients in tqdm(email_data, desc="Building structure"):
    # Skip messages without a usable date, sender or recipient
    if pd.isna(date) or not (senders and recipients):
        continue

    day_index = int((date - base_date).days)
    if not (0 <= day_index < total_days):
        continue

    # Resolve the recipients once per message; unknown addresses are dropped
    r_nodes = [node for node in map(email_to_node.get, recipients) if node is not None]
    if not r_nodes:
        continue

    for sender in senders:
        s_node = email_to_node.get(sender)
        if s_node is not None:
            node_day_structure[s_node][day_index].extend(r_nodes)

# === Finalize output ===
print("Finalizing output data...")
output_data = []
# NOTE(review): iterating range(total_nodes) presumes node ids are exactly
# 0..total_nodes-1 - confirm against the mapping file.
for node_id in tqdm(range(total_nodes), desc="Processing nodes"):
    if node_id in node_day_structure:
        days = node_day_structure[node_id]
    else:
        # Node never appeared as a sender: all days are empty
        days = [[] for _ in range(total_days)]
    # Remove duplicates and sort per day
    days = [sorted(set(day)) for day in days]
    output_data.append(dict(node_id=node_id, day_recipients_str=str(days)))

# === Save CSV (SYNTHETIC VERSION) ===
output_df = pd.DataFrame(output_data)
output_df.to_csv('node_day_recipients_synthetic.csv', index=False)

print("✓ node_day_recipients_synthetic.csv created with To, Cc, Bcc (multiline and comma-separated supported)")

Extracting email info (To, Cc, Bcc with multi-line support)...


Parsing emails: 100%|██████████| 517402/517402 [08:54<00:00, 968.27it/s] 


Building node-day structure...


Building structure: 100%|██████████| 517402/517402 [00:09<00:00, 57128.58it/s] 


Finalizing output data...


Processing nodes: 100%|██████████| 6600/6600 [00:07<00:00, 861.85it/s] 


✓ node_day_recipients.csv created with To, Cc, Bcc (multiline and comma-separated supported)


In [2]:
#Run this code to get node_day_recipients_synthetic.csv

import pandas as pd
import numpy as np
import re
from datetime import datetime
from collections import defaultdict
from tqdm import tqdm  # ✅ progress bar

# === Load mapping (SYNTHETIC VERSION) ===
# Read without a header row; the columns are named node_id and email.
id_email_df = pd.read_csv(
    'id-email-synthetic.csv',
    header=None,
    names=['node_id', 'email'],
)
# Lookup table email -> node id (the last row wins if an email repeats).
email_to_node = {
    email: node_id
    for email, node_id in zip(id_email_df['email'], id_email_df['node_id'])
}
# One output row is later produced per mapping row.
total_nodes = len(id_email_df)

# === Load emails (SYNTHETIC VERSION) ===
# NOTE(review): another cell of this notebook reads 'emails-synthetic.csv'
# (hyphen instead of underscore) - confirm which file name is the right one.
emails = pd.read_csv('emails_synthetic.csv', names=['message'])

# === Helper: extract possibly multiline header field ===
def extract_header_field(header_name, message):
    """
    Extract a header field (e.g., To, Cc, Bcc) together with its folded
    continuation lines.

    Parameters
    ----------
    header_name : str
        Header name without the trailing colon. It is interpolated into a
        regular expression as-is and matched case-insensitively at the start
        of a line.
    message : str
        Raw email text.

    Returns
    -------
    str or None
        The header value with its continuation lines joined by single spaces,
        or None when no line starts with ``header_name:``. A header that is
        present but empty yields ``""``.
    """
    # Use [ \t]* rather than \s*: \s also matches newlines, so an empty header
    # such as "Cc: " would otherwise capture the *next* line as its value.
    # NOTE(review): the search covers the whole message, so when the real
    # header is absent a "To:" line quoted in the body is matched instead.
    pattern = rf'^{header_name}:[ \t]*(.*)'
    match = re.search(pattern, message, re.MULTILINE | re.IGNORECASE)
    if not match:
        return None

    parts = [match.group(1).strip()]

    # Only the lines *directly* following the header that start with a space
    # or tab are continuations. Stop at the first line that is not, so that
    # indented lines further down (e.g. in the body) are never pulled in.
    # The first split element is the remainder of the header line itself.
    for line in message[match.end():].split('\n')[1:]:
        if not line.startswith((' ', '\t')):
            break
        text = line.strip()
        if not text:
            # A whitespace-only line is treated as the end of the header.
            break
        parts.append(text)

    return " ".join(parts).strip()

# === Extract fields ===
def extract_info(message):
    """Extract Date, From, To, Cc, and Bcc fields from raw email text.

    Parameters
    ----------
    message : str
        Raw email text.

    Returns
    -------
    tuple
        ``(date, senders, recipients)`` where ``date`` is the parsed Date
        header converted to UTC (NaT when it cannot be parsed, None when no
        Date line is found), ``senders`` is the list of addresses found on
        the From line, and ``recipients`` is the list of addresses found in
        To, Cc and Bcc, in that order and without de-duplication.
    """
    email_pattern = r'[\w\.-]+@[\w\.-]+\.\w+'
    flags = re.MULTILINE | re.IGNORECASE

    # Use [ \t]* rather than \s*: \s also matches newlines, so an empty
    # "From:" / "Date:" line would otherwise capture the following header
    # line (e.g. turn the To addresses into senders).
    date_match = re.search(r'^Date:[ \t]*(.*)', message, flags)
    from_match = re.search(r'^From:[ \t]*([^\n\r]*)', message, flags)

    # Extract multiline To, Cc, Bcc
    to_text = extract_header_field('To', message)
    cc_text = extract_header_field('Cc', message)
    bcc_text = extract_header_field('Bcc', message)

    # Parse date and addresses; unparseable dates are coerced to NaT
    if date_match:
        date = pd.to_datetime(date_match.group(1).strip(), errors='coerce', utc=True)
    else:
        date = None
    senders = re.findall(email_pattern, from_match.group(1)) if from_match else []

    # Combine recipients from To, Cc, Bcc
    recipients = []
    for field_text in (to_text, cc_text, bcc_text):
        if field_text:
            # Find emails (works with commas, semicolons, quoted names)
            recipients += re.findall(email_pattern, field_text)

    return date, senders, recipients

# === Process all emails ===
print("Extracting email info (To, Cc, Bcc with multi-line support)...")
email_data = []
for msg in tqdm(emails['message'], desc="Parsing emails"):
    email_data.append(extract_info(msg))

# === Build node-day structure ===
# Each sender node gets total_days lists; slot i holds the recipient nodes
# of messages dated i days after base_date.
total_days = 1448
base_date = pd.Timestamp('1999-01-01', tz='UTC')
node_day_structure = defaultdict(lambda: [[] for _ in range(total_days)])

print("Building node-day structure...")
for date, senders, recipients in tqdm(email_data, desc="Building structure"):
    # Skip messages without a usable date, sender or recipient
    if pd.isna(date) or not (senders and recipients):
        continue

    day_index = int((date - base_date).days)
    if not (0 <= day_index < total_days):
        continue

    # Resolve the recipients once per message; unknown addresses are dropped
    r_nodes = [node for node in map(email_to_node.get, recipients) if node is not None]
    if not r_nodes:
        continue

    for sender in senders:
        s_node = email_to_node.get(sender)
        if s_node is not None:
            node_day_structure[s_node][day_index].extend(r_nodes)

# === Finalize output ===
print("Finalizing output data...")
output_data = []
# NOTE(review): iterating range(total_nodes) presumes node ids are exactly
# 0..total_nodes-1 - confirm against the mapping file.
for node_id in tqdm(range(total_nodes), desc="Processing nodes"):
    if node_id in node_day_structure:
        days = node_day_structure[node_id]
    else:
        # Node never appeared as a sender: all days are empty
        days = [[] for _ in range(total_days)]
    # Remove duplicates and sort per day
    days = [sorted(set(day)) for day in days]
    output_data.append(dict(node_id=node_id, day_recipients_str=str(days)))

# === Save CSV (SYNTHETIC VERSION) ===
output_df = pd.DataFrame(output_data)
output_df.to_csv('node_day_recipients_synthetic.csv', index=False)

print("✓ node_day_recipients_synthetic.csv created with To, Cc, Bcc (multiline and comma-separated supported)")

Extracting email info (To, Cc, Bcc with multi-line support)...


Parsing emails: 100%|██████████| 517467/517467 [09:52<00:00, 873.16it/s] 


Building node-day structure...


Building structure: 100%|██████████| 517467/517467 [00:18<00:00, 28622.96it/s]


Finalizing output data...


Processing nodes: 100%|██████████| 6604/6604 [00:08<00:00, 756.48it/s] 


✓ node_day_recipients_synthetic.csv created with To, Cc, Bcc (multiline and comma-separated supported)
