In [3]:
# Simple Node-Day Recipients Structure Generator
# This creates a CSV where each node has 1448 days of recipient lists

import pandas as pd
import numpy as np
import re
from datetime import datetime
from collections import defaultdict
import ast

print("=== Simple Node-Day Recipients Generator ===")
print("Creating node-day structure with recipient lists for each day")
print()

# 1. Load ID-Email mapping
# id-email.csv is read as a headerless two-column file: node id, then address.
print("1. Loading ID-Email mapping...")
id_email_df = pd.read_csv('id-email.csv', header=None, names=['node_id', 'email'])
print(f"Loaded {len(id_email_df)} email mappings")

# Create email to node_id mapping
# Built directly from the two columns instead of a per-row iterrows() loop.
# As with the loop, if an address occurs on several rows the last row wins.
email_to_node = dict(zip(id_email_df['email'], id_email_df['node_id']))
print(f"Created mapping for {len(email_to_node)} emails")
print()

# 2. Process emails using the proven method
print("2. Processing emails...")

# Read emails and filter as in the working code
emails = pd.read_csv('emails.csv')
print(f"Total emails loaded: {len(emails)}")

# Filter emails (remove discussion threads and auto-emails)
# Rows whose `file` value contains 'discussion_thread' or 'all_documents' are
# dropped; na=False means rows with a missing `file` value are kept.
emails_nodups = emails[~emails['file'].str.contains('discussion_thread', na=False)]
emails_noautos = emails_nodups[~emails_nodups['file'].str.contains('all_documents', na=False)]
# Renumber rows 0..n-1 after filtering.
emails_noautos = emails_noautos.reset_index(drop=True)
print(f"Emails after filtering: {len(emails_noautos)}")

# Extract email information using proven method
print("Extracting email information...")

def get_date_from_to(Series):
    """Extract send time, sender addresses and recipient addresses from raw messages.

    Only fixed header positions are inspected: the second line of each message
    for ``Date:``, the third for ``From:`` and the fourth for ``To:``. A header
    that is not on its expected line yields NaN for that field, so addresses on
    any other line (for example a ``To:`` list continued on a following line)
    are not picked up.

    Args:
        Series: pandas Series of raw message texts. Any index is accepted and
            is preserved on the returned Series. Entries that are not strings
            (e.g. NaN/None) are treated as having no headers.

    Returns:
        Tuple ``(result_date, result_from, result_to)`` of Series sharing the
        input's index:

        * ``result_date`` - offset of the message time from
          1999-01-01 00:00 UTC as a timedelta; NaT when the date header is
          missing or cannot be parsed.
        * ``result_from`` / ``result_to`` - list of address strings matched on
          the header line (possibly empty), or NaN when the header is absent.
    """
    # Compiled once, as a raw string so the backslashes reach the regex engine
    # without relying on Python tolerating invalid string escapes.
    email_pattern = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')

    # Values are collected in plain lists and turned into object Series at the
    # end. Assigning strings/lists element-wise into a float Series triggers
    # pandas' incompatible-dtype warning, and indexing by enumerate position is
    # only correct for a default 0..n-1 index.
    dates = []
    senders = []
    recipients = []

    for row, message in enumerate(Series):
        if row % 10000 == 0:
            print(f'Row: {row} starting at {datetime.now()}.')

        # A missing message (NaN/None) has no header lines at all.
        message_words = message.split('\n') if isinstance(message, str) else []

        # Extract date
        if len(message_words) > 1 and 'Date:' in message_words[1]:
            dates.append(message_words[1].replace('Date:', ''))
        else:
            dates.append(np.nan)

        # Extract sender
        if len(message_words) > 2 and 'From:' in message_words[2]:
            senders.append(email_pattern.findall(message_words[2]))
        else:
            senders.append(np.nan)

        # Extract recipient
        if len(message_words) > 3 and 'To:' in message_words[3]:
            recipients.append(email_pattern.findall(message_words[3]))
        else:
            recipients.append(np.nan)

    result_date = pd.Series(dates, index=Series.index, dtype=object)
    result_from = pd.Series(senders, index=Series.index, dtype=object)
    result_to = pd.Series(recipients, index=Series.index, dtype=object)

    # Convert dates
    print('Converting dates...')
    # Unparseable strings become NaT; everything is normalised to UTC so the
    # subtraction below is between timezone-aware values.
    result_date = pd.to_datetime(result_date, errors='coerce', utc=True)
    base_date = pd.Timestamp('1999-01-01', tz='UTC')
    result_date = result_date - base_date

    return result_date, result_from, result_to

# Extract email data
# One row per filtered email: time offset, sender list, recipient list.
dates, senders_col, recipients_col = get_date_from_to(emails_noautos.message)
date_from_to = pd.DataFrame({
    'date': dates,
    'senders': senders_col,
    'recipients': recipients_col,
})
print(f"Processed emails: {len(date_from_to)}")

# Number of day slots kept per node; valid day indices are 0 .. total_days - 1.
total_days = 1448

# Filter by date range
print("Filtering by date range...")
# Drop emails missing a parsed date, a From: line or a To: line.
date_from_to = date_from_to.dropna()
# The upper bound is exclusive so that every email counted below can actually
# be stored: an offset of exactly `total_days` days has day index `total_days`,
# which is outside the per-node day list.
in_range = (
    (date_from_to['date'] >= pd.Timedelta(0))
    & (date_from_to['date'] < pd.Timedelta(days=total_days))
)
date_from_to = date_from_to[in_range]
print(f"Final email count: {len(date_from_to)}")

# 3. Create node-day structure
print("3. Creating node-day structure...")
# sender node -> day index -> set of recipient nodes. Only (sender, day) pairs
# that actually occur are stored here, and sets de-duplicate as we go.
recipients_by_sender_day = defaultdict(lambda: defaultdict(set))

# Process each email
# Columns are zipped directly; this avoids building a Series per row.
for delta, senders, recipients in zip(
    date_from_to['date'], date_from_to['senders'], date_from_to['recipients']
):
    day_index = int(delta.days)
    if not 0 <= day_index < total_days:
        continue

    # Addresses without a node id in the mapping are ignored.
    recipient_nodes = {
        node for node in map(email_to_node.get, recipients) if node is not None
    }
    if not recipient_nodes:
        # A sender only gets an entry once it has at least one mapped recipient.
        continue

    for sender in senders:
        sender_node = email_to_node.get(sender)
        if sender_node is not None:
            recipients_by_sender_day[sender_node][day_index].update(recipient_nodes)

print(f"Created structure for {len(recipients_by_sender_day)} nodes")

# 4. Clean and organize data
print("4. Cleaning and organizing data...")
# Expand to the dense layout used for output: each node maps to a list of
# `total_days` lists, each holding that day's recipient nodes, sorted and unique.
node_day_structure = defaultdict(lambda: [[] for _ in range(total_days)])
for sender_node, days in recipients_by_sender_day.items():
    day_lists = node_day_structure[sender_node]
    for day_index, nodes in days.items():
        day_lists[day_index] = sorted(nodes)

# 5. Convert to output format
print("5. Converting to output format...")
output_data = [
    {'node_id': node_id, 'day_recipients': node_day_structure[node_id]}
    for node_id in sorted(node_day_structure)
]

print(f"Prepared data for {len(output_data)} nodes")

# 6. Save to CSV
print("6. Saving to CSV...")
output_df = pd.DataFrame(output_data)
# The nested list is written as its Python repr in a single CSV cell.
output_df['day_recipients_str'] = output_df['day_recipients'].apply(str)
output_df[['node_id', 'day_recipients_str']].to_csv('node_day_recipients.csv', index=False)

print("✓ Saved to node_day_recipients.csv")
print(f"File contains {len(output_df)} nodes with {total_days} days each")

# 7. Show statistics
print("\n=== Statistics ===")
print(f"Total nodes: {len(output_data)}")
print(f"Total days per node: {total_days}")

# Count non-empty days
non_empty_days = [
    sum(1 for day in node_data['day_recipients'] if day)
    for node_data in output_data
]

if non_empty_days:
    print(f"Average non-empty days per node: {np.mean(non_empty_days):.2f}")
    print(f"Max non-empty days for any node: {max(non_empty_days)}")
    print(f"Min non-empty days for any node: {min(non_empty_days)}")

# Show sample
print("\n=== Sample Structure ===")
if output_data:
    sample_node = output_data[0]
    print(f"Node {sample_node['node_id']} first 5 days:")
    for i, day in enumerate(sample_node['day_recipients'][:5]):
        print(f"  Day {i}: {day}")

print("\n=== Processing Complete ===")
print("✓ File 'node_day_recipients.csv' created successfully!")
print("Format: Each row contains node_id and day_recipients_str")
print("day_recipients_str is a string representation of a list of 1448 days")
print("Each day contains a list of recipient node IDs that received emails from that sender")


=== Simple Node-Day Recipients Generator ===
Creating node-day structure with recipient lists for each day

1. Loading ID-Email mapping...
Loaded 6600 email mappings
Created mapping for 6600 emails

2. Processing emails...
Total emails loaded: 517401
Emails after filtering: 330689
Extracting email information...
Row: 0 starting at 2025-10-19 13:06:01.898476.


  result_date[row] = message_words[1].replace('Date:', '')
  result_from[row] = re.findall('[\w\.-]+@[\w\.-]+\.\w+', message_words[2])
  result_to[row] = re.findall('[\w\.-]+@[\w\.-]+\.\w+', message_words[3])


Row: 10000 starting at 2025-10-19 13:06:02.067285.
Row: 20000 starting at 2025-10-19 13:06:02.234363.
Row: 30000 starting at 2025-10-19 13:06:02.389897.
Row: 40000 starting at 2025-10-19 13:06:02.554439.
Row: 50000 starting at 2025-10-19 13:06:02.745816.
Row: 60000 starting at 2025-10-19 13:06:02.942101.
Row: 70000 starting at 2025-10-19 13:06:03.122476.
Row: 80000 starting at 2025-10-19 13:06:03.286303.
Row: 90000 starting at 2025-10-19 13:06:03.438858.
Row: 100000 starting at 2025-10-19 13:06:03.600077.
Row: 110000 starting at 2025-10-19 13:06:03.767840.
Row: 120000 starting at 2025-10-19 13:06:03.919601.
Row: 130000 starting at 2025-10-19 13:06:04.079938.
Row: 140000 starting at 2025-10-19 13:06:04.241290.
Row: 150000 starting at 2025-10-19 13:06:04.418779.
Row: 160000 starting at 2025-10-19 13:06:04.583319.
Row: 170000 starting at 2025-10-19 13:06:04.743772.
Row: 180000 starting at 2025-10-19 13:06:04.901423.
Row: 190000 starting at 2025-10-19 13:06:05.057796.
Row: 200000 starting 

  result_date = pd.to_datetime(result_date, errors='coerce', utc=True)


Processed emails: 330689
Filtering by date range...
Final email count: 313191
3. Creating node-day structure...
Created structure for 5776 nodes
4. Cleaning and organizing data...
5. Converting to output format...
Prepared data for 5776 nodes
6. Saving to CSV...
✓ Saved to node_day_recipients.csv
File contains 5776 nodes with 1448 days each

=== Statistics ===
Total nodes: 5776
Total days per node: 1448
Average non-empty days per node: 12.78
Max non-empty days for any node: 568
Min non-empty days for any node: 1

=== Sample Structure ===
Node 0 first 5 days:
  Day 0: []
  Day 1: []
  Day 2: []
  Day 3: []
  Day 4: []

=== Processing Complete ===
✓ File 'node_day_recipients.csv' created successfully!
Format: Each row contains node_id and day_recipients_str
day_recipients_str is a string representation of a list of 1448 days
Each day contains a list of recipient node IDs that received emails from that sender
