# In [2]:  (Jupyter notebook cell prompt, kept as a comment so the file parses as Python)
import numpy as np
import pandas as pd
import re
from datetime import datetime
import networkx as nx

# ---------------- Step 0: Load emails properly ----------------
# Read the whole file into memory as UTF-8 text.
with open("emails.csv", mode="r", encoding="utf-8") as f:
    raw_text = f.read()

# Every blank line ("\n\n") is treated as a record boundary; change the
# separator here if the file delimits messages differently.
# NOTE(review): if the messages are RFC 822 style, headers and body are also
# separated by a blank line and would land in different chunks - confirm
# against the actual file layout.
emails = raw_text.split("\n\n")

# One row per chunk, stored in a single "message" column.
df = pd.DataFrame(data={"message": emails})

# ---------------- Step 1: Extract Date, From, To ----------------
def get_date_from_to(Series):
    """Extract the date, sender and recipient addresses from raw messages.

    Each message is split on newlines and the headers are read from fixed
    positions: line 1 must contain ``Date:``, line 2 ``From:`` and line 3
    ``To:`` (line 0 is not inspected). A header that is not found on its
    expected line yields a missing value for that field.

    Args:
        Series: pandas Series of raw message strings. Any index is accepted
            and is preserved on the result. Non-string entries are treated
            as messages without headers.

    Returns:
        pandas.DataFrame indexed like ``Series`` with the columns

        * ``date`` - time elapsed since 1999-01-01 00:00 UTC as a timedelta
          (NaT when the header is missing or cannot be parsed),
        * ``senders`` - list of addresses found on the From line, or None,
        * ``recipients`` - list of addresses found on the To line, or None.
    """
    # Compiled once instead of once per header line.
    address_pattern = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')

    def header_at(lines, position, marker):
        """Return lines[position] when it contains marker, else None."""
        if len(lines) > position and marker in lines[position]:
            return lines[position]
        return None

    def addresses_at(lines, position, marker):
        """Return the addresses on the given header line, or None if absent."""
        line = header_at(lines, position, marker)
        if line is None:
            return None
        return address_pattern.findall(line) or None

    dates, senders, recipients = [], [], []
    for row, message in enumerate(Series):
        if row % 1000 == 0:
            print(f'Processing row {row}...')

        lines = message.split('\n') if isinstance(message, str) else []

        date_line = header_at(lines, 1, 'Date:')
        dates.append(
            None if date_line is None
            else date_line.replace('Date:', '').strip()
        )
        senders.append(addresses_at(lines, 2, 'From:'))
        recipients.append(addresses_at(lines, 3, 'To:'))

    # Results are collected positionally and attached to the caller's index
    # in one step; writing ``result[row] = ...`` with the enumerate counter
    # would address *labels* and misplace rows on a non-default index.
    index = Series.index
    raw_dates = pd.Series(dates, index=index, dtype="object")

    print('Converting dates...')
    # utc=True normalises offset-carrying headers (and treats naive ones as
    # UTC) so that the subtraction below never mixes naive and aware values.
    parsed_dates = pd.to_datetime(raw_dates, errors="coerce", utc=True)
    elapsed = parsed_dates - pd.Timestamp("1999-01-01", tz="UTC")

    return pd.DataFrame({
        "date": elapsed,
        "senders": pd.Series(senders, index=index, dtype="object"),
        "recipients": pd.Series(recipients, index=index, dtype="object"),
    })

# ---------------- Step 2: Build Graph ----------------
def gen_graph(date_from_to):
    """Build an undirected graph of who exchanged email with whom.

    Every (sender, recipient) pair of a usable row becomes an edge. Because
    the graph is an undirected ``nx.Graph``, messages a -> b and b -> a
    accumulate on the same edge.

    Args:
        date_from_to: DataFrame with the columns ``date`` (timedelta-like
            values exposing ``.days``, or NaT), ``senders`` and
            ``recipients`` (lists of address strings, or a missing value).

    Returns:
        networkx.Graph whose edges carry two attributes: ``count`` (number
        of sender/recipient pairings seen) and ``days`` (one integer day
        offset per pairing, duplicates included).
    """
    G = nx.Graph()

    rows = zip(
        date_from_to["date"],
        date_from_to["senders"],
        date_from_to["recipients"],
    )
    for date_val, senders, recipients in rows:
        # Address fields are validated by type rather than with pd.isna():
        # on a list pd.isna() returns an element-wise array, and using that
        # array in a boolean context raises for two or more addresses.
        if not isinstance(senders, (list, tuple)):
            continue
        if not isinstance(recipients, (list, tuple)):
            continue
        # Skip rows with no addresses or without a usable date.
        if not senders or not recipients or pd.isna(date_val):
            continue

        day = int(date_val.days)
        for sender in senders:
            for recipient in recipients:
                # add_edge() creates missing endpoints, so no explicit
                # add_node() calls are needed.
                if not G.has_edge(sender, recipient):
                    G.add_edge(sender, recipient, count=0, days=[])

                edge_attrs = G[sender][recipient]
                edge_attrs["count"] += 1
                edge_attrs["days"].append(day)

    return G

# ---------------- Step 3: Export edges ----------------
def export_edges(G):
    """Flatten the graph's edges into a DataFrame, one row per edge.

    Args:
        G: graph object whose ``edges(data=True)`` yields
            ``(u, v, attrs)`` triples, where ``attrs`` contains ``count``
            and ``days``.

    Returns:
        pandas.DataFrame with the columns ``From``, ``To``, ``Count`` and
        ``Days`` (the edge's day offsets, de-duplicated and sorted
        ascending). The four columns are present even when the graph has no
        edges, so the result can always be written out with a header row.

    Raises:
        KeyError: if an edge lacks the ``count`` or ``days`` attribute.
    """
    columns = ["From", "To", "Count", "Days"]
    records = [
        {
            "From": u,
            "To": v,
            "Count": attrs["count"],
            "Days": sorted(set(attrs["days"])),  # unique & sorted
        }
        for u, v, attrs in G.edges(data=True)
    ]
    # Passing the column list explicitly fixes the schema for empty input.
    return pd.DataFrame(records, columns=columns)

# ---------------- Example usage ----------------
try:
    # Stage 1: parse the headers out of every raw message.
    print("Step 1: Extracting date, from, to...")
    date_from_to = get_date_from_to(df["message"])
    print(f"Extracted data shape: {date_from_to.shape}")
    print("Sample of extracted data:")
    print(date_from_to.head(), end="\n\n")

    # Stage 2: turn the parsed rows into a graph.
    print("Step 2: Building graph...")
    G = gen_graph(date_from_to)
    print(f"Graph created with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges", end="\n\n")

    # Stage 3: flatten the edges and write them to disk.
    print("Step 3: Exporting edges...")
    edges_df = export_edges(G)
    print(f"Edges DataFrame shape: {edges_df.shape}")

    if len(edges_df) == 0:
        print("No edges found - the output file will be empty")
        # Write a header-only file so the output always exists.
        header_only = pd.DataFrame(columns=["From", "To", "Count", "Days"])
        header_only.to_csv("edges_output.csv", index=False)
        print("Created empty edges_output.csv with headers")
    else:
        print("Sample edges:")
        print(edges_df.head())
        edges_df.to_csv("edges_output.csv", index=False)
        print("Successfully saved to edges_output.csv")

except Exception as e:
    # Top-level boundary of the script: report the failure with its full
    # traceback instead of letting the exception propagate.
    import traceback

    print(f"Error occurred: {type(e).__name__}: {e}")
    traceback.print_exc()

# Cell output: ModuleNotFoundError: No module named 'numpy'
# (numpy was not installed in the environment that ran this cell.)