In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

ModuleNotFoundError: No module named 'google'

In [None]:
cd drive/MyDrive/IE7374_Group14/colab/

In [None]:
import pandas as pd
import numpy as np

import os
import re
import random
from multiprocessing import Pool

import email
from email import policy
from email.policy import default

from email.parser import BytesParser
from email import message_from_string

import spacy


# 1. Load Dataset

In [None]:
df = pd.read_csv('../data/sample_raw_dataset.csv')

In [None]:
df.info()

In [None]:
duplicates = df[df.duplicated(subset='message', keep=False)]
print(f"Number of duplicate messages: {duplicates.shape[0]}")

In [None]:
df.head(3)

# 2. Split Message-ID

In [None]:
df_split1 = df.copy()

In [None]:
def split_metadata_and_body(email_text):

    pattern = r'(Message-ID:.*?X-FileName:.*?\n)'
    match = re.search(pattern, email_text, re.DOTALL | re.IGNORECASE)
    if match:
        metadata = match.group(1).strip()
        body = email_text[match.end():].strip()
    else:
        metadata, body = '', email_text.strip()
    return metadata, body


In [None]:
df_split1[['metadata_block', 'message_body']] = df_split1['message'].apply(
    lambda x: pd.Series(split_metadata_and_body(x))
)


In [None]:
df_split1.head(3)

# 3. Clean Dataset

In [None]:
df_split1['message_length'] = df_split1['message_body'].apply(len)

In [None]:
# Calculate Q1 and Q3
Q1 = df_split1['message_length'].quantile(0.25)
Q3 = df_split1['message_length'].quantile(0.75)
IQR = Q3 - Q1

# Set boundaries (we can't have messages with len lower than 1)
lower_bound = max(0, Q1 - 1.5 * IQR)
upper_bound = Q3 + 1.5 * IQR

print(f"Lower Outlier Limit: {lower_bound}")
print(f"Upper Outlier Limit: {upper_bound}")


In [None]:
df_clean2 = df_split1[df_split1['message_body'].str.len() < 3952].copy()

In [None]:
df_clean2.shape

In [None]:
df_clean2.head(2)

In [None]:
df_processed3 = df_clean2.copy()

In [None]:

def get_text_payload(msg):
    """
    Walks through a multipart email message and extracts the first text/plain payload.
    This is the most reliable way to get the email body while ignoring attachments.
    """
    if not msg.is_multipart():
        if msg.get_content_type() == 'text/plain':
            payload = msg.get_payload(decode=True)
            charset = msg.get_content_charset() or 'ascii'
            try:
                return payload.decode(charset, errors='replace')
            except (LookupError, TypeError):
                return payload.decode('ascii', errors='replace')
        else:
            return ""

    text_parts = []
    for part in msg.walk():
        payload = get_text_payload(part)
        if payload:
            text_parts.append(payload)

    return "\n".join(text_parts)

def final_scrub_text(text):
    """
    Performs a final, aggressive cleaning of the text to remove common email artifacts,
    PII, and other noise. This is the key function to ensure high-quality output.
    """
    # 1. Remove forwarded message headers that might have been missed
    # This catches "From:", "Sent:", "To:", "Subject:" lines at the start of a block.
    lines = text.split('\n')
    first_real_line_index = 0
    for i, line in enumerate(lines):
        if re.match(r'^\s*(from|sent|to|cc|subject|date|forwarded):', line, re.IGNORECASE):
            continue
        # Stop at the first line that is not a header-like line.
        if line.strip() not in ('', '>'):
            first_real_line_index = i
            break
    text = '\n'.join(lines[first_real_line_index:])

    # 2. Remove legal disclaimers and confidentiality notices
    # This targets common phrases found in corporate email footers.
    disclaimer_patterns = [
        r'\*+\s*original message\s*\*+',
        r'this e-mail is the property of enron corp\..*',
        r'the information contained in this communication is intended only for the use of the designated recipients.*',
        r'internet communications are not secure and therefore.*'
    ]
    for pattern in disclaimer_patterns:
        text = re.sub(pattern, '', text, flags=re.IGNORECASE | re.DOTALL)

    # 3. Anonymize Personal Information (PII)
    # Email addresses
    text = re.sub(r'\b[\w\.\-+=_%]+@[\w\.-]+\.\w{2,}\b', '<ANON_EMAIL>', text)
    # Phone/Fax numbers (various formats)
    text = re.sub(r'(\b(\+?\d{1,2}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b)', '<ANON_PHONE>', text)
    # Names (e.g., "Doe, John" or "John Doe")
    text = re.sub(r'\b[A-Z][a-z]+,\s[A-Z][a-z]+\b', '<ANON_NAME>', text)
    text = re.sub(r'\b[A-Z][a-z]+\s[A-Z][a-z]+\b', '<ANON_NAME>', text) # Catches simple Title Case names

    # 4. Remove other common email artifacts
    # URLs
    text = re.sub(r'https?://\S+|www\.\S+', '<ANON_URL>', text)
    # Quoted reply lines (e.g., "> blah blah blah")
    text = re.sub(r'^\s*>\s?.*$', '', text, flags=re.MULTILINE)
    # MIME encoding artifacts (e.g., "=20", "=0A")
    text = re.sub(r'=[0-9A-F]{2}', '', text)
    # Horizontal lines/separators
    text = re.sub(r'[-_*=]{3,}', '', text)

    # 5. Final whitespace cleanup
    # Remove excess blank lines
    text = re.sub(r'\n\s*\n', '\n', text)
    # Trim leading/trailing whitespace from the whole block
    return text.strip()


def format_and_split_thread(body_text):
    """
    Splits an email thread, tags replies and the original message.
    """
    blocks = re.split(r'-{5,}\s*Original Message\s*-{5,}', body_text, flags=re.IGNORECASE)

    # Filter out any empty blocks that might result from the split
    blocks = [block.strip() for block in blocks if block.strip()]

    if not blocks:
        return []

    tagged_messages = []
    num_blocks = len(blocks)
    for i, block in enumerate(blocks):
        # The first block is the newest message, the last is the oldest.
        if i == num_blocks - 1:
            tag = '<|original|>'
        else:
            tag = f'<|reply{num_blocks - 1 - i}|>'

        # Apply the final, aggressive scrub to each individual message block
        clean_text = final_scrub_text(block)

        # Only add the message if it's not empty after cleaning
        if clean_text:
            tagged_messages.append({'tag': tag, 'text': clean_text})

    return tagged_messages

def process_email_row(row):
    message_id = row['metadata_block'].split('\n')[0]  # Or however you extract it
    body_text = row['message_body']

    threaded_messages = format_and_split_thread(body_text)

    entries = []
    for part in threaded_messages:
        entries.append({
            'message_id': message_id,
            'tag': part['tag'],
            'clean_message': part['text']
        })

    return entries



In [None]:

if __name__ == '__main__':

    with Pool() as pool:
        results = pool.map(process_email_row, df_processed3.to_dict('records'))

    # Flatten list of lists
    cleaned = [item for sublist in results for item in sublist]
    df_processed3 = pd.DataFrame(cleaned)

    # Save or inspect
    # df_processed3.to_csv('enron_cleaned_v7.csv', index=False)
    # print(df_processed3.head())


In [None]:
df_processed3.head()

# Anon

In [None]:


# Load the small English model
nlp = spacy.load("en_core_web_sm")

def anonymize_names(text):
    doc = nlp(text)
    anonymized_tokens = []
    for token in doc:
        if token.ent_type_ == "PERSON":
            anonymized_tokens.append("[NAME]")
        else:
            anonymized_tokens.append(token.text)
    return " ".join(anonymized_tokens)


In [None]:
df_anon4 = df_processed3.copy()

In [None]:
df_anon4['anon_message'] = df_anon4['clean_message'].apply(anonymize_names)


In [None]:
df_anon4.head()

In [None]:
df_ready = df_anon4.copy()

In [None]:
from sklearn.model_selection import train_test_split

# Step 1: Shuffle and split into train and temp (val + test)
train_df, temp_df = train_test_split(df_ready, test_size=0.2, random_state=42)

# Step 2: Split temp into validation and test
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

print(f"Train size: {len(train_df)}")
print(f"Validation size: {len(val_df)}")
print(f"Test size: {len(test_df)}")


In [None]:
train_df.to_csv("v7_train.csv", index=False)
val_df.to_csv("v7_val.csv", index=False)
test_df.to_csv("v7_test.csv", index=False)
