## Project B0

When classifying emails, our goal is to label each email as spam or not spam (referred to as "ham"), using features generated from the text of the email. Our dataset is from Enron. It consists of email messages and their labels (0 for ham, 1 for spam). The training set contains 8,348 labeled examples, and the test set contains 1,000 unlabeled examples.

Note: The dataset is from 2006, so the contents of emails might be very different from those in 2025.

Run the following cells to load the data into a DataFrame.

The train DataFrame contains labeled data you will use to train your model. It has four columns:
1. id: An identifier for the training example.
2. subject: The subject of the email.
3. email: The text of the email.
4. spam: 1 if the email is spam, 0 if the email is ham (not spam).

The test DataFrame contains 1,000 unlabeled emails. In Project B2, you will predict labels for these emails and submit your predictions to the autograder for evaluation.

### Import statements

In [7]:
import pandas as pd
import numpy as np
import ipywidgets as widgets
from IPython.display import display, HTML
import os
import random
from pathlib import Path

# Create an Output widget as the last expression of the cell so the notebook
# renders it. NOTE(review): presumably a quick check that ipywidgets displays
# in this environment; the call itself does not enable anything.
widgets.Output()

Output()

In [3]:
# Locate the email files on disk.
# Layout used here: <data_dir>/ham and <data_dir>/spam, each holding .txt emails.
data_dir = Path('data')
ham_dir = data_dir / 'ham'
spam_dir = data_dir / 'spam'

# rglob searches recursively (in case there are subdirectories).
# Sorting makes the list order independent of the directory-scan order, so a
# seeded random selection picks the same files on every machine.
ham_files = sorted(ham_dir.rglob('*.txt'))
spam_files = sorted(spam_dir.rglob('*.txt'))

print(f"Found {len(ham_files)} ham emails and {len(spam_files)} spam emails")


Found 2971 ham emails and 1200 spam emails


In [5]:
# Email viewer: shared state and controls

# Area that receives the printed text of the selected email
output = widgets.Output()

# Which collection the next random email is drawn from: 'ham' or 'spam'
current_mode = 'ham'

# Status line showing the active mode
info_label = widgets.HTML(
    value="<b>Mode: <span style='color:green'>HAM</span> | Click the button to load a random email</b>"
)

# Both buttons have the same dimensions; each still gets its own Layout instance
button_dims = {'width': '200px', 'height': '40px'}

# Button that flips between ham and spam
mode_button = widgets.Button(
    description='Switch to SPAM',
    button_style='info',
    layout=widgets.Layout(**button_dims),
)

# Button that shows a random email from the active mode
load_button = widgets.Button(
    description='Load Random Email',
    button_style='primary',
    layout=widgets.Layout(**button_dims),
)

def toggle_mode(b):
    """Flip the viewer between ham and spam mode.

    Updates the module-level ``current_mode`` and refreshes the toggle
    button's caption/style and the status label to match the new mode.

    Args:
        b: The object passed to the click callback (unused).
    """
    global current_mode

    if current_mode != 'ham':
        # Anything other than ham goes back to ham
        current_mode = 'ham'
        next_caption, next_style = 'Switch to SPAM', 'info'
        banner = "<b>Mode: <span style='color:green'>HAM</span> | Click the button to load a random email</b>"
    else:
        current_mode = 'spam'
        next_caption, next_style = 'Switch to HAM', 'warning'
        banner = "<b>Mode: <span style='color:red'>SPAM</span> | Click the button to load a random email</b>"

    # The button advertises the mode a click would switch to
    mode_button.description = next_caption
    mode_button.button_style = next_style
    info_label.value = banner

def load_random_email(b):
    """Display a randomly chosen email from the currently selected mode.

    Picks a file from ``ham_files`` or ``spam_files`` according to
    ``current_mode``, prints its text inside ``output`` and shows the file
    name in ``info_label``. If the selected list is empty or the file
    cannot be read, a short message is printed instead.

    Args:
        b: The object passed to the click callback (unused).
    """
    with output:
        output.clear_output(wait=True)

        # The mode is chosen by the toggle button, not at random
        email_type = current_mode

        if email_type == 'ham':
            candidates, type_color = ham_files, 'green'
        else:
            candidates, type_color = spam_files, 'red'

        # random.choice raises IndexError on an empty sequence, so report an
        # empty or missing data folder explicitly instead.
        if not candidates:
            print(f"No {email_type} emails found.")
            return

        file_path = random.choice(candidates)

        # Keep the try body to the read itself so only I/O failures are
        # reported as read errors. Undecodable bytes are dropped.
        try:
            content = file_path.read_text(encoding='utf-8', errors='ignore')
        except OSError as e:
            print(f"Error reading file: {e}")
            return

        # Update info label
        file_name = file_path.name
        info_label.value = f"<b>Mode: <span style='color:{type_color}'>{email_type.upper()}</span> | Current: <code>{file_name}</code></b>"

        # Display the email
        print(f"Email Type: {email_type.upper()}")
        print(f"File: {file_name}")
        print(f"{'='*60}")
        print(content)
        print(f"{'='*60}")

# Hook up the click handlers
load_button.on_click(load_random_email)
mode_button.on_click(toggle_mode)

# Assemble the layout: status line on top, the two buttons side by side,
# and the email text underneath
button_row = widgets.HBox([mode_button, load_button])
email_viewer = widgets.VBox(children=[info_label, button_row, output])

# Leave the widget as the cell's final expression so the notebook renders it
email_viewer

VBox(children=(HTML(value="<b>Mode: <span style='color:green'>HAM</span> | Click the button to load a random e…