## Step 5: Finalize Research Digest

In [8]:
# Add the parent directory to the import search path. This has to run before
# the project-local imports below (presumably `data` and `components` live
# there -- verify against the repository layout).
import sys
sys.path.append('../')

import os

# Load environment configuration before anything reads it; EMAIL_TO is read
# with os.getenv() when the digest is sent.
# NOTE(review): presumably sourced from a .env file -- confirm.
from dotenv import load_dotenv
load_dotenv()

import pandas as pd
import markdown

# Content manager rooted at ../data. It is not referenced again in the cells
# visible in this notebook section.
from data import ContentManager
manager = ContentManager(base_path="../data")

# Gmail client used by the final cell to deliver the digest. Both paths point
# at files one directory above the notebook.
from components.email.gmail_sender import GmailSender
sender = GmailSender(
    credentials_path="../gmail_client_secret.json",
    token_path="../gmail_token.json"
)

INFO:data.content_manager:Loaded content index with 2633 entries
INFO:googleapiclient.discovery_cache:file_cache is only supported with oauth2client<4.0.0


✓ Gmail API authenticated successfully


In [2]:
# Load the full research-items table. No date parsing is requested here, so
# the filter cell below compares `curated_at` as plain strings.
df = pd.read_csv("../data/research_items.csv")

In [3]:
# Cutoff configuration: only items curated after this moment are kept
base_date = "2025-12-28"
base_time = "20:00"  # Optional time filter (HH:MM)

# Date and time glued together into one timestamp string
cutoff_datetime = f"{base_date}T{base_time}:00"

# One boolean mask per selection rule.
# NOTE(review): `curated_at` is compared lexicographically against the cutoff
# string, which assumes it is stored in the same "YYYY-MM-DDTHH:MM:SS" layout
# -- TODO confirm.
curated_after_cutoff = df['curated_at'] > cutoff_datetime
summary_is_valid = ~df['curator_summary'].str.startswith("ERROR", na=False)
is_high_priority = df['priority_score'] >= 8.0

# Keep matching rows, rank them by the sum of the three scores (highest
# first), then discard the helper column used for ranking.
df_recent = (
    df.loc[curated_after_cutoff & summary_is_valid & is_high_priority]
    .assign(
        total_score=lambda frame: (
            frame['applicability_score']
            + frame['novelty_score']
            + frame['priority_score']
        )
    )
    .sort_values('total_score', ascending=False)
    .drop(columns=['total_score'])
)

print(f"Filtered {len(df_recent)} items from {len(df)}  ")

Filtered 43 items from 4355  


### Format for email

In [11]:
def format_top_items(df, n=10):
    """
    Format top N items as a numbered Markdown list with full details.

    Every field goes through the same missing-value fallback, so a NaN/None
    in any column is shown as 'N/A' rather than the literal text "nan".

    Args:
        df: DataFrame sorted by priority (highest first). It must provide the
            columns title, url, curator_tags, source, published,
            curator_takeaways and verdict_reasoning.
        n: Number of top items to format (default: 10)

    Returns:
        str: Markdown-formatted list of top items ('' when there are no rows)
    """
    # Two trailing spaces tell Markdown to emit a hard line break. Keeping
    # them in a named constant stops editors from silently stripping them.
    hard_break = "  "

    def _text(value):
        """Return the value as text, or 'N/A' when it is missing."""
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return 'N/A'
        return str(value)

    items = []

    for idx, (_, row) in enumerate(df.head(n).iterrows(), 1):
        lines = [
            f"**{idx}. {_text(row['title'])}**",
            "",
            f"**URL:** {_text(row['url'])}{hard_break}",
            f"**Tags:** {_text(row['curator_tags'])}{hard_break}",
            f"**Source:** {_text(row['source'])} - {_text(row['published'])}",
            "",
            f"**Takeaways:**{hard_break}",
            _text(row['curator_takeaways']),
            "",
            f"**Reason to read:**{hard_break}",
            _text(row['verdict_reasoning']),
            "",
            "---",
            "",  # keeps the trailing newline after the horizontal rule
        ]
        items.append('\n'.join(lines))

    return '\n'.join(items)


def format_remaining_table(df, skip_top=10):
    """
    Render every row after the first `skip_top` as a Markdown table.

    NOTE(review): a second function with this exact name is defined later in
    the same cell, so once the cell has run this Markdown variant is shadowed
    by that later definition.

    Args:
        df: DataFrame sorted by priority
        skip_top: Number of leading rows to leave out (default: 10)

    Returns:
        str: "" when no rows remain after skipping, otherwise an
        "Additional Items" heading followed by the Markdown table
    """
    # Nothing beyond the highlighted rows -> no table at all
    if len(df) <= skip_top:
        return ""

    columns = ['title', 'url', 'source', 'published', 'curator_tags', 'curator_takeaways', 'curator_summary']
    remainder = df.iloc[skip_top:][columns].copy()

    # Swap each raw URL for a compact Markdown link labelled "Link"
    remainder['url'] = [f'[Link]({link})' for link in remainder['url']]

    table_md = remainder.to_markdown(index=False)
    return "\n\n## Additional Items\n\n" + table_md

def format_remaining_table(df, skip_top=10):
    """
    Format remaining items as an HTML table.

    Text cells are HTML-escaped before rendering, because the table is
    produced with ``escape=False`` (needed for the link markup) and item text
    may contain ``<``, ``&`` or quotes. Rendering also lifts pandas'
    ``display.max_colwidth`` limit, which would otherwise cut every cell
    longer than 50 characters -- including the ``<a href=...>`` markup -- and
    leave broken links in the email.

    Args:
        df: DataFrame sorted by priority. It must provide the columns title,
            url, source, published, curator_tags, curator_takeaways and
            curator_summary.
        skip_top: Number of top items to skip (default: 10)

    Returns:
        str: "" when no rows remain after skipping, otherwise an
        "Additional Items" heading, inline CSS and the HTML table
    """
    # stdlib helper; imported locally so this cell does not depend on an
    # extra top-of-notebook import
    from html import escape

    if len(df) <= skip_top:
        return ""

    # Select rows after top N and only the columns shown in the table
    cols = ['title', 'url', 'source', 'published', 'curator_tags', 'curator_takeaways', 'curator_summary']
    df_display = df.iloc[skip_top:][cols].copy()

    def _escape_text(value):
        """Escape HTML-special characters in strings; pass other values through."""
        return escape(value) if isinstance(value, str) else value

    def _short_title(value):
        """Cut the title to 80 characters first, then escape it."""
        return escape(value[:80]) if isinstance(value, str) else value

    def _link(url):
        """Build the clickable link; a missing/blank URL gives an empty cell."""
        if not isinstance(url, str) or not url.strip():
            return ''
        return f'<a href="{escape(url, quote=True)}">Link</a>'

    # Shorten title for readability (truncate before escaping so an entity
    # such as &amp; is never cut in half)
    df_display['title'] = df_display['title'].map(_short_title)

    # Make URL clickable
    df_display['url'] = df_display['url'].map(_link)

    # Escape the remaining free-text columns
    for col in cols:
        if col not in ('title', 'url'):
            df_display[col] = df_display[col].map(_escape_text)

    # Convert to HTML table with styling. max_colwidth=None disables the
    # 50-character truncation that to_html would otherwise apply per cell.
    with pd.option_context('display.max_colwidth', None):
        html_table = df_display.to_html(
            index=False,
            escape=False,  # Cells are already escaped above; keeps the <a> tags intact
            border=1,
            classes='digest-table'
        )

    # Add some basic inline CSS for better rendering
    styled_table = f"""
<h2>Additional Items</h2>
<style>
    .digest-table {{
        border-collapse: collapse;
        width: 100%;
        margin-top: 20px;
    }}
    .digest-table th {{
        background-color: #f2f2f2;
        padding: 12px;
        text-align: left;
        border: 1px solid #ddd;
    }}
    .digest-table td {{
        padding: 10px;
        border: 1px solid #ddd;
        vertical-align: top;
    }}
    .digest-table tr:nth-child(even) {{
        background-color: #f9f9f9;
    }}
</style>
{html_table}
"""

    return styled_table

Format the digest as HTML

In [12]:
# Number of items rendered in full detail; everything after them goes into
# the compact table. Kept in one place so the two formatter calls and the
# section heading cannot drift apart.
TOP_N = 10

# Format the digest: the top items are written as Markdown and converted to
# HTML, the remaining items come back from the table formatter as-is.
top_section = format_top_items(df_recent, n=TOP_N)
top_html = markdown.markdown(top_section)
remaining_section = format_remaining_table(df_recent, skip_top=TOP_N)

# Combine into full HTML digest.
# The heading emoji are written as numeric character references
# (&#128300; = microscope, &#127775; = glowing star) so they cannot be
# corrupted by a charset mismatch between this file and the mail client.
digest_body = f"""
<html>
<body>
<h1>&#128300; Research Digest</h1>
<p><strong>Date:</strong> {pd.Timestamp.now().strftime('%B %d, %Y')}</p>

<hr>

<h2>&#127775; Top {TOP_N} Highlights</h2>

{top_html}

{remaining_section}

</body>
</html>
"""

Send the digest by email

In [13]:
# Deliver the digest: the recipient comes from the EMAIL_TO environment
# variable and the subject carries today's date.
recipient = os.getenv("EMAIL_TO")
subject_line = f"Research Digest - {pd.Timestamp.now().strftime('%B %d, %Y')}"

# The call is the cell's last expression, so its return value is displayed
sender.send_email(
    to=recipient,
    subject=subject_line,
    content=digest_body,
    html=True,
)

✓ Email sent successfully to kri***com
  Message ID: 19b668bd3fdb306c


True