# Phase 1: Data Collection

This notebook collects 500 Reddit posts about PCOS and extracts official diagnostic criteria.

## Steps:
1. Collect 500 posts from 4 PCOS subreddits
2. Extract official PCOS diagnostic criteria
3. Visualize collection results

In [None]:
import sys
sys.path.append('..')

import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from pathlib import Path

# Notebook-wide plotting defaults: seaborn's 'whitegrid' theme plus a
# 12 x 6 default figure size for any figure that does not set its own.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## Step 1: Collect Reddit Posts

Target: 500 posts from 4 subreddits

In [None]:
# NOTE(review): this project-local import relies on the setup cell having
# appended '..' to sys.path, so that cell must be run first.
from src.data_collection.reddit_collector import PCOSRedditCollector

# Initialize collector (no arguments are passed, so it uses whatever defaults the class defines)
collector = PCOSRedditCollector()

# Collect posts
# NOTE(review): presumably this queries Reddit live, so it may be slow and may
# return different posts on each run — confirm against reddit_collector.
posts = collector.collect_all()

# Save posts
# NOTE(review): the destination is chosen inside save_posts; the visualization
# cell reads ../data/raw/reddit_pcos_posts.json, presumably the same file — confirm.
collector.save_posts(posts)

## Step 2: Extract Official Criteria

In [None]:
from src.data_collection.extract_official_criteria import create_official_criteria, save_criteria

# Build the criteria object and hand it straight to the project's saver.
criteria = create_official_criteria()
save_criteria(criteria)

# Echo the core diagnostic features, one bullet per entry.
print("\nDiagnostic Core Features:")
for core_feature in criteria['diagnostic_core_features']:
    print("  - {}".format(core_feature))

## Step 3: Visualize Collection Results

In [None]:
# Load collected posts.
# The file is decoded explicitly as UTF-8: without an `encoding` argument,
# open() falls back to the platform's locale encoding, which can fail or
# mis-decode non-ASCII characters (emoji, accents) in post text.
with open('../data/raw/reddit_pcos_posts.json', 'r', encoding='utf-8') as f:
    posts_data = json.load(f)

# assumes posts_data is a list of per-post records with a 'subreddit' key — TODO confirm
df = pd.DataFrame(posts_data)
print(f"Total posts collected: {len(df)}")
print("\nPosts per subreddit:")
print(df['subreddit'].value_counts())

In [None]:
# Bar chart of how many collected posts came from each subreddit.
subreddit_counts = df['subreddit'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
subreddit_counts.plot(kind='bar', color='steelblue', ax=ax)
ax.set_title('Posts Collected per Subreddit', fontsize=14, fontweight='bold')
ax.set_xlabel('Subreddit', fontsize=12)
ax.set_ylabel('Number of Posts', fontsize=12)
# Tilt the subreddit names so longer labels stay readable.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Turn the epoch-second 'created_utc' values into pandas datetimes.
df['created_date'] = pd.to_datetime(df['created_utc'], unit='s')

# Count posts in weekly bins, then draw the counts as a line.
weekly_counts = df.set_index('created_date').resample('W').size()

fig, ax = plt.subplots(figsize=(12, 6))
weekly_counts.plot(kind='line', color='steelblue', linewidth=2, ax=ax)
ax.set_title('Posts Collected Over Time', fontsize=14, fontweight='bold')
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Number of Posts per Week', fontsize=12)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Mean and median of post score and comment count, reported to one decimal.
score_values = df['score']
comment_values = df['num_comments']

report_lines = [
    "\nEngagement Statistics:",
    f"Average score: {score_values.mean():.1f}",
    f"Average comments: {comment_values.mean():.1f}",
    f"Median score: {score_values.median():.1f}",
    f"Median comments: {comment_values.median():.1f}",
]
print("\n".join(report_lines))

In [None]:
# Histogram of comments per post, with dashed markers at the mean and median.
comment_counts = df['num_comments']

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(comment_counts, bins=50, color='steelblue', edgecolor='black', alpha=0.7)

# Draw the reference lines in the original order (mean first, then median)
# so the legend entries keep the same order.
for position, line_color, legend_label in (
    (comment_counts.mean(), 'red', 'Mean'),
    (comment_counts.median(), 'orange', 'Median'),
):
    ax.axvline(position, color=line_color, linestyle='--', linewidth=2, label=legend_label)

ax.set_title('Distribution of Comments per Post', fontsize=14, fontweight='bold')
ax.set_xlabel('Number of Comments', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
ax.legend()
fig.tight_layout()
plt.show()

## Summary

✅ Data collection complete!

**Next Steps:**
1. Run Phase 2: LLM symptom extraction
2. Open notebook `02_llm_discovery.ipynb`