In [None]:
# ! pip install praw neptune-client==1.2.0

### Libraries to use

In [15]:
from dotenv import load_dotenv
import os
import neptune
import praw
from datetime import datetime
import json
import time
import pandas as pd
from tqdm import tqdm

In [20]:
def load_progress(filename="scraping_progress.json"):
    """Load previously scraped data if it exists.

    Args:
        filename: Path of the JSON progress file to read.

    Returns:
        A ``(posts, last_post_id)`` tuple. ``([], None)`` is returned when the
        file is missing, empty, not valid JSON, or does not contain a JSON
        object, so a damaged progress file leads to a fresh start instead of
        an exception. A warning is printed in the damaged-file cases.
    """
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        return [], None
    except json.JSONDecodeError as e:
        # An interrupted write can leave an empty or truncated file behind.
        print(f"Could not parse {filename} ({e}); ignoring saved progress.")
        return [], None

    if not isinstance(data, dict):
        print(f"Unexpected content in {filename}; ignoring saved progress.")
        return [], None

    # `or []` also covers an explicit null stored under "posts".
    return data.get('posts') or [], data.get('last_post_id')

def save_progress(posts_data, last_post_id, filename="scraping_progress.json",
                  csv_filename="wallstreetbets_posts.csv"):
    """Save current progress to a JSON checkpoint and a CSV snapshot.

    Args:
        posts_data: List of post dicts collected so far.
        last_post_id: ID of the most recently processed post (or None).
        filename: Destination of the JSON checkpoint.
        csv_filename: Destination of the CSV snapshot; only written when
            ``posts_data`` is non-empty.

    Raises:
        Whatever ``json.dump`` or the file operations raise. In that case the
        previous checkpoint at ``filename`` is left untouched.
    """
    progress_data = {
        'posts': posts_data,
        'last_post_id': last_post_id,
        'saved_at': datetime.now().isoformat(),
        'total_posts': len(posts_data)
    }

    # Write to a sibling temp file and swap it in, so an interruption or a
    # serialization error never leaves a half-written checkpoint behind.
    tmp_filename = f"{filename}.tmp"
    try:
        with open(tmp_filename, 'w', encoding='utf-8') as f:
            json.dump(progress_data, f, ensure_ascii=False, indent=2)
        os.replace(tmp_filename, filename)
    except BaseException:
        # Includes KeyboardInterrupt: drop the partial temp file, then re-raise.
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)
        raise

    # Also save current data as CSV
    if posts_data:
        pd.DataFrame(posts_data).to_csv(csv_filename, index=False, encoding='utf-8')


In [21]:
def save_final_csv(posts_data):
    """Write the collected posts to a timestamped CSV file.

    Adds a ``created_datetime`` column derived from ``created_utc`` (interpreted
    as seconds), arranges the columns in a fixed order, and writes the result
    to ``wallstreetbets_posts_<YYYYmmdd_HHMMSS>.csv`` in the working directory.

    Args:
        posts_data: List of post dicts; must provide every column listed below.

    Returns:
        The name of the CSV file written, or None when ``posts_data`` is empty.
    """
    if posts_data:
        frame = pd.DataFrame(posts_data)

        # Human-readable companion to the raw epoch value.
        with_datetime = frame.assign(
            created_datetime=pd.to_datetime(frame['created_utc'], unit='s')
        )

        # Fixed column layout for readability.
        ordered = with_datetime.loc[:, [
            'post_id',
            'title',
            'author_name',
            'score',
            'num_comments',
            'created_utc',
            'created_datetime',
            'url',
        ]]

        # The timestamp in the name keeps successive exports apart.
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        out_name = f"wallstreetbets_posts_{stamp}.csv"
        ordered.to_csv(out_name, index=False, encoding='utf-8')
        return out_name

    return None

In [17]:
def reddit_connect():
    """Create a ``praw.Reddit`` client from environment variables.

    Calls ``load_dotenv()`` first, then reads the five ``REDDIT_*`` variables
    listed below and passes them to ``praw.Reddit`` as keyword arguments.
    A variable that is not set is passed through as None.

    Returns:
        The constructed ``praw.Reddit`` instance.
    """
    load_dotenv()

    # praw.Reddit keyword -> environment variable that supplies its value.
    env_by_argument = {
        'client_id': 'REDDIT_CLIENT_ID',
        'client_secret': 'REDDIT_CLIENT_SECRET',
        'user_agent': 'REDDIT_USER_AGENT',
        'username': 'REDDIT_USERNAME',
        'password': 'REDDIT_PASSWORD',
    }
    credentials = {
        argument: os.getenv(variable)
        for argument, variable in env_by_argument.items()
    }
    return praw.Reddit(**credentials)

In [18]:
def _extract_post_info(submission):
    """Convert one submission object into the flat dict stored in the dataset."""
    # A missing author object is recorded as "[deleted]"; a failure while
    # reading its name is recorded as "[unavailable]".
    author_name = "[deleted]"
    if submission.author is not None:
        try:
            author_name = submission.author.name
        except Exception:
            author_name = "[unavailable]"

    # Titles are stored as-is (emojis and formatting preserved).
    return {
        "post_id": submission.id,
        "title": submission.title,
        "score": submission.score,
        "created_utc": submission.created_utc,
        "num_comments": submission.num_comments,
        "url": submission.url,
        "author_name": author_name
    }


def build_dataset(reddit, target_posts=50):
    """Build the dataset by scraping WallStreetBets posts.

    Resumes from the checkpoint returned by ``load_progress()``, fetches up to
    ``target_posts`` submissions from ``subreddit.new``, appends the ones not
    yet collected, checkpoints periodically through ``save_progress``, and
    finally exports a CSV through ``save_final_csv``.

    Args:
        reddit: Client object exposing ``subreddit(name)``.
        target_posts: Total number of posts wanted, counting posts restored
            from a previous session.

    Returns:
        A ``(posts_data, metrics)`` tuple. ``metrics`` takes one of three
        shapes:
          * normal completion: counts, duration, error count, CSV filename;
          * ``{'interrupted': True, 'posts_at_interruption': n}`` after Ctrl-C;
          * ``{'fatal_error': msg, 'posts_at_error': n}`` after any other
            exception outside the per-post handler.
    """
    save_every = 50    # checkpoint interval, in newly collected posts
    pause_every = 100  # short pause interval, to be nice to Reddit

    # Load previous progress
    posts_data, last_post_id = load_progress()
    start_count = len(posts_data)

    if start_count > 0:
        print(f"Resuming from {start_count} previously scraped posts...")
    else:
        print("Starting fresh scrape...")

    # Get subreddit
    subreddit = reddit.subreddit("wallstreetbets")

    # Track scraping metrics
    start_time = time.time()
    errors_count = 0

    try:
        # Materialize the listing so its length is known up front.
        print("Fetching post list from Reddit...")
        all_posts = list(subreddit.new(limit=target_posts))

        # Skip posts we already have if resuming
        if last_post_id:
            resume_index = 0
            for i, post in enumerate(all_posts):
                if post.id == last_post_id:
                    resume_index = i + 1
                    break
            all_posts = all_posts[resume_index:]
            print(f"Resuming from post index {resume_index}")

        # If the checkpointed post is no longer in the fetched window the
        # index above stays 0; drop anything already collected so a resumed
        # run cannot append duplicates.
        seen_ids = {
            saved.get("post_id") for saved in posts_data if isinstance(saved, dict)
        }
        all_posts = [post for post in all_posts if post.id not in seen_ids]

        # Clamp at zero: a negative count would turn the slice below into
        # "everything except the last N" instead of "nothing".
        posts_to_process = max(0, min(len(all_posts), target_posts - start_count))

        for submission in tqdm(all_posts[:posts_to_process],
                               desc="Scraping posts",
                               initial=start_count,
                               total=target_posts):
            try:
                posts_data.append(_extract_post_info(submission))
                new_count = len(posts_data) - start_count

                if new_count % save_every == 0:
                    save_progress(posts_data, submission.id)

                if new_count % pause_every == 0:
                    time.sleep(1)

            except Exception as e:
                # One bad post must not abort the whole run.
                errors_count += 1
                print(f"Error processing post {submission.id}: {str(e)}")
                continue

        # Final save
        save_progress(posts_data, posts_data[-1]["post_id"] if posts_data else None)

        # Calculate final metrics
        scraping_duration = time.time() - start_time
        new_posts = len(posts_data) - start_count

        # Save final CSV dataset
        csv_filename = save_final_csv(posts_data)

        # Return metrics for tracking
        metrics = {
            'total_posts_collected': len(posts_data),
            'new_posts_this_session': new_posts,
            'scraping_duration_minutes': scraping_duration / 60,
            'errors_encountered': errors_count,
            'posts_per_minute': new_posts / (scraping_duration / 60) if scraping_duration > 0 else 0,
            'csv_filename': csv_filename,
            'total_posts_available': len(all_posts),
            'resumed_from': start_count if start_count > 0 else None
        }

        print("\nDataset building completed!")
        print(f"Total posts collected: {metrics['total_posts_collected']}")
        print(f"New posts this session: {metrics['new_posts_this_session']}")
        print(f"Duration: {metrics['scraping_duration_minutes']:.2f} minutes")
        print(f"Errors: {metrics['errors_encountered']}")

        return posts_data, metrics

    except KeyboardInterrupt:
        # Save first so the message below is true when it is printed.
        save_progress(posts_data, posts_data[-1]["post_id"] if posts_data else None)
        print("\nScraping interrupted by user. Progress saved.")
        return posts_data, {'interrupted': True, 'posts_at_interruption': len(posts_data)}

    except Exception as e:
        print(f"Unexpected error: {str(e)}")
        save_progress(posts_data, posts_data[-1]["post_id"] if posts_data else None)
        return posts_data, {'fatal_error': str(e), 'posts_at_error': len(posts_data)}

def track_scraping_metrics(posts_data, metrics, target_posts):
    """Track scraping metrics and results in Neptune.

    Args:
        posts_data: List of collected post dicts; only its length is logged.
        metrics: Dict describing the run. Three shapes are recognised:
            one containing ``'interrupted'``, one containing ``'fatal_error'``,
            or the full set of completion metrics (``total_posts_collected``,
            ``new_posts_this_session``, ``scraping_duration_minutes``,
            ``errors_encountered``, ``posts_per_minute``,
            ``total_posts_available``, ``resumed_from`` and optionally
            ``csv_filename``).
        target_posts: The configured target, logged under ``config/``.

    Raises:
        Any exception from the Neptune client or a missing metrics key; the
        run is stopped before the exception propagates.
    """
    # Initialize Neptune
    run = neptune.init_run(project="krishnadasm/wallstreetbets-scraper")

    # try/finally so the run is closed even if a logging or upload call fails.
    try:
        # Log configuration
        run["config/target_posts"] = target_posts
        run["config/subreddit"] = "wallstreetbets"
        run["config/sort_method"] = "new"
        run["config/resume_enabled"] = True

        # Log scraping metrics
        if 'interrupted' in metrics:
            run["scraping/interrupted"] = True
            run["scraping/posts_at_interruption"] = metrics['posts_at_interruption']
        elif 'fatal_error' in metrics:
            run["scraping/fatal_error"] = metrics['fatal_error']
            run["scraping/posts_at_error"] = metrics['posts_at_error']
        else:
            # Log successful completion metrics
            run["results/total_posts_collected"] = metrics['total_posts_collected']
            run["results/new_posts_this_session"] = metrics['new_posts_this_session']
            run["results/scraping_duration_minutes"] = metrics['scraping_duration_minutes']
            run["results/errors_encountered"] = metrics['errors_encountered']
            run["results/posts_per_minute"] = metrics['posts_per_minute']
            run["scraping/total_posts_available"] = metrics['total_posts_available']

            # Only logged when the session resumed from earlier progress.
            if metrics['resumed_from']:
                run["scraping/resumed_from"] = metrics['resumed_from']

            # Upload final dataset to Neptune
            if metrics.get('csv_filename'):
                run["data/posts_dataset"].upload(metrics['csv_filename'])
            run["data/progress_file"].upload("scraping_progress.json")

        # Log final progress
        run["scraping/final_progress"] = len(posts_data)
    finally:
        run.stop()

    print("Metrics logged to Neptune successfully!")


In [22]:

def scrape_wallstreetbets(target_posts=50):
    """Main function that orchestrates the scraping process.

    Connects to Reddit, builds the dataset, then logs the outcome to Neptune.

    Args:
        target_posts: Number of posts to aim for. The same value is passed to
            both the scraper and the metrics logger so the two cannot drift
            apart.

    Returns:
        The ``(posts_data, metrics)`` tuple produced by ``build_dataset``.
    """
    # Build the dataset
    reddit = reddit_connect()
    # NOTE(review): printed right after the client object is constructed; this
    # function itself performs no request to confirm the credentials.
    print("Connected to Reddit API successfully.")
    posts_data, metrics = build_dataset(reddit, target_posts=target_posts)

    # Track metrics in Neptune
    track_scraping_metrics(posts_data, metrics, target_posts=target_posts)

    return posts_data, metrics

# Runs the scraper when this cell is executed.
if __name__ == "__main__":
    scrape_wallstreetbets()

Connected to Reddit API successfully.
Starting fresh scrape...
Fetching post list from Reddit...


Scraping posts: 100%|██████████| 50/50 [00:00<00:00, 1020.77it/s]



Dataset building completed!
Total posts collected: 50
New posts this session: 50
Duration: 0.04 minutes
Errors: 0
https://app.neptune.ai/krishnadasm/wallstreetbets-scraper/e/WAL-5
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 13 operations to synchronize with Neptune. Do not kill this process.
All 13 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/krishnadasm/wallstreetbets-scraper/e/WAL-5/metadata
Metrics logged to Neptune successfully!


Environment setup for the Reddit and neptune.ai secrets (loaded from a `.env` file)

### Neptune.ai Setup

In [7]:
from dotenv import load_dotenv
load_dotenv()

# Test that it loaded. Guard against a missing variable: slicing None would
# raise TypeError before the connection check below is reached.
neptune_token = os.getenv('NEPTUNE_API_TOKEN')
if neptune_token:
    print("Token loaded:", neptune_token[:10] + "...")  # Only shows first 10 chars
else:
    print("Token loaded: NEPTUNE_API_TOKEN is not set")

# neptune_api_token = user_secrets.get_secret("neptune_api")
run = None
try:
    run = neptune.init_run(
        project="krishnadasm/wallstreetbets-scraper"
    )
    # Inner try/finally: once a run exists it is always stopped, even if the
    # test write fails.
    try:
        run["test"] = "Connected with .env file!"
    finally:
        run.stop()
except Exception as ex:
    # Smoke test only: report the problem instead of failing the cell.
    print(f"Exception: {ex}")

Token loaded: eyJhcGlfYW...
https://app.neptune.ai/krishnadasm/wallstreetbets-scraper/e/WAL-4
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 1 operations to synchronize with Neptune. Do not kill this process.
All 1 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/krishnadasm/wallstreetbets-scraper/e/WAL-4/metadata


In [None]:
# ! pip install python-dotenv