In [4]:
from dotenv import load_dotenv
import os
import neptune
import praw
from datetime import datetime
import json
import time
import pandas as pd
from tqdm import tqdm

from pathlib import Path
# Parent of the directory the kernel was started in.
Root = Path('.').absolute().parent
# Dataset directory. Set the MANIPDETECT_DATA environment variable to point the
# notebook at the data folder on another machine; when it is unset or empty the
# original local path is used, so existing setups behave exactly as before.
# Joining an absolute path onto Root makes pathlib drop Root, so on Windows the
# default below resolves to the absolute path itself.
DATA = Root / (
    os.environ.get('MANIPDETECT_DATA')
    or r'C:\Users\krishnadas\Projects\ML Projects\ManipDetect\data'
)

In [2]:
def reddit_connect():
    """Build a praw.Reddit client from credentials held in environment variables.

    Calls load_dotenv() first, then reads REDDIT_CLIENT_ID,
    REDDIT_CLIENT_SECRET, REDDIT_USER_AGENT, REDDIT_USERNAME and
    REDDIT_PASSWORD with os.getenv. A variable that is not set is passed to
    praw as None.

    Returns:
        The praw.Reddit instance.
    """
    load_dotenv()
    # Collect the settings first so the constructor call stays a one-liner.
    credentials = {
        'client_id': os.getenv('REDDIT_CLIENT_ID'),
        'client_secret': os.getenv('REDDIT_CLIENT_SECRET'),
        'user_agent': os.getenv('REDDIT_USER_AGENT'),
        'username': os.getenv('REDDIT_USERNAME'),
        'password': os.getenv('REDDIT_PASSWORD'),
    }
    return praw.Reddit(**credentials)

In [3]:
def lookup_missing_authors(csv_file=None, post_ids=None):
    """
    Lookup missing author names for posts by ID

    Args:
        csv_file: Path to CSV file with post_id column. When given, the IDs
            read from it replace any post_ids argument.
        post_ids: List of post IDs to lookup

    Returns:
        Dictionary mapping post_id to author_name. A post whose author is
        None maps to "[deleted]"; a post whose lookup raised maps to
        "[error]". Returns an empty dict when there are no IDs to look up.
    """
    # Get post IDs from CSV or use provided list
    if csv_file:
        post_ids = pd.read_csv(csv_file)['post_id'].tolist()

    # Check for "nothing to do" before connecting. len() is used instead of
    # truthiness so array-like inputs do not raise an ambiguity error.
    if post_ids is None or len(post_ids) == 0:
        print("No post IDs provided")
        return {}

    # Initialize Reddit. reddit_connect must be *called*: binding the bare
    # function object makes every reddit.submission(...) below raise
    # AttributeError, which the except clause would record as "[error]".
    reddit = reddit_connect()

    author_lookup = {}
    errors = 0

    print(f"Looking up authors for {len(post_ids)} posts...")

    for i, post_id in enumerate(tqdm(post_ids, desc="Looking up authors")):
        try:
            submission = reddit.submission(id=post_id)

            if submission.author is not None:
                author_lookup[post_id] = submission.author.name
            else:
                author_lookup[post_id] = "[deleted]"

        except Exception as e:
            # Best effort: record the failure and keep going with the rest.
            print(f"Error with post {post_id}: {e}")
            author_lookup[post_id] = "[error]"
            errors += 1

        # Rate limiting - be nice to Reddit (pause 1s after every 50 lookups)
        if i % 50 == 0 and i > 0:
            time.sleep(1)

    print(f"Lookup completed. {errors} errors encountered.")
    return author_lookup

def update_csv_with_authors(csv_file, author_lookup):
    """Update CSV file with looked-up author names

    Reads csv_file, sets author_name for every row whose post_id is a key of
    author_lookup, and writes the result beside the input as
    "<stem>_with_authors<suffix>" (".csv" is used when the input has no
    suffix). The input file itself is never overwritten.

    Args:
        csv_file: Path to the CSV file (str or pathlib.Path) with a post_id
            column.
        author_lookup: Dictionary mapping post_id to author_name.

    Returns:
        The updated DataFrame.
    """
    source_path = Path(csv_file)
    df = pd.read_csv(source_path)

    # Update author names in a single pass: select the rows whose post_id has
    # an entry in the lookup, then map those IDs to their author names. The
    # guard keeps an empty lookup from touching the frame at all.
    if author_lookup:
        matched = df['post_id'].isin(list(author_lookup))
        df.loc[matched, 'author_name'] = df.loc[matched, 'post_id'].map(author_lookup)

    # Save updated file. Only the file name is rewritten, so a ".csv" inside a
    # directory name is left alone and pathlib.Path inputs work as well as str.
    suffix = source_path.suffix or '.csv'
    updated_filename = source_path.with_name(f"{source_path.stem}_with_authors{suffix}")
    df.to_csv(updated_filename, index=False)

    print(f"Updated CSV saved as: {updated_filename}")
    return df

In [5]:
# Load reddit_wsb.csv from the DATA directory into a DataFrame.
# NOTE(review): the preview rendered below shows an `id` column and neither
# `post_id` nor `author_name`; lookup_missing_authors / update_csv_with_authors
# read `post_id`, so this frame presumably needs a rename first - confirm.
filepath = DATA / 'reddit_wsb.csv'
df = pd.read_csv(filepath)
# Show the first rows as the cell output.
df.head()

Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp
0,"It's not about the money, it's about sending a...",55,l6ulcx,https://v.redd.it/6j75regs72e61,6,1611863000.0,,2021-01-28 21:37:41
1,Math Professor Scott Steiner says the numbers ...,110,l6uibd,https://v.redd.it/ah50lyny62e61,23,1611862000.0,,2021-01-28 21:32:10
2,Exit the system,0,l6uhhn,https://www.reddit.com/r/wallstreetbets/commen...,47,1611862000.0,The CEO of NASDAQ pushed to halt trading “to g...,2021-01-28 21:30:35
3,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,29,l6ugk6,https://sec.report/Document/0001193125-21-019848/,74,1611862000.0,,2021-01-28 21:28:57
4,"Not to distract from GME, just thought our AMC...",71,l6ufgy,https://i.redd.it/4h2sukb662e61.jpg,156,1611862000.0,,2021-01-28 21:26:56
