### Libraries

In [3]:
from dotenv import load_dotenv
import os
import neptune
import praw
from datetime import datetime
import json
import time
import pandas as pd
from tqdm import tqdm

from pathlib import Path
Root = Path('.').absolute().parent
SCRIPTS = Root / r'scripts'
# SCRIPTS = Root / r'C:\Users\Admin\Projects\ML Projects\ManipDetect\research\scripts'
DATA = Root/ r'C:\Users\krishnadas\Projects\ML Projects\ManipDetect\data'

In [4]:
# check how many posts are there
# %pip install psaw
################# Too slow #################
# from psaw import PushshiftAPI
# import pandas as pd

# api = PushshiftAPI()
# subreddit_name = "Wallstreetbetsnew"  # Replace with the desired subreddit
# api_request_generator = api.search_submissions(subreddit=subreddit_name, score=">=0")
# submissions = pd.DataFrame([submission for submission in api_request_generator])
# total_posts = len(submissions)
# print(f"Total posts in r/{subreddit_name}: {total_posts}")

### Reddit connection

In [13]:
def reddit_connect():
    """Initialize Reddit connection"""
    load_dotenv()
    reddit = praw.Reddit(
        client_id=os.getenv('REDDIT_CLIENT_ID'),
        client_secret=os.getenv('REDDIT_CLIENT_SECRET'),
        user_agent=os.getenv('REDDIT_USER_AGENT'),
        username=os.getenv('REDDIT_USERNAME'),
        password=os.getenv('REDDIT_PASSWORD')
    )
    return reddit

In [7]:
# Test scraping with a single post without function
reddit = reddit_connect()
subreddit = reddit.subreddit("wallstreetbetsnew")
posts = subreddit.new(limit=2)
for post in posts:
    print(f"Post ID: {post.id}")
    print(f"Title: {post.title}")
    print(f"Text: {post.selftext if post.selftext else 'N/A'}")
    print(f"Author: {post.author.name if post.author else 'N/A'}")
    print(f'author id: {post.author.id if post.author else "N/A"}')
    print(f"Created: {datetime.fromtimestamp(post.created_utc)}")
    print(f'Created UTC: {post.created_utc}')
    print(f"URL: {post.url}")
    print(f"Score: {post.score}")
    print(f"Comments: {post.num_comments}")
    print("-" * 40)

Post ID: 1ltjdp7
Title: BMNR & RGC Explode! 🔥 Massive Moves You Can’t Miss!
Text: 
🚀 $BMNR and $RGC are blasting off – and this video breaks it all down! We dive into the jaw-dropping price spikes, the catalysts behind the runs, and what to watch next. Whether you're riding the momentum or just curious, we cover:

📈 BMNR’s staggering % gains
🔍 What’s fueling the surge in RGC
🎯 Key resistance levels & entry zones
🔔 Risk tips & what could flip the script

👉 Hit LIKE if you're hyped, COMMENT with your take, and SUBSCRIBE for daily market breakdowns!

https://youtu.be/43-Z2T3nLlI?si=xg_F0B4iT-ayqRl1


Author: Mino3621
author id: rqvik0bj
Created: 2025-07-07 04:58:19
Created UTC: 1751857099.0
URL: https://www.reddit.com/r/Wallstreetbetsnew/comments/1ltjdp7/bmnr_rgc_explode_massive_moves_you_cant_miss/
Score: 2
Comments: 1
----------------------------------------
Post ID: 1ltefwz
Title: Two Special Situations with Near Term Catalysts: $DATS and $HIT;
Two After Hours Market Movers: $MDIA, $CG

### Build data set

In [14]:
def load_progress(filename="scraping_progress.json"):
    """Load previously scraped data if it exists"""
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            data = json.load(f)
            posts = data.get('posts', [])
            
            # Normalize old data structure to ensure consistency
            consistent_posts = []
            for post in posts:
                # Ensure all required fields exist with default values
                consistent_post_dict = {
                    'post_id': post.get('post_id', ''),
                    'title': post.get('title', ''),
                    'text': post.get('text', post.get('selftext', '')),  # Handle old 'selftext' field
                    'post_type': post.get('post_type', 'unknown'),
                    'author_name': post.get('author_name', post.get('author', '[unknown]')),  # Handle old 'author' field
                    'author_id':post.get('author_id', post.get('author_id', '')),
                    'score': post.get('score', 0),
                    'num_comments': post.get('num_comments', 0),
                    'created_utc': post.get('created_utc', 0),
                    'url': post.get('url', '')
                }
                consistent_posts.append(consistent_post_dict)
            
            return consistent_posts, data.get('last_post_id', None)
    except FileNotFoundError:
        return [], None

def save_progress(posts_data, last_post_id, filename="scraping_progress.json"):
    """Save current progress to file"""
    progress_data = {
        'posts': posts_data,
        'last_post_id': last_post_id,
        'saved_at': datetime.now().isoformat(),
        'total_posts': len(posts_data)
    }
    # Save progress as JSON (for resume functionality)
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(progress_data, f, ensure_ascii=False, indent=2)
    
    # Also save current data as CSV
    if posts_data:
        df = pd.DataFrame(posts_data)
        df.to_csv("wallstreetbetsnew_posts.csv", index=False, encoding='utf-8')


In [15]:
def save_final_csv(posts_data, filepath):
    """Save final dataset as CSV with proper formatting"""
    if not posts_data:
        return
        
    df = pd.DataFrame(posts_data)
    
    # Convert timestamp to readable format
    df['created_datetime'] = pd.to_datetime(df['created_utc'], unit='s')
    
    # Reorder columns for better readability, but only use columns that exist
    preferred_columns_order = ['post_id', 'title', 'text', 'post_type', 'author_name', 'author_id', 'score', 'num_comments', 
                                'created_utc', 'url']
    
    # Filter to only include columns that actually exist in the DataFrame
    available_columns = [col for col in preferred_columns_order if col in df.columns]
    
    # Add any remaining columns that weren't in our preferred order
    remaining_columns = [col for col in df.columns if col not in available_columns]
    final_columns_order = available_columns + remaining_columns
    
    df = df[final_columns_order]
    
    # Save with timestamp in filename
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_filename = filepath/f"wallstreetbetsnew_posts_{timestamp}.csv"
    df.to_csv(csv_filename, index=False, encoding='utf-8')
    
    return csv_filename


In [24]:
def build_dataset(reddit, target_posts=10):
    """Build the dataset by scraping WallStreetBetsnew posts"""
    
    # Load previous progress
    filepath = r'C:\Users\krishnadas\Projects\ML Projects\ManipDetect\research\scripts\temp_data'  # Define the path to save progress
    posts_data, last_post_id = load_progress()
    start_count = len(posts_data)
    
    if start_count > 0:
        print(f"Resuming from {start_count} previously scraped posts...")
    else:
        print("Starting fresh scrape...")
    
    # Get subreddit
    # subreddit = reddit.subreddit("wallstreetbets")
    # change to new subreddit
    subreddit = reddit.subreddit("wallstreetbetsnew")

    
    # Track scraping metrics
    start_time = time.time()
    errors_count = 0
    
    try:
        # Get posts (PRAW handles pagination automatically)
        posts_generator = subreddit.new(limit=target_posts)
        
        # Convert to list to get total count for progress bar
        print("Fetching post list from Reddit...")
        all_posts = list(posts_generator)
        
        # Skip posts we already have if resuming
        if last_post_id:
            # Find where to resume
            resume_index = 0
            for i, post in enumerate(all_posts):
                if post.id == last_post_id:
                    resume_index = i + 1
                    break
            all_posts = all_posts[resume_index:]
            print(f"Resuming from post index {resume_index}")
        
        # Process remaining posts
        posts_to_process = min(len(all_posts), target_posts - start_count)
        
        for i, submission in enumerate(tqdm(all_posts[:posts_to_process], 
                                        desc="Scraping posts", 
                                        initial=start_count, 
                                        total=target_posts)):
            try:
                # Handle author safely
                author_name = "[deleted]"
                author_id = None
                if submission.author is not None:
                    try:
                        author_name = submission.author.name
                        author_id = submission.author.id
                    except Exception:
                        author_name = "[unavailable]"
                        author_id = None
                
                # Extract post text content with better categorization
                post_text = ""
                post_type = "text"  # Default type
                
                if submission.is_self:  # Text post
                    if submission.selftext:
                        post_text = submission.selftext
                        post_type = "text"
                    else:
                        post_text = "[Empty text post]"
                        post_type = "text_empty"
                else:  # Link post
                    post_text = "[Link Post]"
                    post_type = "link"
                    
                    # You could also categorize by URL type
                    if any(img_ext in submission.url.lower() for img_ext in ['.jpg', '.jpeg', '.png', '.gif']):
                        post_type = "image"
                    elif 'youtube.com' in submission.url.lower() or 'youtu.be' in submission.url.lower():
                        post_type = "video"
                
                # Extract post data (keeping titles and emojis intact)
                post_info = {
                    "post_id": submission.id,
                    "title": submission.title,  # Preserves emojis and formatting
                    "text": post_text,  # The actual post content
                    "post_type": post_type,  # Type of post for analysis
                    "author_name": author_name,  # Author's name
                    "author_id": author_id,  # Author's ID
                    "score": submission.score,
                    "created_utc": submission.created_utc,
                    "num_comments": submission.num_comments,
                    "url": submission.url,
                }
                
                posts_data.append(post_info)
                
                # Save progress every 50 posts
                if (len(posts_data) - start_count) % 50 == 0:
                    save_progress(posts_data, submission.id)
                
                # Small delay every 100 posts to be nice to Reddit
                if (len(posts_data) - start_count) % 100 == 0:
                    time.sleep(1)
                    
            except Exception as e:
                errors_count += 1
                print(f"Error processing post {submission.id}: {str(e)}")
                continue
        
        # Final save
        save_progress(posts_data, posts_data[-1]["post_id"] if posts_data else None)
        
        # Calculate final metrics
        end_time = time.time()
        scraping_duration = end_time - start_time
        
        # Save final CSV dataset
        csv_filename = save_final_csv(posts_data, filepath)
        
        # Return metrics for tracking
        metrics = {
            'total_posts_collected': len(posts_data),
            'new_posts_this_session': len(posts_data) - start_count,
            'scraping_duration_minutes': scraping_duration / 60,
            'errors_encountered': errors_count,
            'posts_per_minute': (len(posts_data) - start_count) / (scraping_duration / 60) if scraping_duration > 0 else 0,
            'csv_filename': str(csv_filename),
            'total_posts_available': len(all_posts),
            'resumed_from': start_count if start_count > 0 else None
        }
        
        print(f"\nDataset building completed!")
        print(f"Total posts collected: {metrics['total_posts_collected']}")
        print(f"New posts this session: {metrics['new_posts_this_session']}")
        print(f"Duration: {metrics['scraping_duration_minutes']:.2f} minutes")
        print(f"Errors: {metrics['errors_encountered']}")
        
        return posts_data, metrics
        
    except KeyboardInterrupt:
        print("\nScraping interrupted by user. Progress saved.")
        save_progress(posts_data, posts_data[-1]["post_id"] if posts_data else None)
        return posts_data, {'interrupted': True, 'posts_at_interruption': len(posts_data)}
        
    except Exception as e:
        print(f"Unexpected error: {str(e)}")
        save_progress(posts_data, posts_data[-1]["post_id"] if posts_data else None)
        return posts_data, {'fatal_error': str(e), 'posts_at_error': len(posts_data)}


In [25]:

def track_scraping_metrics(posts_data, metrics, target_posts):
    """Track scraping metrics and results in Neptune"""
    # Initialize Neptune
    run = neptune.init_run(project="krishnadasm/wallstreetbets-scraper")
    
    # Log configuration
    run["config/target_posts"] = target_posts
    run["config/subreddit"] = "wallstreetbetsnew"
    run["config/sort_method"] = "new"
    run["config/resume_enabled"] = True
    
    # Log scraping metrics
    if 'interrupted' in metrics:
        run["scraping/interrupted"] = True
        run["scraping/posts_at_interruption"] = metrics['posts_at_interruption']
    elif 'fatal_error' in metrics:
        run["scraping/fatal_error"] = metrics['fatal_error']
        run["scraping/posts_at_error"] = metrics['posts_at_error']
    else:
        # Log successful completion metrics
        run["results/total_posts_collected"] = metrics['total_posts_collected']
        run["results/new_posts_this_session"] = metrics['new_posts_this_session']
        run["results/scraping_duration_minutes"] = metrics['scraping_duration_minutes']
        run["results/errors_encountered"] = metrics['errors_encountered']
        run["results/posts_per_minute"] = metrics['posts_per_minute']
        run["scraping/total_posts_available"] = metrics['total_posts_available']
        
        if metrics['resumed_from']:
            run["scraping/resumed_from"] = metrics['resumed_from']
        
        # Upload final dataset to Neptune
        if metrics.get('csv_filename'):
            run["data/posts_dataset"].upload(metrics['csv_filename'])
        run["data/progress_file"].upload("scraping_progress.json")
    
    # Log final progress
    run["scraping/final_progress"] = len(posts_data)
    
    run.stop()
    print("Metrics logged to Neptune successfully!")


In [26]:
def scrape_wallstreetbetsnew():
    """Main function that orchestrates the scraping process"""
    # Build the dataset
    reddit = reddit_connect()
    print("Connected to Reddit API successfully.")
    posts_data, metrics = build_dataset(reddit, target_posts=10)
    
    # Track metrics in Neptune
    track_scraping_metrics(posts_data, metrics, target_posts=10)
    
    return posts_data, metrics

if __name__ == "__main__":
    scrape_wallstreetbetsnew()
    

Connected to Reddit API successfully.
Resuming from 10 previously scraped posts...
Fetching post list from Reddit...
Resuming from post index 0


Scraping posts: 100%|██████████| 10/10 [00:00<?, ?it/s]

Unexpected error: unsupported operand type(s) for /: 'str' and 'str'
[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/krishnadasm/wallstreetbets-scraper/e/WAL-38
[neptune] [info   ] Shutting down background jobs, please wait a moment...





[neptune] [info   ] Done!
[neptune] [info   ] Waiting for the remaining 13 operations to synchronize with Neptune. Do not kill this process.
[neptune] [info   ] All 13 operations synced, thanks for waiting!
[neptune] [info   ] Explore the metadata in the Neptune app: https://app.neptune.ai/krishnadasm/wallstreetbets-scraper/e/WAL-38/metadata
Metrics logged to Neptune successfully!


In [28]:
df = pd.read_csv("wallstreetbetsnew_posts.csv")
df.head()

Unnamed: 0,post_id,title,text,post_type,author_name,author_id,score,num_comments,created_utc,url
0,1ltjdp7,BMNR & RGC Explode! 🔥 Massive Moves You Can’t ...,\n🚀 $BMNR and $RGC are blasting off – and this...,text,Mino3621,rqvik0bj,2,1,1751857000.0,https://www.reddit.com/r/Wallstreetbetsnew/com...
1,1ltefwz,Two Special Situations with Near Term Catalyst...,"**After Hours Market Movers--$MDIA, $CGTX--on ...",text,Marketspike,a159k26r,3,0,1751842000.0,https://www.reddit.com/r/Wallstreetbetsnew/com...
2,1ltcd94,Investment Opportunity of a Lifetime - $215M M...,[https://stocktwits.com/Gps\_100X\_ROI\_Potent...,text,Run4theRoses2,60p5ejtz,5,0,1751837000.0,https://www.reddit.com/r/Wallstreetbetsnew/com...
3,1lsi2f1,AGMH Maby look at this?,\n\nAGMH signed a deal to sell a subsidiary fo...,text,mariusvell,nspvrhts,30,0,1751744000.0,https://www.reddit.com/r/Wallstreetbetsnew/com...
4,1lsacg0,Why invest in handsome?,So why invest in handsome… in beautiful… in br...,text,Future_Fund2025,1ivpzm4bpm,0,2,1751723000.0,https://www.reddit.com/r/Wallstreetbetsnew/com...


### Testing

In [39]:
with open('scraping_progress.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
    posts = data.get('posts',[])
    consistent_posts = []
    for post in posts:
        consistent_post_dict = {
        'post_id': post.get('post_id', ''),
        'title': post.get('title', ''),
        'text': post.get('text', post.get('selftext', '')),  # Handle old 'selftext' field
        'post_type': post.get('post_type', 'unknown'),
        'author_name': post.get('author_name', post.get('author', '[unknown]')),  # Handle old 'author' field
        'author_id': post.get('author_id', post.get('author_id', '')),
        'score': post.get('score', 0),
        'num_comments': post.get('num_comments', 0),
        'created_utc': post.get('created_utc', 0),
        'url': post.get('url', '')
        }
        consistent_posts.append(consistent_post_dict)

In [49]:
consistent_posts[3].get('post_id')

'1lsi2f1'

In [53]:
x=list(set([post.get('post_id') for post in consistent_posts]))

In [None]:
a = [1, 2,3,4,5,6,7,9,0]
a[:2]

[1, 2]

: 