# News Data Processing: REDDIT

In [1]:
import pandas as pd
import numpy as np
import praw # Python Reddit API Wrapper
import json
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import datetime
from dotenv import load_dotenv
import os
import datetime as dt
import re
import time

## Connect to Reddit API

In [3]:
load_dotenv()
client_id = os.getenv('REDDIT_CLIENT_ID')
client_secret = os.getenv('REDDIT_CLIENT_SECRET')
user_agent = 'MLCapstoneProject by /u/Wise-Reward5805'
username =  'Wise-Reward5805'
password = os.getenv('REDDIT_PASSWORD')

In [4]:
def create_reddit_object(client_id, client_secret, user_agent, username, password):
    reddit = praw.Reddit(
        client_id=client_id,
        client_secret=client_secret,
        user_agent=user_agent,
        username=username,
        password=password
    )
    return reddit

In [5]:
reddit = create_reddit_object(client_id, client_secret, user_agent, username, password)
print(reddit.user.me())  # Verify the authentication by printing the username


Wise-Reward5805


## Get Potential Universe

**Data Downloaded from WRDS (Up to December 31, 2022)**

#Use this script to webscrape changes to the SP500 and use it to update investment universe


url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies#Selected_changes_to_the_list_of_S&P_500_components'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
req = Request(url, headers=headers)
html = urlopen(req)
soup = BeautifulSoup(html, 'html.parser')
table_list = soup.find_all('table')
table = table_list[1]  # The second list contains changes to the S&P consituents

table_headers = []
for th in table.find_all('th'):
    table_headers.append(th.text.strip())

rows = []
for tr in table.find_all('tr')[1:]:
    cells = [td.text.strip() for td in tr.find_all('td')]
    if cells:
        rows.append(cells)

spy_changes_df = pd.DataFrame(rows, columns=table_headers[:len(rows[0])])
spy_changes_df.columns = ['Effective Date', 'Added Ticker', 'Added Security', 'Removed Ticker', 'Removed Security', 'Reason']
spy_changes_df['Effective Date'] = pd.to_datetime(spy_changes_df['Effective Date'])
spy_changes_df.to_excel('../data/spy_changes.xlsx', index=False)

In [6]:
spy_constituents = pd.read_excel('../data/sp_500_constituents.xlsx')
spy_constituents.head()

Unnamed: 0,PERMNO,Company Name,Ticker,SP500 Start,SP500 End,Search Keywords
0,10104,ORACLE CORP,ORCL,"Aug. 3, 1989","Dec. 31, 2024","ORCL, Oracle, Oracle Corp"
1,10107,MICROSOFT CORP,MSFT,1994-06-07 00:00:00,"Dec. 31, 2024","MSFT, Microsoft, Microsoft Corp"
2,10138,T ROWE PRICE GROUP INC,TROW,"Oct. 13, 1999","Dec. 31, 2024","T Rowe Price, T Rowe Price Group Inc, TROW, T ..."
3,10145,HONEYWELL INTERNATIONAL INC,HON,"Dec. 31, 1925","Dec. 31, 2024","HON, Honeywell, Honeywell International Inc"
4,10516,ARCHER DANIELS MIDLAND CO,ADM,1981-07-30 00:00:00,"Dec. 31, 2024","ADM, Archer Daniels Midland, Archer Daniels Mi..."


In [7]:
all_permnos = spy_constituents['PERMNO'].tolist()
query_dict = {}
for index, row in spy_constituents.iterrows():
    key = row['PERMNO']
    values = row['Search Keywords'].split(', ')
    query_dict[key] = values

In [10]:
for i in query_dict.values():
    print(i)

['ORCL', 'Oracle', 'Oracle Corp']
['MSFT', 'Microsoft', 'Microsoft Corp']
['T Rowe Price', 'T Rowe Price Group Inc', 'TROW', 'T Rowe']
['HON', 'Honeywell', 'Honeywell International Inc']
['ADM', 'Archer Daniels Midland', 'Archer Daniels Midland Co', 'Archer Daniels']
['FISV', 'Fiserv', 'Fiserv Inc']
['Coca Cola', ' Cocala', ' KO', 'Coke']
['CDNS', 'Cadence Design', 'Cadence Design Systems', 'Cadence Design Systems Inc']
['Consolidated Edison', 'Consolidated Edison Inc', 'ED']
['Dentsply Sirona', 'Dentsply Sirona Inc', 'XRAY']
['FAST', 'Fastenal', 'Fastenal Co']
['DTE Energy', 'DTE Energy Co', 'DTE']
['ETN', 'Eaton', 'Eaton Corp', 'Eaton Corp Plc']
['SVB Financial', 'SVB Financial Group', 'SIVB', 'Silicon Valley Bank']
['Exxon Mobil', 'Exxon Mobil Corp', 'XOM']
['MGM Resorts', 'MGM Resorts International', 'MGM']
['WM', 'Waste Management Del', 'Waste Management Inc Del']
['GD', 'General Dynamics', 'General Dynamics Corp']
['GE', 'General Electric', 'General Electric Co']
['LH', 'Laborato

## Query News from Subreddits

Subreddits of interest
* r/stocks
* r/investing
* r/wallstreetbets
* r/ValueInvesting
* r/StockMarket
* r/SecurityAnalysis
* r/finance

In [12]:
subreddits = ['stocks', 'investing', 'wallstreetbets', 'ValueInvesting', 'StockMarket', 'SecurityAnalysis', 'finance']
news_df = pd.DataFrame()
for subreddit in subreddits:
    
    subred = reddit.subreddit(subreddit)
    print(f"Pulling Data From Subreddit: {subreddit}")


Pulling Data From Subreddit: stocks
Pulling Data From Subreddit: investing
Pulling Data From Subreddit: wallstreetbets
Pulling Data From Subreddit: ValueInvesting
Pulling Data From Subreddit: StockMarket
Pulling Data From Subreddit: SecurityAnalysis
Pulling Data From Subreddit: finance


In [13]:
# --- 3. DEFINE FILTERING THRESHOLDS ---
MIN_SUBMISSION_SCORE = 20       # Minimum upvotes for a post to be considered
MIN_COMMENT_SCORE = 5           # Minimum upvotes for a comment to be considered
MIN_USER_KARMA = 100            # Minimum karma for a commenter's account
MIN_ACCOUNT_AGE_DAYS = 90       # Minimum age of a commenter's account in days

scraped_data = []

In [14]:
subred = reddit.subreddit('stocks')

In [2]:
def _mk_query(aliases, extra=None):
    alias_q = " OR ".join([f'"{a}"' for a in aliases])
    return alias_q

In [None]:
 # Perform the search for submissions within the last 5 years
# Note: Reddit search is powerful but may not be perfectly exhaustive for long timeframes.
from time import time


WHITELISTED_DOMAINS = [
    'reuters.com', 'apnews.com', 'bloomberg.com', 'wsj.com',
    'cnbc.com', 'forbes.com', 'marketwatch.com', 'nytimes.com', 'alphasense.com'
]
company_name="Oracle"
ticker="ORCL"
sub_name="stocks"
query = f"{query_dict[all_permnos[0]][0]} OR {query_dict[all_permnos[0]][1]} OR {query_dict[all_permnos[0]][2]}"
for submission in subred.search(query, sort='relevance', time_filter='all', limit=100):
    
    # --- APPLY SUBMISSION FILTERS ---
    # 1. Filter by whitelisted domain
    # 2. Filter by score
    is_high_score = submission.score >= MIN_SUBMISSION_SCORE

    if is_high_score:
        print(f"  -> Found relevant post: '{submission.title}' in r/{sub_name}")
        
        # Fetch comments
        submission.comments.replace_more(limit=0) # Get all top-level comments
        for comment in submission.comments:
            # --- START: MODIFIED BLOCK ---
            try:
                # Avoid deleted comments or users from the start
                if comment.author is None or comment.body == '[deleted]':
                    continue

                # --- APPLY COMMENT/USER FILTERS ---
                account_age_days = (dt.datetime.now(dt.timezone.utc) - dt.datetime.fromtimestamp(comment.author.created_utc, tz=dt.timezone.utc)).days

                if comment.score >= MIN_COMMENT_SCORE and \
                    comment.author.comment_karma >= MIN_USER_KARMA and \
                    account_age_days >= MIN_ACCOUNT_AGE_DAYS:
                    
                    # Add the data to our list
                    scraped_data.append({
                        'company': company_name,
                        'ticker': ticker,
                        'subreddit': sub_name,
                        'submission_title': submission.title,
                        'submission_score': submission.score,
                        'submission_url': submission.url,
                        'submission_date': dt.datetime.fromtimestamp(submission.created_utc).strftime('%Y-%m-%d'),
                        'comment_body': comment.body,
                        'comment_score': comment.score,
                        'comment_author_karma': comment.author.comment_karma,
                        'comment_date': dt.datetime.fromtimestamp(comment.created_utc).strftime('%Y-%m-%d')
                    })
                time.sleep(0.7)  # Be respectful to Reddit's API

            except AttributeError:
                # This will catch comments from suspended users that cause the error
                # print(f"  -> Skipping comment from a suspended or deleted user.")
                continue


  -> Found relevant post: 'TikTok’s Algorithm to Be Secured by Oracle in Trump-Backed Deal' in r/stocks
  -> Found relevant post: 'Oracle Corp. (ORCL) Agrees to Buy Medical-Records Systems Provider Cerner Corp. (CERN) for $28.3 Billion All-Cash Deal' in r/stocks
  -> Found relevant post: 'Oracle Chosen as Winner in Deal for TikTok’s U.S. Operations' in r/stocks
  -> Found relevant post: 'The New Normal - 25% of the S&P 100 companies are >70 RSI signaling "overbought"' in r/stocks
  -> Found relevant post: 'Here is a Market Recap for today Tuesday, March 9, 2021. Please enjoy!' in r/stocks
  -> Found relevant post: 'Wall Street Week Ahead for the trading week beginning June 15th, 2020' in r/stocks
  -> Found relevant post: 'Get ready for the trading week beginning December 17th, 2018!' in r/stocks
  -> Found relevant post: 'Here is a Market Recap for today Friday, December 10, 2021. Please enjoy!' in r/stocks
  -> Found relevant post: 'Your AM Global Stocks Preview and a whole lot more 

TooManyRequests: received 429 HTTP response

In [42]:
scraped_data

[]

In [None]:
for search in query_dict.values():
    # Create a search query
    query = _mk_query(aliases=search)
    print(f"--- Searching for: {query} ---")

    # Loop through each target subreddit
    for sub_name in subreddits:
        subreddit = reddit.subreddit(sub_name)
        
        # Perform the search for submissions within the last 5 years
        # Note: Reddit search is powerful but may not be perfectly exhaustive for long timeframes.
        for submission in subreddit.search(query, sort='relevance', time_filter='all', limit=100):
            
            # --- APPLY SUBMISSION FILTERS ---
            # 1. Filter by whitelisted domain
            is_whitelisted = any(domain in submission.url for domain in WHITELISTED_DOMAINS)
            # 2. Filter by score
            is_high_score = submission.score >= MIN_SUBMISSION_SCORE

            if is_whitelisted and is_high_score:
                print(f"  -> Found relevant post: '{submission.title}' in r/{sub_name}")
                
                # Fetch comments
                submission.comments.replace_more(limit=0) # Get all top-level comments
                for comment in submission.comments:
                    # Avoid deleted comments or users
                    if comment.author is None or comment.body == '[deleted]':
                        continue

                    # --- APPLY COMMENT/USER FILTERS ---
                    account_age_days = (dt.datetime.now(dt.timezone.utc) - dt.datetime.fromtimestamp(comment.author.created_utc, tz=dt.timezone.utc)).days
                    
                    if comment.score >= MIN_COMMENT_SCORE and \
                       comment.author.comment_karma >= MIN_USER_KARMA and \
                       account_age_days >= MIN_ACCOUNT_AGE_DAYS:
                        
                        # Add the data to our list
                        scraped_data.append({
                            'company': company_name,
                            'ticker': ticker,
                            'subreddit': sub_name,
                            'submission_title': submission.title,
                            'submission_score': submission.score,
                            'submission_url': submission.url,
                            'comment_body': comment.body,
                            'comment_score': comment.score,
                            'comment_author_karma': comment.author.comment_karma
                        })


In [14]:
for search in query_dict.values():
    # Create a search query
    query = _mk_query(aliases=search)
    print(f"--- Searching for: {query} ---")
    # Loop through each target subreddit
    for sub_name in subreddits:
        subreddit = reddit.subreddit(sub_name)
        
        # Perform the search for submissions
        for submission in subreddit.search(query, sort='relevance', time_filter='all', limit=100):
            
            # --- APPLY SUBMISSION FILTERS ---
            
            # 1. Filter by score
            is_high_score = submission.score >= MIN_SUBMISSION_SCORE
            
            # 2. NEW FILTER: Check if it's a text post AND has text
            #    This replaces your 'is_whitelisted' filter.
            is_text_post = submission.is_self and submission.selftext and submission.selftext != '[deleted]'

            # Check our new filters
            if is_high_score and is_text_post:
                print(f"  -> Found relevant post: '{submission.title}' in r/{sub_name}")
                
                # --- NO COMMENT LOOP NEEDED ---
                # We are saving the submission body itself.
                
                # Add the data to our list
                scraped_data.append({
                    'submission_date': dt.datetime.fromtimestamp(submission.created_utc).strftime('%Y-%m-%d'),
                    'subreddit': sub_name,
                    'submission_title': submission.title,
                    'submission_score': submission.score,
                    'submission_url': submission.url, # This will be the URL to the reddit post
                    'submission_body': submission.selftext # This is the main text you wanted
                    # Note: Comment-specific fields are removed
                })
                

# ... (rest of your script) ...


--- Searching for: "ORCL" OR "Oracle" OR "Oracle Corp" ---
  -> Found relevant post: 'TikTok’s Algorithm to Be Secured by Oracle in Trump-Backed Deal' in r/stocks
  -> Found relevant post: 'Oracle Chosen as Winner in Deal for TikTok’s U.S. Operations' in r/stocks
  -> Found relevant post: 'Oracle Corp. (ORCL) Agrees to Buy Medical-Records Systems Provider Cerner Corp. (CERN) for $28.3 Billion All-Cash Deal' in r/stocks
  -> Found relevant post: 'The New Normal - 25% of the S&P 100 companies are >70 RSI signaling "overbought"' in r/stocks
  -> Found relevant post: 'Wall Street Week Ahead for the trading week beginning June 15th, 2020' in r/stocks
  -> Found relevant post: 'Oracle stock booms 30%, on pace for best day since 1999' in r/stocks
  -> Found relevant post: 'Get ready for the trading week beginning December 17th, 2018!' in r/stocks
  -> Found relevant post: '$ORCL down 34% from September highs - Is this the dip to buy or a value trap?' in r/stocks
  -> Found relevant post: 'Ora

In [None]:
scraped_data = pd.DataFrame(scraped_data)

66701

In [None]:
master_csv_path = '../data/reddit_news.csv'

print("\n--- Updating Master CSV File ---")

try:
    # 2. Load the existing master CSV if it exists
    master_df = pd.read_csv(master_csv_path)
    print(f"Loaded {len(master_df)} articles from '{master_csv_path}'")
    
    # 3. Combine the existing data with the newly fetched articles
    combined_df = pd.concat([master_df, month_data], ignore_index=True)
    
except FileNotFoundError:
    # If the master file doesn't exist, the new data is our starting point
    print(f"Master file not found. Creating a new one at '{master_csv_path}'")
    combined_df = month_data

# 4. Remove duplicate articles to keep the master file clean
# We use the 'url' as the unique identifier for an article.
# 'keep="last"' is good practice in case article details were ever updated.
updated_master_df = combined_df.drop_duplicates(subset=['url', 'title'], keep='last')

# 5. Save the final, de-duplicated data back to the master CSV
# 'index=False' prevents pandas from writing the DataFrame index as a column.
updated_master_df.to_csv(master_csv_path, index=False)

new_articles_count = len(updated_master_df) - (len(master_df) if 'master_df' in locals() else 0)
print(f"Added {new_articles_count} new unique articles.")
print(f"Master file successfully updated. Total articles: {len(updated_master_df)}")