# News Data Processing: REDDIT

In [18]:
import pandas as pd
import numpy as np
import praw # Python Reddit API Wrapper
import json
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import datetime
from dotenv import load_dotenv
import os
import re

## Connect to Reddit API

In [13]:
load_dotenv()
client_id = os.getenv('CLIENT_ID')
client_secret = os.getenv('CLIENT_SECRET')
user_agent = 'MLCapstoneProject by /u/Wise-Reward5805'
username =  'Wise-Reward5805'
password = os.getenv('PASSWORD')

In [14]:
def create_reddit_object(client_id, client_secret, user_agent, username, password):
    reddit = praw.Reddit(
        client_id=client_id,
        client_secret=client_secret,
        user_agent=user_agent,
        username=username,
        password=password
    )
    return reddit

In [15]:
reddit = create_reddit_object(client_id, client_secret, user_agent, username, password)
print(reddit.user.me())  # Verify the authentication by printing the username


Wise-Reward5805


## Get Potential Universe

**Data Downloaded from WRDS (Up to December 31, 2022)**

#Use this script to webscrape changes to the SP500 and use it to update investment universe


url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies#Selected_changes_to_the_list_of_S&P_500_components'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
req = Request(url, headers=headers)
html = urlopen(req)
soup = BeautifulSoup(html, 'html.parser')
table_list = soup.find_all('table')
table = table_list[1]  # The second list contains changes to the S&P consituents

table_headers = []
for th in table.find_all('th'):
    table_headers.append(th.text.strip())

rows = []
for tr in table.find_all('tr')[1:]:
    cells = [td.text.strip() for td in tr.find_all('td')]
    if cells:
        rows.append(cells)

spy_changes_df = pd.DataFrame(rows, columns=table_headers[:len(rows[0])])
spy_changes_df.columns = ['Effective Date', 'Added Ticker', 'Added Security', 'Removed Ticker', 'Removed Security', 'Reason']
spy_changes_df['Effective Date'] = pd.to_datetime(spy_changes_df['Effective Date'])
spy_changes_df.to_excel('../data/spy_changes.xlsx', index=False)

In [6]:
spy_constituents = pd.read_csv('../data/sp_500_constituents.csv')
spy_constituents.head()

Unnamed: 0,PERMNO,Company Name,Ticker,SP500 Start,SP500 End
0,10104,ORACLE CORP,ORCL,"Aug. 3, 1989","Dec. 31, 2024"
1,10107,MICROSOFT CORP,MSFT,"June 7, 1994","Dec. 31, 2024"
2,10138,T ROWE PRICE GROUP INC,TROW,"Oct. 13, 1999","Dec. 31, 2024"
3,10145,HONEYWELL INTERNATIONAL INC,HON,"Dec. 31, 1925","Dec. 31, 2024"
4,10516,ARCHER DANIELS MIDLAND CO,ADM,"July 30, 1981","Dec. 31, 2024"


In [None]:
def clean_company_name(name):
    """
    Removes common corporate suffixes and extra whitespace from a company name.
    """
    # List of suffixes to remove (case-insensitive)
    suffixes = [
        'CORPORATION', 'CORP', 'INCORPORATED', 'INC', 'COMPANY',
        'LIMITED', 'LTD', 'PLC', 'LLC', 'LP', 'INTERNATIONAL', 
        'HOLDINGS', 'CO', 'GROUP', 'TECHNOLOGIES', 'TECH', 
        'SYSTEMS', 'SOLUTIONS', 'SERVICES', 'INDUSTRIES', 
        'ENTERPRISES', 'GLOBAL', 'NETWORKS', 'SOFTWARE',
          'HOLDING', 'AG', 'S.A.', 'DEL', 'N V', '(The)',
          
    ]
    # Create a regex pattern to find any of these suffixes at the end of the string
    # The `\b` ensures we match whole words only.
    pattern = r'\b(' + '|'.join(suffixes) + r')\b\.?'
    
    # Remove the suffixes, ignoring case
    cleaned_name = re.sub(pattern, '', name, flags=re.IGNORECASE)
    
    # Remove any trailing commas or whitespace and return
    return cleaned_name.strip(' ,')


In [21]:
spy_constituents['Company Name'].iloc[0]

'ORACLE CORP'

In [23]:
names = spy_constituents['Company Name'].apply(lambda x: clean_company_name(x)).tolist()

In [24]:
names

['ORACLE',
 'MICROSOFT',
 'T ROWE PRICE',
 'HONEYWELL',
 'ARCHER DANIELS MIDLAND',
 'FISERV',
 'COCA COLA',
 'CADENCE DESIGN',
 'CONSOLIDATED EDISON',
 'DENTSPLY SIRONA',
 'FASTENAL',
 'D T E ENERGY',
 'EATON',
 'S V B FINANCIAL',
 'EXXON MOBIL',
 'M G M RESORTS',
 'WASTE MANAGEMENT  DEL',
 'GENERAL DYNAMICS',
 'GENERAL ELECTRIC',
 'LABORATORY  AMERICA HLDGS',
 'N X P SEMICONDUCTORS N V',
 'CHARTER COMMUNICATIONS  NEW',
 'LYONDELLBASELL  N V',
 'GENERAL MOTORS',
 'TRANE',
 'FIRST REPUBLIC BANK S F NEW',
 'FLEETCOR',
 'TARGA RESOURCES',
 'BUSINESS MACHS COR',
 'KINDER MORGAN',
 'H C A HEALTHCARE',
 'HUNTINGTON INGALLS INDS',
 'MARATHON PETROLEUM',
 'XYLEM',
 'APTIV',
 'EPAM',
 'ENPHASE ENERGY',
 'PHILLIPS 66',
 'META PLATFORMS',
 'SERVICENOW',
 'PENTAIR',
 'DIAMONDBACK ENERGY',
 'P G & E',
 'ABBVIE',
 'NORWEGIAN CRUISE LINE HLDGS',
 'ZOETIS',
 'PEPSICO',
 'ALTRIA',
 'IQVIA',
 'CONOCOPHILLIPS',
 'C D W  NEW',
 'NEWS  NEW',
 'NEWS  NEW',
 'AMGEN',
 'SCHLUMBERGER',
 'ALLEGION',
 'HILTON WO

In [7]:
all_permnos = spy_constituents['PERMNO'].tolist()
query_dict = {}
for index, row in spy_constituents.iterrows():
    key = row['PERMNO']
    values = [row['Ticker'], row['Company Name']]
    query_dict[key] = values

In [8]:
query_dict

{10104: ['ORCL', 'ORACLE CORP'],
 10107: ['MSFT', 'MICROSOFT CORP'],
 10138: ['TROW', 'T ROWE PRICE GROUP INC'],
 10145: ['HON', 'HONEYWELL INTERNATIONAL INC'],
 10516: ['ADM', 'ARCHER DANIELS MIDLAND CO'],
 10696: ['FISV', 'FISERV INC'],
 11308: ['KO', 'COCA COLA CO'],
 11403: ['CDNS', 'CADENCE DESIGN SYSTEMS INC'],
 11404: ['ED', 'CONSOLIDATED EDISON INC'],
 11600: ['XRAY', 'DENTSPLY SIRONA INC'],
 11618: ['FAST', 'FASTENAL CO'],
 11674: ['DTE', 'D T E ENERGY CO'],
 11762: ['ETN', 'EATON CORP PLC'],
 11786: ['SIVB', 'S V B FINANCIAL GROUP'],
 11850: ['XOM', 'EXXON MOBIL CORP'],
 11891: ['MGM', 'M G M RESORTS INTERNATIONAL'],
 11955: ['WM', 'WASTE MANAGEMENT INC DEL'],
 12052: ['GD', 'GENERAL DYNAMICS CORP'],
 12060: ['GE', 'GENERAL ELECTRIC CO'],
 12062: ['LH', 'LABORATORY CORP AMERICA HLDGS'],
 12084: ['NXPI', 'N X P SEMICONDUCTORS N V'],
 12308: ['CHTR', 'CHARTER COMMUNICATIONS INC NEW'],
 12345: ['LYB', 'LYONDELLBASELL INDUSTRIES N V'],
 12369: ['GM', 'GENERAL MOTORS CO'],
 12431:

## Query News from Subreddits

Subreddits of interest
* r/stocks
* r/investing
* r/wallstreetbets
* r/ValueInvesting
* r/StockMarket
* r/SecurityAnalysis
* r/finance

In [9]:
subreddits = ['stocks', 'investing', 'wallstreetbets', 'ValueInvesting', 'StockMarket', 'SecurityAnalysis', 'finance']
news_df = pd.DataFrame()
for subreddit in subreddits:
    
    subred = reddit.subreddit(subreddit)
    print(f"Pulling Data From Subreddit: {subreddit}")


Pulling Data From Subreddit: stocks
Pulling Data From Subreddit: investing
Pulling Data From Subreddit: wallstreetbets
Pulling Data From Subreddit: ValueInvesting
Pulling Data From Subreddit: StockMarket
Pulling Data From Subreddit: SecurityAnalysis
Pulling Data From Subreddit: finance


In [None]:
# --- 3. DEFINE FILTERING THRESHOLDS ---
MIN_SUBMISSION_SCORE = 20       # Minimum upvotes for a post to be considered
MIN_COMMENT_SCORE = 5           # Minimum upvotes for a comment to be considered
MIN_USER_KARMA = 100            # Minimum karma for a commenter's account
MIN_ACCOUNT_AGE_DAYS = 90       # Minimum age of a commenter's account in days

scraped_data = []

In [10]:
subred = reddit.subreddit('stocks')

In [None]:
 # Perform the search for submissions within the last 5 years
# Note: Reddit search is powerful but may not be perfectly exhaustive for long timeframes.
query = f"{query_dict[all_permnos[0]][0]} OR {query_dict[all_permnos[0]][1]}"
for submission in subreddit.search(query, sort='relevance', time_filter='all', limit=100):
    
    # --- APPLY SUBMISSION FILTERS ---
    # 1. Filter by whitelisted domain
    is_whitelisted = any(domain in submission.url for domain in WHITELISTED_DOMAINS)
    # 2. Filter by score
    is_high_score = submission.score >= MIN_SUBMISSION_SCORE

    if is_whitelisted and is_high_score:
        print(f"  -> Found relevant post: '{submission.title}' in r/{sub_name}")
        
        # Fetch comments
        submission.comments.replace_more(limit=0) # Get all top-level comments
        for comment in submission.comments:
            # Avoid deleted comments or users
            if comment.author is None or comment.body == '[deleted]':
                continue

            # --- APPLY COMMENT/USER FILTERS ---
            account_age_days = (dt.datetime.now(dt.timezone.utc) - dt.datetime.fromtimestamp(comment.author.created_utc, tz=dt.timezone.utc)).days
            
            if comment.score >= MIN_COMMENT_SCORE and \
                comment.author.comment_karma >= MIN_USER_KARMA and \
                account_age_days >= MIN_ACCOUNT_AGE_DAYS:
                
                # Add the data to our list
                scraped_data.append({
                    'company': company_name,
                    'ticker': ticker,
                    'subreddit': sub_name,
                    'submission_title': submission.title,
                    'submission_score': submission.score,
                    'submission_url': submission.url,
                    'comment_body': comment.body,
                    'comment_score': comment.score,
                    'comment_author_karma': comment.author.comment_karma
                })


In [None]:
for company_name, ticker in COMPANIES.items():
    # Create a search query
    query = f'"{company_name}" OR "{ticker}"'
    print(f"--- Searching for: {query} ---")

    # Loop through each target subreddit
    for sub_name in TARGET_SUBREDDITS:
        subreddit = reddit.subreddit(sub_name)
        
        # Perform the search for submissions within the last 5 years
        # Note: Reddit search is powerful but may not be perfectly exhaustive for long timeframes.
        for submission in subreddit.search(query, sort='relevance', time_filter='all', limit=100):
            
            # --- APPLY SUBMISSION FILTERS ---
            # 1. Filter by whitelisted domain
            is_whitelisted = any(domain in submission.url for domain in WHITELISTED_DOMAINS)
            # 2. Filter by score
            is_high_score = submission.score >= MIN_SUBMISSION_SCORE

            if is_whitelisted and is_high_score:
                print(f"  -> Found relevant post: '{submission.title}' in r/{sub_name}")
                
                # Fetch comments
                submission.comments.replace_more(limit=0) # Get all top-level comments
                for comment in submission.comments:
                    # Avoid deleted comments or users
                    if comment.author is None or comment.body == '[deleted]':
                        continue

                    # --- APPLY COMMENT/USER FILTERS ---
                    account_age_days = (dt.datetime.now(dt.timezone.utc) - dt.datetime.fromtimestamp(comment.author.created_utc, tz=dt.timezone.utc)).days
                    
                    if comment.score >= MIN_COMMENT_SCORE and \
                       comment.author.comment_karma >= MIN_USER_KARMA and \
                       account_age_days >= MIN_ACCOUNT_AGE_DAYS:
                        
                        # Add the data to our list
                        scraped_data.append({
                            'company': company_name,
                            'ticker': ticker,
                            'subreddit': sub_name,
                            'submission_title': submission.title,
                            'submission_score': submission.score,
                            'submission_url': submission.url,
                            'comment_body': comment.body,
                            'comment_score': comment.score,
                            'comment_author_karma': comment.author.comment_karma
                        })
