In [3]:
import pandas as pd
import praw # Python Reddit API Wrapper
import re
import truststore

## Connect to Reddit API

In [4]:
# Configuration
CLIENT_ID = 'b3CUxOmuvB9QeRasUu3Hew'
CLIENT_SECRET = 'CqqnRJucVB500qgPSWTH9wHHx2acQQ'
USER_AGENT = 'script:my_reddit_script:v1.0 (by /u/py_dev684)'


truststore.inject_into_ssl()

# Initialize the Reddit client
reddit = praw.Reddit(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    user_agent=USER_AGENT
)

## Identify and Extract Reddit Data

### Extract comments

In [52]:
def fetch_comments_with_phrases(subreddits, patterns, comment_target=20):
    """
    Fetch comments that contain a target phrase, sourced from 'new' posts from specified subreddits. Comments are sourced from new posts in each subreddit.

    Parameters
    ----------
    subreddits: list
        A list of subreddits to search
    patterns: list
        A list of search terms (regex)
    comment_target: int
        Stop collecting comments once X comments have been found containing a term from patterns.

    Returns
    -------
    df: pd.DataFrame
        A DataFrame of collected comments.
    """

    data = []  # List to store data before converting to DataFrame

    for subreddit_name in subreddits:
        print(f"Fetching from r/{subreddit_name}...")
        subreddit = reddit.subreddit(subreddit_name)

        post_counter = 0
        comment_counter = 0
        while len(data) < comment_target:
            for submission in subreddit.new():
                post_counter += 1
                submission.comments.replace_more(limit=None)  # Replace "MoreComments" with actual comments
                # Filter comments based on presence of target phrases
                for comment in submission.comments.list():
                    comment_counter += 1
                    matched_pattern = next((pattern for pattern in patterns if re.search(pattern, comment.body, re.IGNORECASE)), None)
                    if matched_pattern:
                        data.append({
                            'Subreddit': subreddit_name,
                            'Post Title': submission.title,
                            'Comment Author': str(comment.author),
                            'Comment': comment.body,
                            'Matched Phrase': matched_pattern, 
                            'Upvotes': comment.score
                        })

                    if len(data) >= comment_target:  # If we've hit our comment target, break out
                        break
                print(f"Posts checked: {post_counter}")            
                print(f"Comments checked: {comment_counter}")
                print(f"Relevant comments: {len(data)}")
                
                if len(data) >= comment_target:  # If we've hit our comment target, stop processing
                    break
    print(f"----")
    print(f"Total posts checked: {post_counter}")
    print(f"Total comments checked: {comment_counter}")          
    print(f"Total comments collected: {len(data)}")

    # Convert the data list into a DataFrame
    df = pd.DataFrame(data)

    return df


In [53]:
subreddits = ['finance']
patterns = [r'\bHSBC\b', r'\bCiti\b', r'\bNatWest\b', r'\bCoutts\b', r'\bLloyds\b', r'\bBarclays\b', r'\bStandard\s+Chartered\b', r'\bSantander\b', r'\bBank\s+of\s+England\b', r'\bBoE\b', r'\bGoldman\s+Sachs\b', r'\bMorgan\s+Stanley\b', r'\bSilicon\s+Valley\s+Bank\b', r'\bSVB\b', r'\bCredit\s+Suisse\b', r'\bHalifax\b', r'/bInvestec/b', r'\bVirgin\s+Money\b']

comments = fetch_comments_with_phrases(subreddits, patterns, comment_target=10)

Fetching from r/finance...
Posts checked: 1
Comments checked: 0
Relevant comments: 0
Posts checked: 2
Comments checked: 21
Relevant comments: 0
Posts checked: 3
Comments checked: 36
Relevant comments: 0
Posts checked: 4
Comments checked: 47
Relevant comments: 0
Posts checked: 5
Comments checked: 156
Relevant comments: 0
Posts checked: 6
Comments checked: 193
Relevant comments: 2
Posts checked: 7
Comments checked: 196
Relevant comments: 2
Posts checked: 8
Comments checked: 209
Relevant comments: 2
Posts checked: 9
Comments checked: 209
Relevant comments: 2
Posts checked: 10
Comments checked: 258
Relevant comments: 2
Posts checked: 11
Comments checked: 337
Relevant comments: 8
Posts checked: 12
Comments checked: 348
Relevant comments: 8
Posts checked: 13
Comments checked: 373
Relevant comments: 8
Posts checked: 14
Comments checked: 470
Relevant comments: 8
Posts checked: 15
Comments checked: 517
Relevant comments: 8
Posts checked: 16
Comments checked: 558
Relevant comments: 8
Posts check

In [54]:
comments.head(10)

Unnamed: 0,Subreddit,Post Title,Comment Author,Comment,Matched Phrase,Upvotes
0,finance,Why Wall Street Is Suddenly Having an Everythi...,savagepanda,It’s odd how fed suddenly became dovish. They...,\bSVB\b,10
1,finance,Why Wall Street Is Suddenly Having an Everythi...,TokyoSxWhale,The BTFP expires March 11. If they don’t raise...,\bSVB\b,7
2,finance,Wall Street CEOs say proposed banking rules wi...,Omnipotent-Ape,"In your reply, you blame regulators for stifli...",\bSVB\b,11
3,finance,Wall Street CEOs say proposed banking rules wi...,Bocifer1,Right but just letting the industry regulate i...,\bSVB\b,5
4,finance,Wall Street CEOs say proposed banking rules wi...,Capadvantagetutoring,Let’s clear that up. SVB Fucked up. They didn’...,\bSVB\b,1
5,finance,Wall Street CEOs say proposed banking rules wi...,feelings_arent_facts,SVB literally collapsed because of a lack of c...,\bSVB\b,6
6,finance,Wall Street CEOs say proposed banking rules wi...,TaxGuy_021,"No?\n\nHistorically, bank runs happened due to...",\bSVB\b,1
7,finance,Wall Street CEOs say proposed banking rules wi...,Kevstuf,"From what I understand, SVB did not hedge thei...",\bSVB\b,0
8,finance,I'm a reporter at Bloomberg News. We just publ...,jigsaw_faust,I work for a credit union and it’s been crazy ...,\bSantander\b,22
9,finance,"Strip Clubs, Lewd Photos and a Boozy Hotel: Th...",OceanofChoco,"I agree mostly, I suppose it really depends on...",\bGoldman\s+Sachs\b,1


### Extract Posts

In [59]:
def fetch_posts_with_phrases(subreddits, patterns, post_target=20):
    """
    Fetch posts that contain a target phrase, sourced from 'new' posts from specified subreddits.

    Parameters
    ----------
    subreddits: list
        A list of subreddits to search
    patterns: list
        A list of search terms (regex)
    posts_target: int
        Stop collecting posts once X comments have been found containing a term from patterns.

    Returns
    -------
    df: pd.DataFrame
        A DataFrame of collected posts.
    """

    data = []  # List to store data before converting to DataFrame
    post_counter = 0
    
    while len(data) < post_target:
        
        for subreddit_name in subreddits:
            print(f"Fetching from r/{subreddit_name}...")
            subreddit = reddit.subreddit(subreddit_name)

            for submission in subreddit.new():
                post_counter += 1
                matched_pattern = next((pattern for pattern in patterns if re.search(pattern, submission.selftext, re.IGNORECASE)), None)
                
                if matched_pattern:
                    data.append({
                        'Subreddit': subreddit_name,
                        'Post Title': submission.title,
                        'Post Author': str(submission.author),
                        'Post': submission.selftext,
                        'Matched Phrase': matched_pattern, 
                        'Upvotes': submission.score
                    })

                print(f"Posts checked: {post_counter}")            
                print(f"Relevant posts: {len(data)}")
                
                if len(data) >= post_target:  # If we've hit our comment target, stop processing
                    break
    print(f"----")
    print(f"Total posts checked: {post_counter}")
    print(f"Total posts collected: {len(data)}")

    # Convert the data list into a DataFrame
    df = pd.DataFrame(data)

    return df



In [62]:
subreddits = ['finance', 'personalfinance', 'UKPersonalFinance']
patterns = [r'\bHSBC\b', r'\bCiti\b', r'\bNatWest\b', r'\bCoutts\b', r'\bLloyds\b', r'\bBarclays\b', r'\bStandard\s+Chartered\b', r'\bSantander\b', r'\bBank\s+of\s+England\b', r'\bBoE\b', r'\bGoldman\s+Sachs\b', r'\bMorgan\s+Stanley\b', r'\bSilicon\s+Valley\s+Bank\b', r'\bSVB\b', r'\bCredit\s+Suisse\b', r'\bHalifax\b', r'\bStandard\s+Chartered\b', r'/bInvestec/b', r'\bVirgin\s+Money\b']

posts = fetch_posts_with_phrases(subreddits, patterns, post_target=5)

Fetching from r/finance...
Posts checked: 1
Relevant posts: 0
Posts checked: 2
Relevant posts: 0
Posts checked: 3
Relevant posts: 0
Posts checked: 4
Relevant posts: 0
Posts checked: 5
Relevant posts: 0
Posts checked: 6
Relevant posts: 0
Posts checked: 7
Relevant posts: 0
Posts checked: 8
Relevant posts: 0
Posts checked: 9
Relevant posts: 0
Posts checked: 10
Relevant posts: 0
Posts checked: 11
Relevant posts: 1
Posts checked: 12
Relevant posts: 1
Posts checked: 13
Relevant posts: 1
Posts checked: 14
Relevant posts: 1
Posts checked: 15
Relevant posts: 1
Posts checked: 16
Relevant posts: 1
Posts checked: 17
Relevant posts: 1
Posts checked: 18
Relevant posts: 1
Posts checked: 19
Relevant posts: 1
Posts checked: 20
Relevant posts: 1
Posts checked: 21
Relevant posts: 1
Posts checked: 22
Relevant posts: 1
Posts checked: 23
Relevant posts: 1
Posts checked: 24
Relevant posts: 1
Posts checked: 25
Relevant posts: 1
Posts checked: 26
Relevant posts: 1
Posts checked: 27
Relevant posts: 1
Posts chec

In [63]:
posts.head(5)

Unnamed: 0,Subreddit,Post Title,Post Author,Post,Matched Phrase,Upvotes
0,finance,Wall Street CEOs say proposed banking rules wi...,ethereal3xp,Wall Street CEOs on Wednesday pushed back agai...,\bGoldman\s+Sachs\b,285
1,personalfinance,Question About Auto Refinance,iguana_lover420,Honestly I couldn't find a relevant subreddit ...,\bSantander\b,1
2,personalfinance,Getting Myself Out Of This Hole,traitornation,This group has helped me a lot and helped me o...,\bBarclays\b,1
3,UKPersonalFinance,Is there a better way to save my money?,KiloRGB,"Hi all,\n\nCurrently saving up to pay my car o...",\bLloyds\b,1
4,UKPersonalFinance,Made a mistake with a balance transfer credit ...,EntranceTiny6943,"Hi, \n\nI have recently taken on a new credit ...",\bHSBC\b,0


# Sentiment Analysis

## NLTK Vader

In [56]:
# Load sentiment analyser
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')

# Initialise VADER
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/jackwalker/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [57]:
# Apply VADER analysis on text column
df['sentiment_scores'] = df['Comment'].apply(lambda x: sia.polarity_scores(x))
df['compound'] = df['sentiment_scores'].apply(lambda score_dict: score_dict['compound'])
df['sentiment'] = df['compound'].apply(lambda c: 'POSITIVE' if c >= 0.05 else ('NEGATIVE' if c <= -0.05 else 'NEUTRAL'))
df.head(20)

Unnamed: 0,Subreddit,Post Title,Comment Author,Comment,Matched Phrase,Upvotes,sentiment_scores,compound,sentiment
0,finance,‘Almost All Loans Are Bad’—Why Banks Aren’t Le...,hcbaron,Extracted article:\n\nBanks would love to lend...,\bBarclays\b,13,"{'neg': 0.049, 'neu': 0.799, 'pos': 0.152, 'co...",0.9977,POSITIVE
1,finance,"Moronic Monday - September 05, 2023 - Your Wee...",14446368,Could we un-pin the SVB thing now?,\bSVB\b,1,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,NEUTRAL
2,finance,Bloomberg overhauls management team with Mark ...,marketrent,Per an internal memo sent by founder Mike Bloo...,\bBank\s+of\s+England\b,2,"{'neg': 0.021, 'neu': 0.941, 'pos': 0.037, 'co...",0.186,POSITIVE
3,finance,[Bloomberg] New York and California Each Lost ...,mzachi,here's some quotes from the article for those ...,\bGoldman\s+Sachs\b,34,"{'neg': 0.032, 'neu': 0.916, 'pos': 0.051, 'co...",0.6953,POSITIVE
4,finance,Is David Solomon Too Big a Jerk to Run Goldman...,MartianActual,Counterpoint: What other kind of person would ...,\bGoldman\s+Sachs\b,18,"{'neg': 0.179, 'neu': 0.821, 'pos': 0.0, 'comp...",-0.34,NEGATIVE
5,finance,"Moody's downgrades US banks, warns of possible...",bellayang1216,Based on the information provided:\r \n\r \n...,\bSilicon\s+Valley\s+Bank\b,2,"{'neg': 0.129, 'neu': 0.781, 'pos': 0.09, 'com...",-0.9558,NEGATIVE
6,finance,Why the US is interested in audits of Chinese ...,asuka_rice,"Plenty of lemons everywhere.\n\nCS, SVB and FT...",\bSVB\b,2,"{'neg': 0.129, 'neu': 0.72, 'pos': 0.151, 'com...",0.2249,POSITIVE
7,finance,‘Success fees’ and thirsty emails: inside a $9...,FishFar4370,Musk's desire to have an emotional tantrum and...,\bGoldman\s+Sachs\b,6,"{'neg': 0.038, 'neu': 0.829, 'pos': 0.134, 'co...",0.8981,POSITIVE
8,finance,‘Almost All Loans Are Bad’—Why Banks Aren’t Le...,hcbaron,Extracted article:\n\nBanks would love to lend...,\bBarclays\b,13,"{'neg': 0.049, 'neu': 0.799, 'pos': 0.152, 'co...",0.9977,POSITIVE
9,finance,"Moronic Monday - September 05, 2023 - Your Wee...",14446368,Could we un-pin the SVB thing now?,\bSVB\b,1,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,NEUTRAL


In [58]:
df.to_csv('reddit_comments_sentiment.csv', index=False)

## FinBERT

In [2]:
from transformers import BertTokenizer, BertForSequenceClassification

finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone', num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

(…)kust/finbert-tone/resolve/main/vocab.txt: 100%|██████████| 226k/226k [00:00<00:00, 8.84MB/s]
