In [18]:
import os

import pandas as pd
import praw # Python Reddit API Wrapper

In [19]:
# Configuration
# Reddit API credentials are read from the environment so that they are never
# stored in (or committed with) the notebook. Export them before starting the
# kernel, e.g.:
#   export REDDIT_CLIENT_ID=...
#   export REDDIT_CLIENT_SECRET=...
# NOTE: the client ID/secret that used to be hardcoded in this cell should be
# treated as compromised and regenerated in the Reddit app preferences.
def _require_env(name):
    """Return the value of environment variable `name`, or raise if it is unset/empty."""
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} is not set; "
            "it is required to authenticate against the Reddit API."
        )
    return value


CLIENT_ID = _require_env('REDDIT_CLIENT_ID')
CLIENT_SECRET = _require_env('REDDIT_CLIENT_SECRET')
# The user agent is not a secret, so the previous value is kept as the default.
USER_AGENT = os.environ.get(
    'REDDIT_USER_AGENT',
    'script:my_reddit_script:v1.0 (by /u/py_dev684)',
)

# Initialize the Reddit client
reddit = praw.Reddit(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    user_agent=USER_AGENT
)

In [23]:
def fetch_comments_with_phrases(subreddits, phrases, comment_target=20):
    """Collect comments that mention any target phrase from each subreddit's newest posts.

    Posts are read from `subreddit.new(limit=None)` (i.e. the 'new' listing,
    not 'hot'), and a comment is kept when any phrase occurs in its body as a
    case-insensitive substring.

    NOTE: a later cell defines a function with the same name; whichever of the
    two cells was executed last is the definition that is actually used.

    Args:
        subreddits: iterable of subreddit names (without the ``r/`` prefix).
        phrases: iterable of strings to look for in each comment body.
        comment_target: total number of matching comments to collect across
            all subreddits; collection stops as soon as it is reached.

    Returns:
        pandas.DataFrame with one row per matching comment and the columns
        'Subreddit', 'Post Title', 'Comment Author', 'Comment', 'Upvotes'.
        The columns are present even when nothing matched.
    """
    columns = ['Subreddit', 'Post Title', 'Comment Author', 'Comment', 'Upvotes']

    # Lower-case the phrases once instead of once per comment.
    needles = [phrase.lower() for phrase in phrases]

    data = []  # Row dicts, converted to a DataFrame at the end

    for subreddit_name in subreddits:
        # The target is shared by all subreddits: once it is met there is no
        # reason to contact the API for the remaining ones.
        if len(data) >= comment_target:
            break

        print(f"Fetching from r/{subreddit_name}...")
        subreddit = reddit.subreddit(subreddit_name)

        for post in subreddit.new(limit=None):  # Iterating through the newest posts
            post.comments.replace_more(limit=None)  # Replace "MoreComments" with actual comments

            # NOTE(review): iterating `post.comments` directly appears to
            # yield top-level comments only in PRAW; confirm whether replies
            # should be searched too (`post.comments.list()`).
            for comment in post.comments:
                body_lower = comment.body.lower()
                if not any(needle in body_lower for needle in needles):
                    continue

                data.append({
                    'Subreddit': subreddit_name,
                    'Post Title': post.title,
                    'Comment Author': str(comment.author),
                    'Comment': comment.body,
                    'Upvotes': comment.score
                })

                if len(data) >= comment_target:  # Target hit: stop scanning this post
                    break

            if len(data) >= comment_target:  # Target hit: stop scanning this subreddit
                break

    print("Comments collected.")

    # Passing `columns` keeps the schema stable when `data` is empty, so
    # downstream cells can still reference df['Comment'] etc.
    return pd.DataFrame(data, columns=columns)

In [36]:
def fetch_comments_with_phrases(subreddits, phrases, comment_target=20):
    """Collect comments that mention any target phrase from each subreddit's newest posts.

    Posts come from the 'new' listing, i.e. in the order they were posted.
    Each subreddit's listing is walked exactly once; the previous version
    re-requested the same 100 newest posts until the target was met, which
    duplicated matches and never terminated when there were none.

    A comment is kept when any phrase occurs in its body as a case-insensitive
    substring, so a short phrase also matches longer words that contain it.

    Args:
        subreddits: iterable of subreddit names (without the ``r/`` prefix).
        phrases: iterable of strings to look for in each comment body.
        comment_target: total number of matching comments to collect across
            all subreddits; collection stops as soon as it is reached. Fewer
            rows are returned when the listings run out first.

    Returns:
        pandas.DataFrame with one row per matching comment and the columns
        'Subreddit', 'Post Title', 'Comment Author', 'Comment', 'Upvotes'.
        The columns are present even when nothing matched.
    """
    columns = ['Subreddit', 'Post Title', 'Comment Author', 'Comment', 'Upvotes']

    # Lower-case (and de-duplicate) the phrases once instead of per comment.
    needles = list(dict.fromkeys(phrase.lower() for phrase in phrases))

    data = []  # Row dicts, converted to a DataFrame at the end

    for subreddit_name in subreddits:
        # The target is shared by all subreddits: once it is met there is no
        # reason to contact the API for the remaining ones.
        if len(data) >= comment_target:
            break

        print(f"Fetching from r/{subreddit_name}...")
        subreddit = reddit.subreddit(subreddit_name)

        comment_counter = 0  # Comments inspected in this subreddit so far

        # limit=None asks PRAW for as many posts as the listing provides and
        # lets it handle the paging; the loop ends when the listing does.
        for post in subreddit.new(limit=None):
            post.comments.replace_more(limit=None)  # Replace "MoreComments" with actual comments

            # NOTE(review): iterating `post.comments` directly appears to
            # yield top-level comments only in PRAW; confirm whether replies
            # should be searched too (`post.comments.list()`).
            for comment in post.comments:
                comment_counter += 1
                body_lower = comment.body.lower()
                if any(needle in body_lower for needle in needles):
                    data.append({
                        'Subreddit': subreddit_name,
                        'Post Title': post.title,
                        'Comment Author': str(comment.author),
                        'Comment': comment.body,
                        'Upvotes': comment.score
                    })

                if len(data) >= comment_target:  # If we've hit our comment target, break out
                    break

            print(f"Comments checked: {comment_counter}")
            print(f"Relevant comments: {len(data)}")

            if len(data) >= comment_target:  # If we've hit our comment target, stop processing
                break

    print(f"{len(data)} comments collected.")

    # Passing `columns` keeps the schema stable when `data` is empty, so
    # downstream cells can still reference df['Comment'] etc.
    return pd.DataFrame(data, columns=columns)


In [38]:
subreddits = ['finance']

# fetch_comments_with_phrases lower-cases both the phrase and the comment body,
# so each phrase only needs to be listed once, in any casing.
# Caveat: the match is a plain substring test, so short entries such as
# 'citi' or 'BoE' also hit longer words that happen to contain them.
target_phrases = ['hsbc', 'citi', 'natwest', 'lloyds', 'Bank of England', 'BoE']

df = fetch_comments_with_phrases(subreddits, target_phrases)

Fetching from r/finance...
Comments checked: 0
Relevant comments: 0
Comments checked: 31
Relevant comments: 1
Comments checked: 40
Relevant comments: 1
Comments checked: 43
Relevant comments: 1
Comments checked: 60
Relevant comments: 1
Comments checked: 62
Relevant comments: 1
Comments checked: 64
Relevant comments: 1
Comments checked: 68
Relevant comments: 1
Comments checked: 81
Relevant comments: 1
Comments checked: 87
Relevant comments: 1
Comments checked: 147
Relevant comments: 1
Comments checked: 152
Relevant comments: 1
Comments checked: 163
Relevant comments: 1
Comments checked: 169
Relevant comments: 1
Comments checked: 174
Relevant comments: 1
Comments checked: 190
Relevant comments: 1
Comments checked: 191
Relevant comments: 1
Comments checked: 197
Relevant comments: 1
Comments checked: 199
Relevant comments: 2
Comments checked: 210
Relevant comments: 2
Comments checked: 213
Relevant comments: 2
Comments checked: 218
Relevant comments: 2
Comments checked: 234
Relevant comment

In [39]:
# Preview the collected comments (at most the first 20 rows).
df.head(n=20)

Unnamed: 0,Subreddit,Post Title,Comment Author,Comment,Upvotes
0,finance,Powell says inflation is still too high and lo...,esp211,Meanwhile corporations are price gauging consu...,38
1,finance,‘Almost All Loans Are Bad’—Why Banks Aren’t Le...,hcbaron,Extracted article:\n\nBanks would love to lend...,12
2,finance,Secret paper trail reveals hidden Adani invest...,issac_hunt1,The Adani scam is propped up directly by Modi....,61
3,finance,‘It’s a trap!’ The Great Euro Conspiracy Theory,DragonStreamline,With the right regulation cryptocurrency is th...,-2
4,finance,Bloomberg overhauls management team with Mark ...,marketrent,Per an internal memo sent by founder Mike Bloo...,2
5,finance,[Bloomberg] New York and California Each Lost ...,Ambiti0nZ-,While Dallas & Salt Lake City are developing n...,19
6,finance,China cuts key interest rate as retail sales a...,rickrich01,So happy to see the China's policies are a tot...,11
7,finance,China’s property crisis deepens as another hug...,marketrent,"Country Garden, the biggest privately owned de...",10
8,finance,"Moody's downgrades US banks, warns of possible...",bellayang1216,Based on the information provided:\r \n\r \n...,2
9,finance,Fitch downgrades US long-term credit rating to...,AltoidStrong,\n\n\nThis pattern has happened to many tim...,20


# Sentiment Analysis

In [40]:
# Load sentiment analyser
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Only download the VADER lexicon when it is not already installed, so the
# notebook can be re-run without network access. quiet=True keeps the
# downloader's log (which includes the local nltk_data path) out of the output.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon', quiet=True)

# Initialise VADER
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/jackwalker/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [41]:
# Score every comment with VADER, then bucket the compound score into a label.
def _label_from_compound(compound):
    """Map a compound score to 'POSITIVE' (>= 0.05), 'NEGATIVE' (<= -0.05) or 'NEUTRAL'."""
    if compound >= 0.05:
        return 'POSITIVE'
    if compound <= -0.05:
        return 'NEGATIVE'
    return 'NEUTRAL'


# Full polarity dict per comment; the 'compound' entry is pulled out below.
df['sentiment_scores'] = df['Comment'].map(sia.polarity_scores)
df['compound'] = df['sentiment_scores'].map(lambda scores: scores['compound'])
df['sentiment'] = df['compound'].map(_label_from_compound)
df.head(20)

Unnamed: 0,Subreddit,Post Title,Comment Author,Comment,Upvotes,sentiment_scores,compound,sentiment
0,finance,Powell says inflation is still too high and lo...,esp211,Meanwhile corporations are price gauging consu...,38,"{'neg': 0.271, 'neu': 0.658, 'pos': 0.071, 'co...",-0.7543,NEGATIVE
1,finance,‘Almost All Loans Are Bad’—Why Banks Aren’t Le...,hcbaron,Extracted article:\n\nBanks would love to lend...,12,"{'neg': 0.049, 'neu': 0.799, 'pos': 0.152, 'co...",0.9977,POSITIVE
2,finance,Secret paper trail reveals hidden Adani invest...,issac_hunt1,The Adani scam is propped up directly by Modi....,61,"{'neg': 0.188, 'neu': 0.742, 'pos': 0.069, 'co...",-0.987,NEGATIVE
3,finance,‘It’s a trap!’ The Great Euro Conspiracy Theory,DragonStreamline,With the right regulation cryptocurrency is th...,-2,"{'neg': 0.113, 'neu': 0.799, 'pos': 0.088, 'co...",-0.8208,NEGATIVE
4,finance,Bloomberg overhauls management team with Mark ...,marketrent,Per an internal memo sent by founder Mike Bloo...,2,"{'neg': 0.021, 'neu': 0.941, 'pos': 0.037, 'co...",0.186,POSITIVE
5,finance,[Bloomberg] New York and California Each Lost ...,Ambiti0nZ-,While Dallas & Salt Lake City are developing n...,19,"{'neg': 0.063, 'neu': 0.841, 'pos': 0.096, 'co...",0.6956,POSITIVE
6,finance,China cuts key interest rate as retail sales a...,rickrich01,So happy to see the China's policies are a tot...,11,"{'neg': 0.292, 'neu': 0.614, 'pos': 0.094, 'co...",-0.8121,NEGATIVE
7,finance,China’s property crisis deepens as another hug...,marketrent,"Country Garden, the biggest privately owned de...",10,"{'neg': 0.086, 'neu': 0.859, 'pos': 0.055, 'co...",-0.5267,NEGATIVE
8,finance,"Moody's downgrades US banks, warns of possible...",bellayang1216,Based on the information provided:\r \n\r \n...,2,"{'neg': 0.129, 'neu': 0.781, 'pos': 0.09, 'com...",-0.9558,NEGATIVE
9,finance,Fitch downgrades US long-term credit rating to...,AltoidStrong,\n\n\nThis pattern has happened to many tim...,20,"{'neg': 0.15, 'neu': 0.705, 'pos': 0.146, 'com...",0.5792,POSITIVE


In [17]:
# Write the scored comments to a CSV file in the working directory,
# leaving out the DataFrame index.
df.to_csv(path_or_buf='reddit_comments_sentiment.csv', index=False)