In [1]:
from pprint import pprint
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import praw
from tqdm import tqdm 

In [2]:
user_agent = "Top topics in data science 1.0 by jonasge1992"
reddit = praw.Reddit(
    client_id="41NkDPnyuZyd9XyXLhKPvQ",
    client_secret="PHQtVGrLzYUno_hmrdp3eyv7sACmTg",
    user_agent=user_agent
)

In [3]:
# Function to fetch threads and comments from multiple subreddits with progress tracking and keyword filtering
def fetch_subreddit_data(subreddit_names, num_threads_per_subreddit, search_keywords):
    data = []
    
    # Initialize tqdm progress bar
    pbar = tqdm(total=len(subreddit_names) * num_threads_per_subreddit)
    
    for subreddit_name in subreddit_names:
        subreddit = reddit.subreddit(subreddit_name)
        threads = subreddit.hot(limit=num_threads_per_subreddit)  # Adjust as needed (top threads per subreddit)

        for thread in threads:
            # Skip live threads
            if 'live' in (thread.link_flair_text or '').lower() or 'live' in thread.title.lower():
                continue
            
            # Check if thread title or selftext contains any of the search keywords
            if any(keyword.lower() in thread.title.lower() or keyword.lower() in thread.selftext.lower() for keyword in search_keywords):
                thread.comments.replace_more(limit=None)  # Fetch all comments, including MoreComments

                for comment in thread.comments.list():
                    if isinstance(comment, praw.models.MoreComments):
                        continue  # Skip MoreComments objects

                    thread_data = {
                        'Subreddit': subreddit_name,
                        'Thread_Title': thread.title,
                        'Thread_Score': thread.score,
                        'Thread_URL': thread.url,
                        'Thread_Num_Comments': thread.num_comments,
                        'Thread_Flair': thread.link_flair_text if thread.link_flair_text else 'None',
                        'Thread_Selftext': thread.selftext,
                        'Comment_Body': comment.body,
                        'Comment_Score': comment.score,
                        'Comment_Author': comment.author.name if comment.author else '[deleted]'
                    }
                    data.append(thread_data)
                    pbar.update(1)  # Update progress bar
        
    pbar.close()  # Close progress bar after completion
    return data

# Example usage: Fetch data from multiple subreddits and filter by search keywords
subreddit_names = ['worldnews', 'Israel', 'Palestine', 'PoliticalDiscussion', 
                   'NeutralPolitics', 'MiddleEast', 'Geopolitics', 'ForeignPolicy']
num_threads_per_subreddit = 10  # Number of top threads to fetch per subreddit
search_keywords = ['Israel', 'Gaza', 'Palestine', 'Netanyahu', 'Hamas', 'Abbas', 'Bibi']  # Keywords to filter threads

subreddit_data = fetch_subreddit_data(subreddit_names, num_threads_per_subreddit, search_keywords)

# Convert to pandas DataFrame
df = pd.DataFrame(subreddit_data)

# Display the DataFrame (optional)
print(df.head())

2046it [00:26, 77.65it/s]                                                                                               

   Subreddit                              Thread_Title  Thread_Score  \
0  worldnews  US: Hamas formally rejected hostage deal          1383   
1  worldnews  US: Hamas formally rejected hostage deal          1383   
2  worldnews  US: Hamas formally rejected hostage deal          1383   
3  worldnews  US: Hamas formally rejected hostage deal          1383   
4  worldnews  US: Hamas formally rejected hostage deal          1383   

                                          Thread_URL  Thread_Num_Comments  \
0  https://www.jns.org/us-hamas-formally-rejected...                  230   
1  https://www.jns.org/us-hamas-formally-rejected...                  230   
2  https://www.jns.org/us-hamas-formally-rejected...                  230   
3  https://www.jns.org/us-hamas-formally-rejected...                  230   
4  https://www.jns.org/us-hamas-formally-rejected...                  230   

       Thread_Flair Thread_Selftext  \
0  Israel/Palestine                   
1  Israel/Palestine       




In [4]:
# Praw Documentation
# https://praw.readthedocs.io/en/stable/
# https://praw.readthedocs.io/en/latest/code_overview/models/submission.html

In [5]:
#Cleaning up of text
import string
import nltk
import re
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

#Stop words present in the library
stopwords = nltk.corpus.stopwords.words('english')

def remove_links(text):
    # Define the regex pattern for URLs
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    # Replace the URLs with an empty string
    return url_pattern.sub(r'', text)

#defining the function to remove punctuation
def clean_text(text):
    punctuation_free = "".join([i if i not in string.punctuation else ' ' for i in text])
    lowertext = punctuation_free.lower()
    text = lowertext.strip()
    text = re.sub(r'[^\w\s]',' ',text)
    text = ''.join([i for i in text if not i.isdigit()])
    text = text.replace('\n', ' ')
    text = re.sub(r'\s+', ' ', text)


    return text

def tokenization(text):
    tokens = re.split(' ',text)
    return tokens

#defining the function to remove stopwords from tokenized text
def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output

#importing the Stemming function from nltk library
from nltk.stem.porter import PorterStemmer
#defining the object for stemming
porter_stemmer = PorterStemmer()

#defining a function for stemming
def stemming(text):
    stem_text = [porter_stemmer.stem(word) for word in text]
    return stem_text
    

In [6]:
from nltk.stem import WordNetLemmatizer
#defining the object for Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()
#defining the function for lemmatization
def lemmatizer(text):
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text

In [7]:
df["Comment_Body_Cleaned"] = df["Comment_Body"].apply(lambda x: remove_links(x))

In [8]:
df["Comment_Body_Cleaned"] = df["Comment_Body_Cleaned"].apply(lambda x: clean_text(x))

In [9]:
df["Comment_Body_Tokenized"] = df["Comment_Body_Cleaned"].apply(lambda x: tokenization(x))

In [10]:
df["Comment_Body_Tokenized"] = df["Comment_Body_Tokenized"].apply(lambda x: remove_stopwords(x))

In [11]:
df["Comment_Body_Tokenized"] = df["Comment_Body_Tokenized"].apply(lambda x: lemmatizer(x))

In [12]:
df["Comment_wo_links_stopwords"] = df["Comment_Body"].apply(lambda x: remove_links(x))
df["Comment_wo_links_stopwords"] = df["Comment_Body"].apply(lambda x: clean_text(x))

In [13]:
def join_tokens(words):
    sentence = ' '.join(words)
    return sentence
df["Comment_Body_Cleaned"] = df["Comment_Body_Tokenized"].apply(lambda x: join_tokens(x))

In [14]:
counts = pd.Series(''.join(df.Comment_Body_Cleaned).split()).value_counts()
counts

israel            683
hamas             399
people            314
would             295
like              245
                 ... 
alreadyexactly      1
irlanyone           1
tale                1
handmaiden          1
sc                  1
Name: count, Length: 7867, dtype: int64

In [15]:
def reduce_sentence_length(sentence):
    if len(sentence)>512:
        sentence = sentence[:512]
    return sentence

In [16]:
df["Comment_Body_Cleaned_Shortened"] = df["Comment_Body_Cleaned"].apply(lambda x: reduce_sentence_length(x))

In [19]:
# Use a pipeline as a high-level helper
from transformers import pipeline

classifier = pipeline("text-classification", model="mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


OSError: Can't load tokenizer for 'mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis' is the correct path to a directory containing all relevant files for a RobertaTokenizerFast tokenizer.

In [None]:
result = df["Comment_Body_Cleaned_Shortened"].apply(lambda x: classifier(x))

In [None]:
result

In [None]:
# Load model directly
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from scipy.special import softmax

tokenizer = AutoTokenizer.from_pretrained("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")
model = TFAutoModelForSequenceClassification.from_pretrained("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")
model.save_pretrained("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")

df["Comment_wo_links_stopwords_shortened"] = df["Comment_wo_links_stopwords"].apply(lambda x: reduce_sentence_length(x))
df["Comment_wo_links_stopwords_shortened_tokens"] = df["Comment_wo_links_stopwords_shortened"].apply(lambda x: tokenizer(x,return_tensors="tf"))
df["Output"] = df["Comment_wo_links_stopwords_shortened_tokens"].apply(lambda x: model(x))
df["Scores"] = df["Output"].apply(lambda x: x[0][0].numpy())
df["Scores"] = df["Scores"].apply(lambda x: softmax(x))

In [None]:
# Function to get the label with the highest probability
def get_highest_label(scores):
    ranking = np.argsort(scores)[::-1]
    highest_label_index = ranking[0]
    highest_label = model.config.id2label[highest_label_index]
    return highest_label

# Apply the function to get the highest label for each row
df["Highest_Label"] = df["Scores"].apply(lambda x: get_highest_label(x))

In [None]:
df["Highest_Label"]

In [None]:
# Unwrap 'label' from each list element into a new column
df['label'] = result.apply(lambda x: x[0]['label'])

In [None]:
df["score"] = result.apply(lambda x: x[0]['score'])

In [None]:
df

In [None]:
# Save DataFrame to CSV (optional)
df.to_csv('filtered_subreddit_threads_comments.csv', index=False)