In [1]:
from pprint import pprint
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import praw
from tqdm import tqdm 

In [2]:
user_agent = "Top topics in data science 1.0 by jonasge1992"
reddit = praw.Reddit(
    client_id="41NkDPnyuZyd9XyXLhKPvQ",
    client_secret="PHQtVGrLzYUno_hmrdp3eyv7sACmTg",
    user_agent=user_agent
)

In [3]:
# Function to fetch threads and comments from multiple subreddits with progress tracking and keyword filtering
def fetch_subreddit_data(subreddit_names, num_threads_per_subreddit, search_keywords):
    data = []
    
    # Initialize tqdm progress bar
    pbar = tqdm(total=len(subreddit_names) * num_threads_per_subreddit)
    
    for subreddit_name in subreddit_names:
        subreddit = reddit.subreddit(subreddit_name)
        threads = subreddit.hot(limit=num_threads_per_subreddit)  # Adjust as needed (top threads per subreddit)

        for thread in threads:
            # Skip live threads
            if 'live' in (thread.link_flair_text or '').lower() or 'live' in thread.title.lower():
                continue
            
            # Check if thread title or selftext contains any of the search keywords
            if any(keyword.lower() in thread.title.lower() or keyword.lower() in thread.selftext.lower() for keyword in search_keywords):
                thread.comments.replace_more(limit=None)  # Fetch all comments, including MoreComments

                for comment in thread.comments.list():
                    if isinstance(comment, praw.models.MoreComments):
                        continue  # Skip MoreComments objects

                    thread_data = {
                        'Subreddit': subreddit_name,
                        'Thread_Title': thread.title,
                        'Thread_Score': thread.score,
                        'Thread_URL': thread.url,
                        'Thread_Num_Comments': thread.num_comments,
                        'Thread_Flair': thread.link_flair_text if thread.link_flair_text else 'None',
                        'Thread_Selftext': thread.selftext,
                        'Comment_Body': comment.body,
                        'Comment_Score': comment.score,
                        'Comment_Author': comment.author.name if comment.author else '[deleted]'
                    }
                    data.append(thread_data)
                    pbar.update(1)  # Update progress bar
        
    pbar.close()  # Close progress bar after completion
    return data

# Example usage: Fetch data from multiple subreddits and filter by search keywords
subreddit_names = ['worldnews', 'Israel', 'Palestine', 'PoliticalDiscussion', 
                   'NeutralPolitics', 'MiddleEast', 'Geopolitics', 'ForeignPolicy']
num_threads_per_subreddit = 10  # Number of top threads to fetch per subreddit
search_keywords = ['Israel', 'Gaza', 'Palestine', 'Netanyahu', 'Hamas', 'Abbas', 'Bibi']  # Keywords to filter threads

subreddit_data = fetch_subreddit_data(subreddit_names, num_threads_per_subreddit, search_keywords)

# Convert to pandas DataFrame
df = pd.DataFrame(subreddit_data)

# Display the DataFrame (optional)
print(df.head())

1698it [00:25, 66.41it/s]                                                                                               

   Subreddit                              Thread_Title  Thread_Score  \
0  worldnews  US: Hamas formally rejected hostage deal          3371   
1  worldnews  US: Hamas formally rejected hostage deal          3371   
2  worldnews  US: Hamas formally rejected hostage deal          3371   
3  worldnews  US: Hamas formally rejected hostage deal          3371   
4  worldnews  US: Hamas formally rejected hostage deal          3371   

                                          Thread_URL  Thread_Num_Comments  \
0  https://www.jns.org/us-hamas-formally-rejected...                  523   
1  https://www.jns.org/us-hamas-formally-rejected...                  523   
2  https://www.jns.org/us-hamas-formally-rejected...                  523   
3  https://www.jns.org/us-hamas-formally-rejected...                  523   
4  https://www.jns.org/us-hamas-formally-rejected...                  523   

       Thread_Flair Thread_Selftext  \
0  Israel/Palestine                   
1  Israel/Palestine       




In [4]:
# Praw Documentation
# https://praw.readthedocs.io/en/stable/
# https://praw.readthedocs.io/en/latest/code_overview/models/submission.html

In [5]:
#Cleaning up of text
import string
import nltk
import re
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

#Stop words present in the library
stopwords = nltk.corpus.stopwords.words('english')

def remove_links(text):
    # Define the regex pattern for URLs
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    # Replace the URLs with an empty string
    return url_pattern.sub(r'', text)

#defining the function to remove punctuation
def clean_text(text):
    punctuation_free = "".join([i if i not in string.punctuation else ' ' for i in text])
    lowertext = punctuation_free.lower()
    text = lowertext.strip()
    text = re.sub(r'[^\w\s]',' ',text)
    text = ''.join([i for i in text if not i.isdigit()])
    text = text.replace('\n', ' ')
    text = re.sub(r'\s+', ' ', text)


    return text

def tokenization(text):
    tokens = re.split(' ',text)
    return tokens

#defining the function to remove stopwords from tokenized text
def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output

#importing the Stemming function from nltk library
from nltk.stem.porter import PorterStemmer
#defining the object for stemming
porter_stemmer = PorterStemmer()

#defining a function for stemming
def stemming(text):
    stem_text = [porter_stemmer.stem(word) for word in text]
    return stem_text
    

In [6]:
from nltk.stem import WordNetLemmatizer
#defining the object for Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()
#defining the function for lemmatization
def lemmatizer(text):
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text

In [7]:
df["Comment_Body_Cleaned"] = df["Comment_Body"].apply(lambda x: remove_links(x))

In [8]:
df["Comment_Body_Cleaned"] = df["Comment_Body_Cleaned"].apply(lambda x: clean_text(x))

In [9]:
df["Comment_Body_Tokenized"] = df["Comment_Body_Cleaned"].apply(lambda x: tokenization(x))

In [10]:
df["Comment_Body_Tokenized"] = df["Comment_Body_Tokenized"].apply(lambda x: remove_stopwords(x))

In [11]:
df["Comment_Body_Tokenized"] = df["Comment_Body_Tokenized"].apply(lambda x: lemmatizer(x))

In [12]:
df["Comment_wo_links_stopwords"] = df["Comment_Body"].apply(lambda x: remove_links(x))
df["Comment_wo_links_stopwords"] = df["Comment_Body"].apply(lambda x: clean_text(x))

In [13]:
def join_tokens(words):
    sentence = ' '.join(words)
    return sentence
df["Comment_Body_Cleaned"] = df["Comment_Body_Tokenized"].apply(lambda x: join_tokens(x))

In [18]:
from transformers import pipeline 
import torch 

device = "cuda" if torch.cuda.is_available() else "cpu" 
classifier = pipeline("zero-shot-classification", 
                      model="facebook/bart-large-mnli", 
                      device=device) 

  _torch_pytree._register_pytree_node(
2024-06-26 18:20:18.027072: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  _torch_pytree._register_pytree_node(


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [32]:
candidate_labels = ["Pro-Israel, Anti-Palestine","Pro-Palestine, Anti-Israel","Neutral between Palestine-Israel","Not relevant to the Israel-Palestine conflict"]

In [33]:
df["Combined"] = "Title of post: " + df["Thread_Title"] + ". Comment to be analyzed: " + df["Comment_Body"]

In [36]:
df["Combined"][3]

'Title of post: US: Hamas formally rejected hostage deal. Comment to be analyzed: Terrorists being terrorists….\n\nShocking'

In [35]:
classifier(df["Combined"][2], candidate_labels)

{'sequence': "Title of post: US: Hamas formally rejected hostage deal. Comment to be analyzed: It's almost like hamas was the problem all along.",
 'labels': ['Pro-Palestine, Anti-Israel',
  'Neutral between Palestine-Israel',
  'Pro-Israel, Anti-Palestine',
  'Not relevant to the Israel-Palestine conflict'],
 'scores': [0.5397629737854004,
  0.22978489100933075,
  0.19806399941444397,
  0.032388243824243546]}

In [37]:
newdf = pd.DataFrame()

In [38]:
newdf["Combined"] = df["Combined"]

In [40]:
results = newdf["Combined"].apply(lambda x: classifier(x, candidate_labels))

In [41]:
# Initialize lists to store data
sequences = []
best_labels = []

# Iterate through each item
for item in results:
    # Find the index of the label with the highest score
    best_index = max(range(len(item['labels'])), key=lambda i: item['scores'][i])
    
    # Append sequence and best label to lists
    sequences.append(item['sequence'])
    best_labels.append(item['labels'][best_index])

# Create a DataFrame
new_df = pd.DataFrame({
    'Sequence': sequences,
    'Best Label': best_labels
})

In [42]:
# Save DataFrame to CSV (optional)
new_df.to_csv('combined_text_zero_shot_classification.csv', index=False)

In [14]:
counts = pd.Series(''.join(df.Comment_Body_Cleaned).split()).value_counts()
counts

israel          692
hamas           405
people          318
would           295
like            249
               ... 
sudden            1
antisemites       1
cautiontrump      1
damaging          1
sc                1
Name: count, Length: 7964, dtype: int64

In [15]:
def reduce_sentence_length(sentence):
    if len(sentence)>512:
        sentence = sentence[:512]
    return sentence

In [16]:
df["Comment_Body_Cleaned_Shortened"] = df["Comment_Body_Cleaned"].apply(lambda x: reduce_sentence_length(x))

In [27]:
# Use a pipeline as a high-level helper
from transformers import pipeline

classifier = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment-latest")

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
result = df["Comment_Body_Cleaned_Shortened"].apply(lambda x: classifier(x))

In [29]:
result

0       [{'label': 'neutral', 'score': 0.3826168775558...
1       [{'label': 'neutral', 'score': 0.3778409957885...
2       [{'label': 'neutral', 'score': 0.3529797792434...
3       [{'label': 'neutral', 'score': 0.3902023732662...
4       [{'label': 'neutral', 'score': 0.4040221571922...
                              ...                        
2084    [{'label': 'neutral', 'score': 0.3796155154705...
2085    [{'label': 'positive', 'score': 0.358612775802...
2086    [{'label': 'neutral', 'score': 0.3696939647197...
2087    [{'label': 'positive', 'score': 0.361505061388...
2088    [{'label': 'neutral', 'score': 0.3721854388713...
Name: Comment_Body_Cleaned_Shortened, Length: 2089, dtype: object

In [30]:
# Load model directly
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from scipy.special import softmax

tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
model = TFAutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
model.save_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

df["Comment_wo_links_stopwords_shortened"] = df["Comment_wo_links_stopwords"].apply(lambda x: reduce_sentence_length(x))
df["Comment_wo_links_stopwords_shortened_tokens"] = df["Comment_wo_links_stopwords_shortened"].apply(lambda x: tokenizer(x,return_tensors="tf"))
df["Output"] = df["Comment_wo_links_stopwords_shortened_tokens"].apply(lambda x: model(x))
df["Scores"] = df["Output"].apply(lambda x: x[0][0].numpy())
df["Scores"] = df["Scores"].apply(lambda x: softmax(x))

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
# Function to get the label with the highest probability
def get_highest_label(scores):
    ranking = np.argsort(scores)[::-1]
    highest_label_index = ranking[0]
    highest_label = model.config.id2label[highest_label_index]
    return highest_label

# Apply the function to get the highest label for each row
df["Highest_Label"] = df["Scores"].apply(lambda x: get_highest_label(x))

In [32]:
df["Highest_Label"]

0       negative
1       negative
2       negative
3       negative
4       negative
          ...   
2084    negative
2085    negative
2086    positive
2087    positive
2088    positive
Name: Highest_Label, Length: 2089, dtype: object

In [33]:
# Unwrap 'label' from each list element into a new column
df['label'] = result.apply(lambda x: x[0]['label'])

In [34]:
df["score"] = result.apply(lambda x: x[0]['score'])

In [35]:
df

Unnamed: 0,Subreddit,Thread_Title,Thread_Score,Thread_URL,Thread_Num_Comments,Thread_Flair,Thread_Selftext,Comment_Body,Comment_Score,Comment_Author,...,Comment_Body_Tokenized,Comment_wo_links_stopwords,Comment_Body_Cleaned_Shortened,Comment_wo_links_stopwords_shortened,Comment_wo_links_stopwords_shortened_tokens,Output,Scores,Highest_Label,label,score
0,worldnews,US: Hamas formally rejected hostage deal,1588,https://www.jns.org/us-hamas-formally-rejected...,261,Israel/Palestine,,NO WAY!!! 🤯🤯🤯 /s,893,psychotimelord_,...,[way],no way s,way,no way s,"[input_ids, attention_mask]","{'logits': ((tf.Tensor(0.09184745, shape=(), d...","[0.36424628, 0.31076685, 0.32498685]",negative,neutral,0.382617
1,worldnews,US: Hamas formally rejected hostage deal,1588,https://www.jns.org/us-hamas-formally-rejected...,261,Israel/Palestine,,It's almost like hamas was the problem all along.,915,Mhdamas,...,"[almost, like, hamas, problem, along]",it s almost like hamas was the problem all along,almost like hamas problem along,it s almost like hamas was the problem all along,"[input_ids, attention_mask]","{'logits': ((tf.Tensor(0.14675869, shape=(), d...","[0.3819531, 0.3057211, 0.31232584]",negative,neutral,0.377841
2,worldnews,US: Hamas formally rejected hostage deal,1588,https://www.jns.org/us-hamas-formally-rejected...,261,Israel/Palestine,,Terrorists being terrorists….\n\nShocking,164,HisGibness,...,"[terrorist, terrorist, shocking]",terrorists being terrorists shocking,terrorist terrorist shocking,terrorists being terrorists shocking,"[input_ids, attention_mask]","{'logits': ((tf.Tensor(0.23327309, shape=(), d...","[0.40154582, 0.28407127, 0.31438294]",negative,neutral,0.352980
3,worldnews,US: Hamas formally rejected hostage deal,1588,https://www.jns.org/us-hamas-formally-rejected...,261,Israel/Palestine,,Israel will still get blamed.,405,icenoid,...,"[israel, still, get, blamed]",israel will still get blamed,israel still get blamed,israel will still get blamed,"[input_ids, attention_mask]","{'logits': ((tf.Tensor(0.23470455, shape=(), d...","[0.4303391, 0.28448442, 0.28517646]",negative,neutral,0.390202
4,worldnews,US: Hamas formally rejected hostage deal,1588,https://www.jns.org/us-hamas-formally-rejected...,261,Israel/Palestine,,We didn't expect this from these nice masked f...,66,tenonic,...,"[expect, nice, masked, fella]",we didn t expect this from these nice masked f...,expect nice masked fella,we didn t expect this from these nice masked f...,"[input_ids, attention_mask]","{'logits': ((tf.Tensor(0.12627333, shape=(), d...","[0.35429296, 0.31449333, 0.3312137]",negative,neutral,0.404022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2084,ForeignPolicy,U.S. warned Hezbollah it can't hold Israel bac...,3,https://www.axios.com/2024/06/25/us-warned-hez...,2,,,The US can’t hold Israel back irrespective.,3,Prior_Analytics,...,"[u, hold, israel, back, irrespective]",the us can t hold israel back irrespective,u hold israel back irrespective,the us can t hold israel back irrespective,"[input_ids, attention_mask]","{'logits': ((tf.Tensor(0.12399446, shape=(), d...","[0.3751612, 0.31800425, 0.30683458]",negative,neutral,0.379616
2085,ForeignPolicy,U.S. warned Hezbollah it can't hold Israel bac...,3,https://www.axios.com/2024/06/25/us-warned-hez...,2,,,Why do they think Hezbollah should be afraid o...,2,RandomAndCasual,...,"[think, hezbollah, afraid, holden, back, israel]",why do they think hezbollah should be afraid o...,think hezbollah afraid holden back israel,why do they think hezbollah should be afraid o...,"[input_ids, attention_mask]","{'logits': ((tf.Tensor(0.16235295, shape=(), d...","[0.38516906, 0.29857755, 0.31625345]",negative,positive,0.358613
2086,ForeignPolicy,Biggest US foreign policy/geopolitical wins in...,13,https://www.reddit.com/r/foreignpolicy/comment...,3,,"I don't mean to see only negatives, but it app...",Well the United States’ biggest win is probabl...,16,Njegos1789,...,"[well, united, state, biggest, win, probably, ...",well the united states biggest win is probably...,well united state biggest win probably incorpo...,well the united states biggest win is probably...,"[input_ids, attention_mask]","{'logits': ((tf.Tensor(-0.09282714, shape=(), ...","[0.323959, 0.31129536, 0.36474562]",positive,neutral,0.369694
2087,ForeignPolicy,Biggest US foreign policy/geopolitical wins in...,13,https://www.reddit.com/r/foreignpolicy/comment...,3,,"I don't mean to see only negatives, but it app...",PEPFAR,5,pm_me_ur_bidets,...,[pepfar],pepfar,pepfar,pepfar,"[input_ids, attention_mask]","{'logits': ((tf.Tensor(0.04491749, shape=(), d...","[0.34096968, 0.28091195, 0.37811837]",positive,positive,0.361505


In [36]:
# Save DataFrame to CSV (optional)
df.to_csv('filtered_subreddit_threads_comments.csv', index=False)