### **import data and initalize praw**

In [168]:
import praw
import pandas as pd
from typing import List, Dict
import re 
import json
from tqdm import tqdm
import time

def load_reddit_credentials(file_path):
    """Load Reddit API credentials from file."""
    with open(file_path, 'r') as file:
        content = file.read().strip()
        if not content:
            raise ValueError("File is empty")
        credentials = json.loads(content)
        return credentials


def initialize_reddit(client_id: str, client_secret: str, user_agent: str) -> praw.Reddit:
    """Initialize and return Reddit API instance."""
    return praw.Reddit(
        client_id=client_id,
        client_secret=client_secret,
        user_agent=user_agent
    )

file_path = '/Users/andre/Documents/Python_local/sentiment_analyses/reddit_creds.json'
credentials = load_reddit_credentials(file_path)


# Access the credentials
CLIENT_ID = credentials['CLIENT_ID']
CLIENT_SECRET = credentials['CLIENT_SECRET']
USER_AGENT = credentials['USER_AGENT']

# Initialize Reddit API
reddit = initialize_reddit(CLIENT_ID, CLIENT_SECRET, USER_AGENT)

### **Phase 1: Data Collection**

**Objective:**
retrive data from Reddit using PRAW library

Key Questions: 
how to use praw and what are the data formarts i get ? 
how to create batches to respect rate limits?  
how do i structure the search for valuable data retrival 


In [240]:
# Function to search for subreddits and retrieve attributes
def search_subreddits(keyword, limit=100):
    subreddits = reddit.subreddits.search(keyword, limit=limit)
    subreddit_info = []
    for subreddit in subreddits:
        info = {
            'display_name': subreddit.display_name,
            'title': subreddit.title,
            'description': subreddit.public_description,
            'subscribers': subreddit.subscribers,
            # 'active_user_count': subreddit.active_user_count,
            # 'nsfw': subreddit.over18,
            'submission_type': subreddit.submission_type,
        }
        subreddit_info.append(info)
    return subreddit_info

# Example usage
keyword = 'sol solana'
subreddits = search_subreddits(keyword)


In [243]:
subreddits_df = pd.DataFrame(subreddits)
# df_subreddits.sort_values(by='subscribers',ascending=False).head(10)
subreddits_df
solana = ['solana','SolanaMemeCoins']
eth = ['ethereum','ethtrader','ethfinance','ethermining','ethstaker']

Unnamed: 0,display_name,title,description,subscribers,submission_type
0,solana,Solana,Welcome to the official Solana subreddit. Thi...,348527.0,any
1,SolanaMemeCoins,SolanaMemeCoins,The #1 Solana Memecoin subreddit. \n ...,50007.0,any
2,CryptoCurrency,Cryptocurrency News & Discussion,"The leading community for cryptocurrency news,...",9406166.0,any
3,SolanaNFT,Solana NFTs,NFTs on Solana.,22980.0,any
4,Solana_Memes,Solana_Memes,This is a community about Solana meme coins an...,1958.0,any
...,...,...,...,...,...
80,DogeOnSOL,DogeOnSOL,"Dive into the vibrant world of DogeOnSOL, the ...",24.0,any
81,SolGalaxy,SolGalaxy,Welcome to the SolGalaxy!\nA collection of uni...,22.0,any
82,SanctumSolana,SanctumSolana,"Hey there, curious cloudmen! Welcome to Sanctu...",154.0,any
83,BabaTokenSol,BabaTokenSol,"Meet BABA, the neglected brother of PEPE. BABA...",28.0,any


In [170]:
# get the posts from the subreddit
def get_posts(reddit: praw.Reddit, subreddits: List[str], method_limits: Dict[str, int], top_time_filter: str = 'month') -> pd.DataFrame:
    """
    Fetch posts from specified subreddits using different sorting methods.
    
    Args:
        reddit: Reddit API instance
        subreddits: List of subreddit names
        method_limits: Dictionary of method names and their post limits
        top_time_filter: Time filter for top posts ('hour', 'day', 'week', 'month', 'year', 'all')
    """
    all_posts = []
    
    for subreddit_name in tqdm(subreddits, desc='Subreddits'):
        subreddit = reddit.subreddit(subreddit_name)
        
        for method, limit in method_limits.items():
            # Handle 'top' posts separately due to time_filter parameter
            if method == 'top':
                submissions = subreddit.top(limit=limit, time_filter=top_time_filter)
            else:
                submissions = getattr(subreddit, method)(limit=limit)
            
            # Extract post data
            for submission in submissions:
                all_posts.append({
                    'id': submission.id,
                    'title': submission.title,
                    'score': submission.score,
                    'num_comments': submission.num_comments,
                    'created_utc': submission.created_utc,
                    'text': submission.selftext,
                    'subreddit': submission.subreddit.display_name,
                    'method': f"{method}_{top_time_filter}" if method == 'top' else method
                })
    
    return pd.DataFrame(all_posts)



# general subreddits + coin specific subreddits
list_of_subreddits = [
    #solana
    'solana','SolanaMemeCoins',
    #eth 
    'ethereum','ethtrader','ethfinance','ethermining','ethstaker',
    #bitcoin
    'Bitcoin', 'BitcoinBeginners', 'btc', 'BitcoinMarkets',
    #general crypto info 
    'CryptoCurrency',
 'Superstonk',
 'Crypto_General',
 'Crypto_Currency_News',
 'CryptocurrencyICO',
 'SatoshiStreetBets',
 'CryptoTradingFloor',
 'crypto',
 'CryptoCurrencies',
 'CryptoCurrencyClassic',
 'CryptoExchange',
 'CryptoNews',
 'CryptoMarkets',
 'crypto_currency']

# Define limits for each method
all_limit = 3

method_limits = {
    'hot': all_limit,
    'new': all_limit,
    'controversial': all_limit,
    'rising': all_limit,
    'top': all_limit
}

# Define time filter for top posts
filter_top_method = 'month'  # Options: 'hour', 'day', 'week', 'month', 'year', 'all'

# Get posts and create DataFrame
df = get_posts(reddit, list_of_subreddits, method_limits=method_limits, top_time_filter=filter_top_method)

df 

Subreddits: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 21/21 [00:26<00:00,  1.25s/it]


Unnamed: 0,id,title,score,num_comments,created_utc,text,subreddit,method
0,1i0zmqs,"Daily Discussion, January 14, 2025",5,10,1.736835e+09,Please utilize this sticky thread for all gene...,Bitcoin,hot
1,1f7n947,Bitcoin Newcomers FAQ - Please read!,119,29,1.725326e+09,# Welcome to the /r/Bitcoin Newcomers FAQ\n\nY...,Bitcoin,hot
2,1i0pj98,JUST IN: The largest Italian üáÆüáπ bank by total ...,1490,89,1.736805e+09,,Bitcoin,hot
3,1i10azr,Can anyone share some experience with strike,4,2,1.736838e+09,I‚Äôm based in Australia and I currently use kra...,Bitcoin,new
4,1i0zvtj,Sparrow connects randomly,1,1,1.736836e+09,"Hi, I setup a node (Linux) and when I try to c...",Bitcoin,new
...,...,...,...,...,...,...,...,...
307,1hzndcm,Techlead: How Much Bitcoin is Enough?,0,0,1.736691e+09,,crypto_currency,rising
308,1htcir7,5th Largest Bank in Turkey to Launch Crypto Tr...,1,0,1.735990e+09,,crypto_currency,rising
309,1i0w7m1,SEC Gary Gensler Takes a Huge Loss in Coinbase...,1,0,1.736823e+09,,crypto_currency,top_month
310,1htcir7,5th Largest Bank in Turkey to Launch Crypto Tr...,1,0,1.735990e+09,,crypto_currency,top_month


In [171]:
df1 = df.copy()
df1.drop_duplicates(subset='id',inplace=True)
# df1['created'] = pd.to_datetime(df['created_utc'],unit='s')
# df1.index = df1['created']
# df1.drop(columns=['created_utc','created'],inplace=True)
df1.head()
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 224 entries, 0 to 311
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            224 non-null    object 
 1   title         224 non-null    object 
 2   score         224 non-null    int64  
 3   num_comments  224 non-null    int64  
 4   created_utc   224 non-null    float64
 5   text          224 non-null    object 
 6   subreddit     224 non-null    object 
 7   method        224 non-null    object 
dtypes: float64(1), int64(2), object(5)
memory usage: 15.8+ KB


### filter keywords

In [214]:


topic_keywords = ["altcoin", "cryptocurrency", "blockchain", "decentralized", "token", "ico", "hodl", "mining", "wallet", "exchange", "volatility", "adoption", "regulation", "etf", "moon", "sentiment", "scalability", "halving", "fud", "crypto", "bull", "bear", "asset", "fomo", "whale", "ath", "dca", "dyor", "roi", "kyc", "aml", "dex", "cex", "nft", "pow", "pos", "defi", "ta", "fa", "safu", "buidl", "rekt", "ltv", "tvl", "ieo", "sto", "ido", "airdrop", "pump and dump", "distribution", "giveaway", "token distribution", "free token giveaway", "initial coin offering", "hold on for dear life", "all time high", "fear uncertainty and doubt", "dollar cost averaging", "do your own research", "return on investment", "know your customer", "anti money laundering", "decentralized exchange", "centralized exchange", "non-fungible token", "proof of work", "proof of stake", "decentralized finance", "technical analysis", "fundamental analysis", "secure asset fund for users", "loan to value", "total value locked", "initial exchange offering", "security token offering", "initial dex offering", "distribution of tokens for free", "tokens for free", "distribution of tokens"]
comment_keywords = list(set(topic_keywords))



79
79


### get comments from methods 

In [248]:

# Function to retrieve comments for a submission
def get_comments(submission_id):
    submission = reddit.submission(id=submission_id)
    submission.comments.replace_more(limit=50)
    comments = submission.comments.list()
    comments_data = []
    errocount = 0
    try:
        for comment in comments:
            comments_data.append({
                'submission_id': submission_id,
                'comment_id': comment.id,
                'comment_body': comment.body,
                'comment_score': comment.score,
                'comment_created_utc': comment.created_utc
                # 'comment_parent_id' : comment.parent_id,
                # 'comment_replies' : comment.replies

            })
    except Exception as e:
        errocount += 1
        # print(f"Error: {e}")
        comments_data.append({
            'submission_id': submission_id,
            'comment_id': e,
            'comment_body': e,
            'comment_score': e,
            'comment_created_utc': e
        })


    return comments_data
    

# Retrieve comments for each submission and create a DataFrame
all_comments = []
try:
    for submission_id in tqdm(df1['id'], desc="Processing submissions"):
        try:
            if pd.notna(submission_id):  # Ensure submission_id is not NaN
                all_comments.extend(get_comments(submission_id))
        except Exception as e:
            print(f"Error: {e}")
            time.sleep(65)
            print('sleeping for 65 seconds')
except Exception as e:
    print(f"Error: {e}")



comments_df = pd.DataFrame(all_comments)
comments_df.to_pickle('comments2_df.pkl')

comments_df




Error: An invalid value was specified for id. Check that the argument for the id parameter is not empty.


KeyboardInterrupt: 

In [233]:
comments_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20637 entries, 0 to 20636
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   submission_id        20637 non-null  object 
 1   comment_id           20637 non-null  object 
 2   comment_body         20637 non-null  object 
 3   comment_score        20637 non-null  int64  
 4   comment_created_utc  20637 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 806.3+ KB


### simple filter approach to find valuable comments 
#### filter retrived comments by keywords for preprosseing 

In [230]:
import re

def clean_text(text):
    # Use regular expression to remove non-letter characters
    text = text.lower()
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return cleaned_text

#fileter submission by keywords 
def filter_by_keywords(df, keywords,new_column_name='',input_column_name=''):
    df[new_column_name] = df[input_column_name].apply(clean_text)
    # check if any of the keywords are in the cleaned text
    df['has_keyword'] = df[new_column_name].apply(lambda x: any(keyword in x for keyword in keywords))
    #drop all columns where nokeywords are found    
    df = df[df['has_keyword']]
    return df

def time_index(df,time_column):
    df['created'] = pd.to_datetime(df[time_column],unit='s')
    df.index = df['created']
    df.drop(columns=[time_column,'created'],inplace=True)
    df.sort_index(inplace=True)
    return df

# Function to filter comments based on score threshold and print the result
def filter_and_print_comments(score_threshold,df):
    check_time_score = df[(df.index > '2025-01-14') & (df['comment_score'] > score_threshold)]
    print(f"Filtered comments with score > {score_threshold}:")
    print(len(check_time_score))


In [232]:
comments_df1 = comments_df.copy() 
print(len(comments_df1))
# keywords = ['btc','bitcoin','satoshi']

set_comments_index = time_index(comments_df1,'comment_created_utc')
filtered_comments = filter_by_keywords(comments_df1,comment_keywords, 'body_keywords', 'comment_body')
# check_time_score = filtered_comments[(filtered_comments.index > '2025-01-14')& (filtered_comments['comment_score'] > 10)]

check_time = filtered_comments[(filtered_comments.index > '2025-01-14')]

print('comment filtered',len(filtered_comments))
print('time filered',len(check_time))
filter_and_print_comments(10,filtered_comments)
filter_and_print_comments(30,filtered_comments)
filter_and_print_comments(50,filtered_comments)




20637
comment filtered 8898
time filered 279
Filtered comments with score > 10:
33
Filtered comments with score > 30:
12
Filtered comments with score > 50:
7


In [195]:
import torch
import numpy 
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("ElKulako/cryptobert")
model = AutoModelForSequenceClassification.from_pretrained("ElKulako/cryptobert")

def analyze_sentiment(text):
    """
    Analyze sentiment of crypto-related text
    Returns: sentiment score between -1 (negative) and 1 (positive)
    """
    # Tokenize and prepare input
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)

    # Get prediction
    with torch.no_grad():
        outputs = model(**inputs)
        scores = torch.nn.functional.softmax(outputs.logits, dim=1)
    
    # # scores[0] = negative
    # # scores[1] = neutral
    # # scores[2] = positive
    # sentiment_score = -1 * scores[0] + 0 * scores[1] + 1 * scores[2]

    # return sentiment_score
    return scores[0][0].item(), scores[0][1].item(), scores[0][2].item()

# Example usage
if __name__ == "__main__":
    # texts = df['processed_text'].to_list()
    texts = filtered_comments['body_keywords'].to_list()[:10]   
    for text in texts:
        sentiment = analyze_sentiment(text)
        print(f"Text: {text}")
        print(f"Sentiment score: {sentiment}\n")

Text: yay one of the biggest banks in my country  just bought 11 btc 
Sentiment score: (0.0003006509505212307, 0.27235355973243713, 0.7273457646369934)

Text: please feel free to make constructive editshttpswwwredditcomrbitcoinwikiwikirbitcoinsticky to the document they will be implemented pending mod review  also if you have any beginner questions regarding bitcoin feel free to post them in the comments below several community members are happy to help answer them  note that this thread will be moderated and nonconstructive feedback will be removed  thanks and welcome to bitcoin
Sentiment score: (0.0001637601963011548, 0.4417032301425934, 0.5581329464912415)

Text: the same bank that 4 years ago saw my bank statement and said the bank does not allow bitcoin or crypto currencies investments we cannot proceed with the granting of the mortgage 
Sentiment score: (0.003184997709468007, 0.5892398357391357, 0.40757516026496887)

Text: dang 11 btc i once had im stupid now i only have 01
Senti

In [22]:
# Search for subreddits related to a keyword
keyword = 'cryptocurrency'
subreddits = reddit.subreddits.search(keyword, limit=100)

# x = []  
# # Print the names of the subreddits found
# for subreddit in subreddits:
#     print(subreddit.display_name)
#     x.append(subreddit.display_name)

x = []  
# Print the names and additional information of the subreddits found
for subreddit in subreddits:
    print(f"Subreddit: {subreddit.display_name}")
    print(f"Subscribers: {subreddit.subscribers}")
    print(f"Active Users: {subreddit.accounts_active}")
    print(f"Description: {subreddit.public_description}")
    print("-" * 40)
    x.append(subreddit.display_name)

Subreddit: CryptoCurrency
Subscribers: 9386804
Active Users: None
Description: The leading community for cryptocurrency news, discussion, and analysis.
----------------------------------------
Subreddit: CryptocurrencyICO
Subscribers: 190448
Active Users: None
Description: r/Cryptocurrency & ICO is a hub for sharing crypto news & discussing new innovative ICO quality projects with proven utility.
----------------------------------------
Subreddit: CryptocurrencyReviews
Subscribers: 32900
Active Users: None
Description: This is a place for cryptocurrency discussion, which is the next hottest crypto? What exchange is great for trading? What are your forecasts of price? Share it here.
----------------------------------------
Subreddit: Bitcoin
Subscribers: 7555510
Active Users: None
Description: Bitcoin is the currency of the Internet: a distributed, worldwide, decentralized digital money. Unlike traditional currencies such as dollars, bitcoins are issued and managed without any central a

In [21]:
x = []  
# Print the names and additional information of the subreddits found
for subreddit in subreddits:
    print(f"Subreddit: {subreddit.display_name}")
    print(f"Subscribers: {subreddit.subscribers}")
    print(f"Active Users: {subreddit.accounts_active}")
    print(f"Description: {subreddit.public_description}")
    print("-" * 40)
    x.append(subreddit.display_name)

## example code for berttopic an spcay pre processing 


In [None]:
import praw
import pandas as pd
import spacy
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the Reddit instance
reddit = praw.Reddit(
    client_id='your_client_id',
    client_secret='your_client_secret',
    user_agent='my_reddit_app:v1.0 (by /u/your_reddit_username)'
)

# List of cryptocurrency-related subreddits
crypto_subreddits = ['cryptocurrency', 'Bitcoin', 'CryptoMarkets', 'CryptoCurrencyTrading']

# Fetch submissions from these subreddits
posts_data = []
for subreddit_name in crypto_subreddits:
    subreddit = reddit.subreddit(subreddit_name)
    for submission in subreddit.new(limit=100):  # Fetch the newest 100 submissions
        posts_data.append({
            'title': submission.title,
            'selftext': submission.selftext,
            'score': submission.score,
            'id': submission.id,
            'url': submission.url,
            'num_comments': submission.num_comments,
            'created': submission.created_utc,
            'author': str(submission.author),
            'subreddit': subreddit_name
        })

# Create a DataFrame from the posts data
df_posts = pd.DataFrame(posts_data)
df_posts['created'] = pd.to_datetime(df_posts['created'], unit='s')

# Combine title and selftext for clustering
df_posts['text'] = df_posts['title'] + ' ' + df_posts['selftext']

# Preprocess text data using SpaCy
nlp = spacy.load('en_core_web_sm')

def preprocess_text(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return ' '.join(tokens)

df_posts['processed_text'] = df_posts['text'].apply(preprocess_text)

# Initialize BERTopic model
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")
topic_model = BERTopic(vectorizer_model=vectorizer_model)

# Fit the model on the processed text data
topics, probabilities = topic_model.fit_transform(df_posts['processed_text'].tolist())

# Add the topics to the DataFrame
df_posts['topic'] = topics

# Save the clustered data to a CSV file
df_posts.to_csv('reddit_posts_bertopic.csv', index=False)

# Visualize the topics
topic_model.visualize_topics()