In [None]:
import os
import sys
sys.path.append(os.path.join(os.getcwd(), '..'))

import pickle
import nltk
import numpy as np
import pandas as pd
from datetime import datetime

from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

from src import clean_text, get_sentiment

In [31]:
# Directory (relative to the notebook's working directory) that holds the NLTK resources.
NLTK_DATA_DIR = './nltk_data'

# NLTK resources fetched by this notebook.
# Presumably required by `clean_text` from `src` -- TODO confirm against that module.
NLTK_PACKAGES = (
    'averaged_perceptron_tagger',
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
)

for package in NLTK_PACKAGES:
    nltk.download(package, download_dir=NLTK_DATA_DIR)

# Register the search paths only once so that re-running this cell does not keep
# growing `nltk.data.path` with duplicate entries.
# NOTE(review): the second entry is a machine-specific absolute path; it is kept for
# backward compatibility but should be replaced by a project-relative path.
for data_path in (NLTK_DATA_DIR, '/home/hyunsu/project/ku_stat/nltk_data'):
    if data_path not in nltk.data.path:
        nltk.data.path.append(data_path)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     ./nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to ./nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to ./nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to ./nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to ./nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [32]:
## DistilBERT
# Build a "sentiment-analysis" pipeline directly from the checkpoint name.
model_checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
distil_bert_model = pipeline(task="sentiment-analysis", model=model_checkpoint)

## FinBERT
# NOTE: `model_checkpoint` is rebound here; after this cell it names the FinBERT
# checkpoint, not the DistilBERT one.
model_checkpoint = "yiyanghkust/finbert-tone"
# The model is loaded with a 3-class classification head (num_labels=3).
finbert = BertForSequenceClassification.from_pretrained(model_checkpoint,num_labels=3)
tokenizer = BertTokenizer.from_pretrained(model_checkpoint)
# Model and tokenizer are passed explicitly to the pipeline instead of a checkpoint name.
finbert_model = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer)

  return torch.load(checkpoint_file, map_location="cpu")


In [33]:
def convert_utc(utc_time):
    """Convert a POSIX timestamp to a naive ``datetime`` expressed in UTC.

    Parameters
    ----------
    utc_time : int or float
        Seconds since the Unix epoch.

    Returns
    -------
    datetime.datetime
        Naive (``tzinfo is None``) datetime holding the UTC wall-clock time.
    """
    # `datetime.utcfromtimestamp` is deprecated since Python 3.12. Build the
    # timezone-aware value and drop the tzinfo so callers still get a naive datetime.
    from datetime import timezone

    return datetime.fromtimestamp(utc_time, tz=timezone.utc).replace(tzinfo=None)

In [42]:
def top_posts_subreddit_pipeline(reddit_data, sentiment_model):
    """Build a post-level DataFrame with cleaned text, sentiment and date parts.

    Parameters
    ----------
    reddit_data
        Anything accepted by ``pd.DataFrame``; the resulting frame must contain the
        columns ``title``, ``selftext`` and ``created_utc``.
    sentiment_model
        Model object forwarded unchanged to ``get_sentiment``.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``get_sentiment`` with the added columns
        ``timestamp``, ``year``, ``month`` and ``day`` (plus ``all_text`` and
        ``clean_title`` created before the sentiment step).

    Raises
    ------
    KeyError
        If ``title``, ``selftext`` or ``created_utc`` is missing.
    """
    df = pd.DataFrame(reddit_data)

    # Join title and body with a space so the last word of the title does not fuse
    # with the first word of the body, and treat a missing title/body as empty text
    # instead of letting NaN wipe out the whole concatenation.
    title = df['title'].fillna('')
    body = df['selftext'].fillna('')
    df['all_text'] = (title + ' ' + body).str.strip()

    # The column keeps its historical name even though it holds title + body.
    df['clean_title'] = df['all_text'].apply(clean_text)
    df = get_sentiment(df, 'clean_title', sentiment_model)
    df['timestamp'] = df['created_utc'].apply(convert_utc)

    df['year'] = df['timestamp'].dt.year
    df['month'] = df['timestamp'].dt.month
    df['day'] = df['timestamp'].dt.day

    return df

In [43]:
def comments_pipeline(df, comment_column, column_to_clean, sentiment_model, post_position=0):
    """Expand one post's comments into a DataFrame and score their sentiment.

    Parameters
    ----------
    df : pandas.DataFrame
        Post-level frame; ``df[comment_column]`` is expected to hold, per row, data
        accepted by ``pd.DataFrame`` (e.g. a list of comment records).
    comment_column : str
        Name of the column in ``df`` that stores the comments.
    column_to_clean : str
        Name of the comment field to clean and score.
    sentiment_model
        Model object forwarded unchanged to ``get_sentiment``.
    post_position : int, default 0
        Positional row of ``df`` whose comments are processed. The default keeps the
        historical behaviour of using the first post.

    Returns
    -------
    pandas.DataFrame
        * ``df`` itself when ``comment_column`` is not one of its columns;
        * the raw comments frame when it lacks ``column_to_clean``;
        * otherwise the comments frame returned by ``get_sentiment`` with the extra
          columns ``clean_<column_to_clean>`` and ``timestamp``.

    Raises
    ------
    IndexError
        If ``post_position`` is out of range (including an empty ``df``).
    KeyError
        If the comments lack ``created_utc`` when the sentiment branch is taken.
    """
    if comment_column not in df.columns:
        return df

    # Positional access avoids the chained label lookup, which returns a Series
    # (not a single post's comments) when the index contains duplicate labels.
    comments_df = pd.DataFrame(df[comment_column].iloc[post_position])

    if column_to_clean not in comments_df.columns:
        return comments_df

    clean_column = f'clean_{column_to_clean}'
    comments_df[clean_column] = comments_df[column_to_clean].apply(clean_text)
    comments_df = get_sentiment(comments_df, clean_column, sentiment_model)
    comments_df['timestamp'] = comments_df['created_utc'].apply(convert_utc)
    return comments_df

In [44]:
def analyze_sentiment(df, post_id):
    """Return the rows of ``df`` whose ``title`` equals ``post_id``.

    NOTE(review): despite the parameter name, the value is compared against the
    ``title`` column, not an id column -- confirm this is intended.
    """
    title_matches = df['title'] == post_id
    selected_rows = df[title_matches]
    return selected_rows

In [45]:
# Load the pickled Reddit sample from disk.
# SECURITY: `pickle.load` can execute arbitrary code while deserialising; only load
# pickle files that were produced by a trusted source.
file_path = './reddit_data/reddit_data_stoks_hot_10.pkl'
with open(file_path, 'rb') as f:
    reddit_data_sample = pickle.load(f)

In [46]:
# Run the post-level pipeline on the loaded sample, using the FinBERT pipeline
# as the sentiment model.
reddit_data_sample_transformed = top_posts_subreddit_pipeline(
    reddit_data_sample,
    sentiment_model=finbert_model,
)

In [47]:
# Expand the 'comments' column of the transformed posts and score each comment's
# 'body' text with the FinBERT pipeline.
reddit_data_sample_comment_transformed = comments_pipeline(
    reddit_data_sample_transformed,
    'comments',
    'body',
    sentiment_model=finbert_model,
)

In [48]:
# Display the processed comments frame as this cell's output.
reddit_data_sample_comment_transformed

Unnamed: 0,body,author,score,created_utc,is_top_level,parent_id,depth,gilded,clean_body,sentiment_clean_body_label,sentiment_clean_body_score,timestamp
0,Late 20s. Decided to keep it simple on splitti...,Hariharan235,13,1.744035e+09,True,t3_1j0w73o,0,0,late decide keep simple splitting position adj...,Neutral,0.999952,2025-04-07 14:13:26
1,100% S&P and bricking it,Jimlad73,14,1.741096e+09,True,t3_1j0w73o,0,0,sp bricke,Neutral,0.992054,2025-03-04 13:54:48
2,"100% 6-month tbills, lol :)",inopia,11,1.741652e+09,True,t3_1j0w73o,0,0,month tbill lol,Neutral,0.999989,2025-03-11 00:18:18
3,Ticker & %\n\nSPGI: 18.2%\n\nASML: 15.2%\n\nGO...,elgrandorado,7,1.741881e+09,True,t3_1j0w73o,0,0,ticker spgi asml goog mco hwkn ntdoy v manh,Neutral,0.999991,2025-03-13 15:57:56
4,VOO 19.5%\n\nTSLA 19%\n\nGOOG 14.5%\n\nBRK.B 4...,FromTheBottomO_o,7,1.745790e+09,True,t3_1j0w73o,0,0,voo tsla goog brkb amzn aapl nvda,Neutral,0.999992,2025-04-27 21:36:49
...,...,...,...,...,...,...,...,...,...,...,...,...
282,I’m sorry but being able to understand a balan...,Short-Philosophy-105,1,1.741077e+09,False,t1_mfxgp3k,4,0,sorry able understand balance sheet analyse co...,Neutral,0.999349,2025-03-04 08:22:04
283,[deleted],,1,1.741077e+09,False,t1_mfxgwyz,5,0,delete,Neutral,0.986192,2025-03-04 08:24:55
284,[deleted],,1,1.741077e+09,False,t1_mfxh6pr,6,0,delete,Neutral,0.986192,2025-03-04 08:27:32
285,[deleted],,1,1.741077e+09,False,t1_mfxhfgm,7,0,delete,Neutral,0.986192,2025-03-04 08:29:00
