In [1]:
import math
import pandas as pd
from tqdm.notebook import tqdm as tqdm
from transformers import pipeline
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# from vaderSentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from data_utils import load_reddit_data
from utils import set_pandas_display_options

In [2]:
%load_ext autoreload
%autoreload 2   

In [3]:
set_pandas_display_options()

In [None]:
# load corpus
df = load_reddit_data()

# Prepare corpus

In [12]:
# remove missings
df = df.dropna(subset = "selftext")

# filter corpus
df = df[df["removed"] != 1]
df = df[df["author"] != "rBitcoinMod"]
df = df[df["num_comments"] > 10]


In [13]:
df.shape

(7978, 25)

In [14]:
# convert to datetime
df["datetime"] = pd.to_datetime(df["created"], unit="s")\

# Set 'datetime' column as the index
# assert len(df["datetime"].unique()) == df.shape[0]
# df.set_index('datetime', inplace=True, drop = True) # dont do this as datetime is not unique

# Relevance of posts
Classify whether posts are related to **bitcoin trading**

Mistral LLM automatic tagging

In [15]:
# mistral model

# mist = mistral_relevant_text_identification(df, "selftext")
# mist.to_csv("mistral_tagging.csv", index=False)

# load checkpoint
mist = pd.read_csv("mistral_tagging.csv")

simple keyword search

In [16]:
KW_BITCOIN = ["bitcoin", "btc"]
KW_TRADING = ["trading", "trade", "market", "price", "value", "money", "indicator", "future", "derivative", "news",
              "invest", "buy", "bought", "sell", "sold", "win", "won", "bull", "bear", "pump", "dump",
              "hodl", "moon", "walls", "bagholder", "whale", "fomo"]
# TODO: are wallet posts trading related?
# KW_WALLET = ["wallet"]

df["contains_kw_bitcoin"] = df["selftext"].str.lower().str.contains("|".join(KW_BITCOIN))
df["contains_kw_trading"] = df["selftext"].str.lower().str.contains("|".join(KW_TRADING))
# df["contains_kw_wallet"] = df["selftext"].str.lower().str.contains("|".join(KW_WALLET))
df["contains_kw_bitcoin_trading"] = df["contains_kw_bitcoin"] & df["contains_kw_trading"] # & ~df["contains_kw_wallet"]

In [17]:
df["contains_kw_bitcoin_trading"].value_counts()

contains_kw_bitcoin_trading
True     4029
False    3949
Name: count, dtype: int64

In [18]:
# df["selftext"].str.split(" ").explode().value_counts().head(100)

In [19]:
df[["contains_kw_bitcoin_trading", "selftext"]].head()

Unnamed: 0,contains_kw_bitcoin_trading,selftext
6,True,"I only have .03 btc but I still feel early as I'm the only one in my family or friendgroup to have any btc.\n\nIf you, in the start of 2022, know enough about BTC to have some already, you're EARLY. I know we're all sad we didn't buy at .o2 cents (or whatever it was) but it's NEVER too late to start making better decisions. \n\nSo here's to all my Sat Stackers who are slowly, humbly, going to make it. \n\nCheers! Here's to BTC at any cost!"
23,True,"Just a genuine question, Iâ€™ve googled biggest bitcoin wallets and I could only see Binance and a few other exchanges at the top. With 1 person in the billions but no where near the amount Satoshi has which I heard is 1million bitcoins? Anyway are his bitcoins counted as lost? Or still part of the supply? If it was factually announced they were lost forever somehow this would surge price because of the [law of demand](https://www.investopedia.com/terms/l/law-of-supply-demand.asp). Also would it be plausible for him to have split his bitcoins across multiple wallets and drain each wallet individually? I understand transaction history has Time stamps but Iâ€™m not sure if that was a thing in 2010\n\n So my question is Satoshi are you dead and if your not dead where are your bitcoins?"
32,True,If people see the value going up then why is it a must as someone whoâ€™s not invested in Bitcoin to wanna do it now. Wouldnâ€™t other crypto coins be more valuable to those who havenâ€™t yet.
43,False,"Thank god it is on sale, idk that I could take it if I had to pay so much for it this soon."
47,True,"I bought $1,000 AUD in BTC on 29th Jan 2021.\n\n[https://www.blockchain.com/btc/address/1GGqWYmYK78xEVxe9Xm6Fm8d1gd9rYMn9p](https://www.blockchain.com/btc/address/1GGqWYmYK78xEVxe9Xm6Fm8d1gd9rYMn9p)\n\nWhen I try to access the bitcoin wallet I can't.\n\nCan anyone help with this?\n\nThanks"


compare performance: expect mistral to positively classify finds of keyword search 

In [20]:
df["submission"] = df["submission"].astype(str)
mist["submission"] = mist["submission"].astype(str)
bench = pd.merge(df, mist[["submission", "mistral_tagging"]], on="submission")

In [21]:
bench.loc[bench["contains_kw_bitcoin_trading"], "mistral_tagging"].value_counts(dropna=False)   

mistral_tagging
NaN    3720
0.0     301
1.0       8
Name: count, dtype: int64

# Sentiment of posts
Calculate sentiment label and score 

huggingface/distilroberta

In [4]:
# MODEL = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"
# sentiment_task = pipeline("sentiment-analysis", model=MODEL, tokenizer=MODEL)

def remove_special_chars(text):
    return ''.join(e for e in text if e.isalnum() or e.isspace())

# sentiments_label = []
# sentiments_score = []
# for txt in tqdm(df["selftext"].values):
#     txt = remove_special_chars(txt)
#     try:
#         sentiment = sentiment_task(txt)
#         sentiments_label.append(sentiment[0]["label"])
#         sentiments_score.append(sentiment[0]["score"])
#     except Exception as e:
#         sentiments_label.append("N/A")
#         sentiments_score.append(0)

# load checkpoint
roberta_sent = pd.read_csv("bitcoin_2022/sentiment.csv")

In [5]:
roberta_sent["sentiment_label"].value_counts()

sentiment_label
neutral     12655
positive     1524
negative     1393
Name: count, dtype: int64

nltk + vaderSentiment

In [31]:
# nltk.download('all')

In [27]:
# create preprocess_text function
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())

    # Remove stop words
    filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]

    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # Join the tokens back into a string
    processed_text = ' '.join(lemmatized_tokens)

    return processed_text

# apply the function df

df['selftext_processed'] = df['selftext'].apply(preprocess_text)

In [78]:
# initialize NLTK sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# get sentiment scores
df["sentiment_scores_vader"] = df['selftext_processed'].apply(analyzer.polarity_scores)

# get maximum score label
def get_sentiment_label(scores):
    del scores["compound"]
    label = max(scores, key=scores.get)
    return label
df["sentiment_label_vader"] = df["sentiment_scores_vader"].apply(get_sentiment_label)

# get score for maximum scored label
df["sentiment_score_vader"] = df[["sentiment_scores_vader", "sentiment_label_vader"]].apply(lambda x: x["sentiment_scores_vader"][x["sentiment_label_vader"]], axis=1) 

In [79]:
df["sentiment_label_vader"].value_counts()

sentiment_label_vader
neu    7743
pos     172
neg      63
Name: count, dtype: int64

In [80]:
df[df["sentiment_label_vader"]=="pos"] # mostely very short texts

Unnamed: 0,submission,subreddit,author,created,retrieved,edited,pinned,archived,locked,removed,...,thumbnail,shortlink,datetime,contains_kw_bitcoin,contains_kw_trading,contains_kw_bitcoin_trading,selftext_processed,sentiment_scores_vader,sentiment_label_vader,sentiment_score_vader
112,rtos7o,bitcoin,KNOWYOURs3lf,1641060542,1641140399,0,0,0,0,0,...,self,https://redd.it/rtos7o,2022-01-01 18:09:02,False,False,False,thanks input .,"{'neg': 0.0, 'neu': 0.256, 'pos': 0.744}",pos,0.744
249,ru78mb,bitcoin,Altruistic_Baker_423,1641120269,1641165997,0,0,0,0,0,...,self,https://redd.it/ru78mb,2022-01-02 10:44:29,False,False,False,"thank answer , stay safe & truly game-changing 2022 ! ðŸ––","{'neg': 0.0, 'neu': 0.318, 'pos': 0.682}",pos,0.682
650,rvmajx,bitcoin,mjbasty1,1641272877,1641316710,0,0,0,0,0,...,self,https://redd.it/rvmajx,2022-01-04 05:07:57,False,False,False,successfully taken ownership key . thanks sub push knowledge . feel great .,"{'neg': 0.0, 'neu': 0.407, 'pos': 0.593}",pos,0.593
1192,rx81am,bitcoin,iPostOnlyWhenHigh,1641451611,1641495310,0,0,0,0,0,...,self,https://redd.it/rx81am,2022-01-06 06:46:51,False,True,False,? bought day . â€™ welcome .,"{'neg': 0.0, 'neu': 0.4, 'pos': 0.6}",pos,0.600
1579,ry4p6q,bitcoin,perla-madonna,1641552266,1641596829,0,0,0,0,0,...,self,https://redd.it/ry4p6q,2022-01-07 10:44:26,False,True,False,look like â€˜ good point sell,"{'neg': 0.0, 'neu': 0.357, 'pos': 0.643}",pos,0.643
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66306,zxj33r,bitcoin,Fluffy_Ad_2277,1672257096,1672305799,0,0,0,0,0,...,self,https://redd.it/zxj33r,2022-12-28 19:51:36,False,False,False,thanks advance .,"{'neg': 0.0, 'neu': 0.256, 'pos': 0.744}",pos,0.744
66314,zxjxds,bitcoin,MessierEigthySeven,1672259078,1672305799,0,0,0,0,0,...,self,https://redd.it/zxjxds,2022-12-28 20:24:38,True,False,False,want make sure defend myself/bitcoin ; ),"{'neg': 0.0, 'neu': 0.455, 'pos': 0.545}",pos,0.545
66475,zz2xlv,bitcoin,fionda08v2,1672413395,1672496405,0,0,0,0,0,...,self,https://redd.it/zz2xlv,2022-12-30 15:16:35,True,False,False,good afternoon btc community . would like ask think best way safely store btc . think paper wallet best hard wallet ?,"{'neg': 0.045, 'neu': 0.414, 'pos': 0.541}",pos,0.541
66492,zz9ttn,bitcoin,Unknownpersona1111,1672430200,1672496407,0,0,0,0,0,...,self,https://redd.it/zz9ttn,2022-12-30 19:56:40,False,False,False,skewed thing ? thanks,"{'neg': 0.0, 'neu': 0.408, 'pos': 0.592}",pos,0.592


In [6]:
# df.to_csv("bitcoin_2022/sentiment_vader.csv")
df = pd.read_csv("bitcoin_2022/sentiment_vader.csv")

Human Eval of sentiment analysis

In [7]:
eval_full = pd.merge(
    df.loc[
        # filter on bitcoin trading
        df["contains_kw_bitcoin_trading"], ["submission", "selftext", "contains_kw_bitcoin_trading", "selftext_processed", "sentiment_scores_vader", "sentiment_label_vader", "sentiment_score_vader"]], 
    roberta_sent[["submission", "sentiment_label", "sentiment_score"]], on=["submission"], how="inner")

In [8]:
n_sample = 100
eval_template = eval_full[["submission", "selftext"]].sample(n_sample).copy()
eval_template["human_label"] = pd.NA

In [11]:
import math
n_splits = 2
for i in range(2):
    split_size = math.floor(n_sample / n_splits)
    eval_template.iloc[split_size*i : split_size*(i+1)].to_csv(f"bitcoin_2022/human_sent_eval_{i}.csv", index=False)