# PRICING CALCULATIONS

In [1]:
import numpy as np
import pandas as pd

from os import listdir
from os.path import isfile, join

In [2]:
pricing_files = [f for f in listdir(r'..\PricingData') if isfile(join(r'..\PricingData', f))]
coin_names = [c.split('Historical')[0] for c in pricing_files]

In [3]:
data = None
for p in pricing_files:
    curr_coin = p.split('Historical')[0]
    curr_pricing = pd.read_csv(f"..\PricingData\{p}")[['Date', 'Close/Last']]
    curr_pricing.rename(columns={'Close/Last': curr_coin+"_Close"}, inplace=True)
    
    vol_window = 7
    
    curr_pricing[curr_coin+"_Return"] = curr_pricing[curr_coin+"_Close"][::-1].pct_change()[::-1]
    curr_pricing[curr_coin+"_Vol"] = curr_pricing[curr_coin+"_Return"][::-1].rolling(vol_window).std()
    
    if data is None:
        data = curr_pricing
    else:
        data = pd.merge(data, curr_pricing, on="Date", how="outer")

In [None]:
data.to_csv("..\CleanData\pricing\coin_metrics.csv")

In [5]:
data.describe()

Unnamed: 0,Bitcoin_Close,Bitcoin_Return,Bitcoin_Vol,BNB_Close,BNB_Return,BNB_Vol,Doge_Close,Doge_Return,Doge_Vol,Ethereum_Close,Ethereum_Return,Ethereum_Vol
count,1796.0,1795.0,1789.0,1796.0,1795.0,1789.0,1796.0,1795.0,1789.0,1796.0,1795.0,1789.0
mean,30931.687472,0.002664,0.033313,264.632409,0.003445,0.040894,0.09098,0.006067,0.055533,1692.004827,0.002953,0.041218
std,20348.284242,0.04762,0.033888,208.187858,0.0542,0.035091,0.096016,0.099559,0.080772,1250.498425,0.051914,0.031381
min,3864.9,-0.332928,0.001617,8.6031,-0.445273,0.001505,0.00154,-0.282809,0.002733,116.18,-0.321036,0.003395
25%,10937.55,-0.014881,0.019733,28.130125,-0.01794,0.022958,0.003072,-0.023336,0.026044,341.175,-0.019441,0.025599
50%,26944.65,0.001674,0.027882,283.65,0.000988,0.033019,0.07196,0.0,0.038044,1670.91,0.002065,0.036207
75%,47144.45,0.017819,0.038489,416.0125,0.021868,0.047191,0.139289,0.021308,0.05906,2653.755,0.02329,0.049413
max,90015.6,1.275501,0.481298,704.09,0.938827,0.354579,0.702987,2.813023,1.0839,4796.78,1.102635,0.416667


# TEXT CLEANING

In [2]:
import re
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax

In [3]:
# Load the tokenizer and model
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [4]:
reddit_dirs = ["..\RedditData\\" + dir for dir in listdir(r'..\RedditData') if not isfile(join(r'..\RedditData', dir))]

reddit_files = []
reddit_filenames = []
for i in reddit_dirs:
    for j in listdir(i):
        if j != ".DS_Store" and isfile(join(i,j)):
            reddit_filenames.append(j)
            reddit_files.append(pd.read_csv(join(i, j)))

In [None]:
emoji_dict = {
    "🚀": "bullish",          # Rocket
    "🔥": "trending",         # Fire
    "💎": "strong hands",     # Diamonds
    "🙌": "positive",         # Raising hands
    "📈": "price increase",   # Upward chart
    "🙂": "happy",            # Slightly Smiling face
    "😊": "happy",            # Smiling face
    "😁": "excited",          # Grinning face
    "👍": "thumbs up",        # Thumbs up
    "👏": "applause",         # Clapping hands
    "🤩": "amazing",          # Star-struck face
    "🎉": "celebration",      # Party popper
    "💰": "profit",           # Money bag
    "🥳": "success",          # Party face
    "😎": "cool",             # Smiling face with sunglasses
    "🏆": "achievement",      # Trophy
    "📉": "bearish",          # Downward chart
    "😢": "sad",              # Crying face
    "😡": "angry",            # Angry face
    "❤️": "love",
    "💔": "heartbroken",      # Broken heart
    "👎": "thumbs down",      # Thumbs down
    "😭": "very sad",         # Loudly crying face
    "🤬": "furious",          # Swearing face
    "😨": "fear",             # Fearful face
    "😤": "frustrated",       # Face with steam
    "⚠️": "warning",          # Warning sign
    "🛑": "stop",             # Stop sign
    "😱": "shocked",          # Face screaming in fear
    "🙁": "disappointed",     # Slightly frowning face
    "😞": "disheartened",     # Disappointed face
    "🤔": "thinking",         # Thinking face
    "😐": "neutral",          # Neutral face
    "🤷": "unsure",           # Person shrugging
    "🔍": "analysis",         # Magnifying glass
    "🤑": "money-focused",    # Money-mouth face
    "🏦": "bank",             # Bank building
    "✋": "pause",            # Raised hand
    "📊": "data",             # Bar chart
    "❓": "question",         # Question mark
    "❗": "important",        # Exclamation mark
    "🦍": "ape",              # Strong/bullish holder
    "🌕": "to the moon",      # Moon
    "🐻": "bearish",          # Bear
    "🐂": "bullish",          # Bull
    "🏴‍☠️": "risk",             # Pirate flag (risky move)
    "🪙": "crypto",           # Coin
    "🔒": "secure",           # Lock
    "⚡": "volatility",       # Lightning bolt
}


In [11]:
emoji_pattern = re.compile(
    "[\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
    "\U0001F680-\U0001F6FF"  # Transport & Map Symbols
    "\U0001F700-\U0001F77F"  # Alchemical Symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols & Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002702-\U000027B0"  # Dingbats
    "\U000024C2-\U0001F251"  # Enclosed Characters
    "]+", 
    flags=re.UNICODE
)

def emoji_sentiment(text):
    # Replace each emoji in the text with its sentiment
    if pd.notna(text):
        for emoji, sentiment in emoji_dict.items():
            text = text.replace(emoji, sentiment)

        text = emoji_pattern.sub("", text)
    return text

# Define max_length based on the model's capacity
max_length = 512

device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
model = model.to(device)

# Function to predict sentiment
def predict_sentiment(text):
    if not isinstance(text, str):
        return "neutral", 0.0
        
    # Tokenize the input text with truncation and padding
    inputs = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt"  # Return PyTorch tensors
    ).to(device)
    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)
    # Apply softmax to get probabilities
    scores = outputs[0][0].cpu().numpy()
    scores = softmax(scores)
    # Get the label with the highest score
    labels = ['negative', 'neutral', 'positive']
    max_score_index = scores.argmax()
    sentiment = labels[max_score_index]
    confidence = scores[max_score_index]
    return sentiment, confidence

In [7]:
reddit_files[0]

Unnamed: 0,post_id,comment_id,comment_body,comment_score,comment_created,comment_timestamp,subreddit
0,1gthjwp,lxpebqk,"If we're in a parábola, wouldn't it stay at th...",2,2024-11-17 21:59:46,1.731902e+09,altcoin
1,1gthjwp,lxngblp,I’m seeing 82 on CMC,1,2024-11-17 15:03:35,1.731877e+09,altcoin
2,1gthjwp,lxnposm,"Sorry, what is CMC? Newbie here :)",1,2024-11-17 15:54:38,1.731880e+09,altcoin
3,1gthjwp,lxoman0,Coin market cap website,1,2024-11-17 19:02:16,1.731892e+09,altcoin
4,1gtcyw0,lxo9tsf,Check out https://coinmarketcap.com/currencies...,1,2024-11-17 17:49:42,1.731887e+09,altcoin
...,...,...,...,...,...,...,...
16662,1e2xdry,ld47vo9,seed is the secret.\n\nThey address helps if t...,1,2024-07-14 04:32:17,1.720950e+09,BitcoinBeginners
16663,1e2xdry,ld4ddnr,Seed contains private keys and all your wallet...,1,2024-07-14 05:35:46,1.720953e+09,BitcoinBeginners
16664,1e2xdry,ld5uvhm,The seed phrase (12 or 24 random words) is all...,1,2024-07-14 11:55:58,1.720976e+09,BitcoinBeginners
16665,1e2xdry,ld9odgk,The seed phrase gets the hacker full access. T...,1,2024-07-15 04:53:12,1.721037e+09,BitcoinBeginners


In [None]:
# MAKE SURE TO HAVE GPU SET UP BEFORE RUNNING, TAKES FOREVER ON CPU

# for file in range(len(reddit_files)):
#     print("Cleaning file ", file)
#     i = reddit_files[file]
#     if 'title' in i.columns:
#         i['text'] = i['title'] + i['body']
#         i['text'] = i['text'].apply(emoji_sentiment)
#         i['text'] = i['text'].fillna("").astype(str)
#         i[['sentiment', 'confidence']] = i['text'].apply(lambda x: pd.Series(predict_sentiment(x)))
#         i['weight'] = i['score']
#         i['date'] = pd.to_datetime(i['created']).dt.date
#     elif 'comment_body' in i.columns:
#         i['text'] = i['comment_body'].apply(emoji_sentiment)
#         i['text'] = i['text'].fillna("").astype(str)
#         i[['sentiment', 'confidence']] = i['text'].apply(lambda x: pd.Series(predict_sentiment(x)))
#         i['weight'] = i['comment_score']
#         i['date'] = pd.to_datetime(i['comment_created']).dt.date
      
#     reddit_files[file] = i[['date', 'text', 'sentiment', 'confidence', 'weight']]

In [38]:
reddit_files[0]

Unnamed: 0,date,text,weight
0,2024-11-17,"If we're in a parábola, wouldn't it stay at th...",2
1,2024-11-17,I’m seeing 82 on CMC,1
2,2024-11-17,"Sorry, what is CMC? Newbie here :)",1
3,2024-11-17,Coin market cap website,1
4,2024-11-17,Check out https://coinmarketcap.com/currencies...,1
...,...,...,...
16662,2024-07-14,seed is the secret.\n\nThey address helps if t...,1
16663,2024-07-14,Seed contains private keys and all your wallet...,1
16664,2024-07-14,The seed phrase (12 or 24 random words) is all...,1
16665,2024-07-15,The seed phrase gets the hacker full access. T...,1


In [None]:
# for i in range(len(reddit_files)):
#     reddit_files[i].to_csv(f"..\CleanData\text\{reddit_filenames[i]}")

In [39]:
cleaned_text = [pd.read_csv('..\CleanData\\text\\' + file) for file in listdir(r'..\CleanData\text')]

In [40]:
def numerical_sentiment(sentiment):
    if sentiment == "positive":
        return 1
    elif sentiment == "negative":
        return -1
    return 0

In [None]:
total_sentiment = pd.concat(cleaned_text)
total_sentiment["conviction"] = total_sentiment["sentiment"].apply(numerical_sentiment) * total_sentiment["confidence"]
total_sentiment['weighted conviction'] = total_sentiment["conviction"] * total_sentiment["weight"]

In [73]:
final_data = total_sentiment.groupby('date')[['weight', 'weighted conviction']].sum()
final_data['normalized weighted conviction'] = final_data['weighted conviction'] / final_data['weight']

In [79]:
final_data['conviction avg'] = total_sentiment.groupby('date')['conviction'].mean()
final_data['conviction std'] = total_sentiment.groupby('date')['conviction'].std()
final_data['weighted conviction avg'] = total_sentiment.groupby('date')['weighted conviction'].mean()
final_data['weighted conviction std'] = total_sentiment.groupby('date')['weighted conviction'].std()

# rationale here is that std is only NAN when there is only 1 datapoint for the day, thus there is no deviation
final_data.fillna(0, inplace=True)

In [84]:
final_data.to_csv(r"..\CleanData\text\metrics\metrics.csv")

# COMBINE PRICING AND TEXT

In [130]:
metrics = pd.read_csv(r"..\CleanData\text\metrics\metrics.csv")
pricing = pd.read_csv(r"..\CleanData\pricing\coin_metrics.csv")

In [131]:
metrics.rename(columns={'date': 'Date'}, inplace=True)
metrics['Date'] = pd.to_datetime(metrics['Date'], errors='coerce')
pricing['Date'] = pd.to_datetime(pricing['Date'], errors='coerce')

In [133]:
pd.merge(pricing, metrics, on="Date", how="left").sort_values(by='Date')

Unnamed: 0.1,Unnamed: 0,Date,Bitcoin_Close,Bitcoin_Return,Bitcoin_Vol,BNB_Close,BNB_Return,BNB_Vol,Doge_Close,Doge_Return,...,Ethereum_Close,Ethereum_Return,Ethereum_Vol,weight,weighted conviction,normalized weighted conviction,conviction avg,conviction std,weighted conviction avg,weighted conviction std
392,264,2019-03-08,3865.9,,,14.2897,,,0.001949,,...,133.85,,,,,,,,,
401,270,2019-03-09,3944.4,0.020306,,14.4651,0.012275,,0.001978,0.014879,...,137.52,0.027419,,,,,,,,
410,276,2019-03-10,3915.2,-0.007403,,14.2700,-0.013488,,0.001961,-0.008595,...,135.72,-0.013089,,,,,,,,
418,282,2019-03-11,3865.1,-0.012796,,14.3498,0.005592,,0.001940,-0.010709,...,133.42,-0.016947,,,,,,,,
426,288,2019-03-12,3886.0,0.005407,,15.3400,0.069004,,0.001995,0.028351,...,133.80,0.002848,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2137,1598,2024-11-12,86313.8,-0.027211,0.041142,599.8400,-0.078176,0.043601,0.355835,-0.097829,...,3156.75,-0.055795,0.044676,75449.0,40895.668860,0.542031,0.734035,0.347952,12.167709,96.485719
2144,1603,2024-11-13,90015.6,0.042888,0.040718,630.6000,0.051280,0.045761,0.392876,0.104096,...,3232.51,0.023999,0.034718,85728.0,48883.336230,0.570214,0.729796,0.355060,11.266037,82.484825
2143,1603,2024-11-13,90015.6,0.042888,0.040718,630.6000,0.051280,0.045761,0.392876,0.104096,...,3232.51,0.023999,0.034718,3121.0,2606.369325,0.835107,0.774988,0.359990,78.980889,304.579140
2150,1608,2024-11-14,87221.6,-0.031039,0.046462,609.5500,-0.033381,0.048271,0.356423,-0.092785,...,3023.95,-0.064520,0.046723,49787.0,29622.551170,0.594986,0.739009,0.351991,7.514599,45.919238
