# PRICING CALCULATIONS

In [1]:
import numpy as np
import pandas as pd

from os import listdir
from os.path import isfile, join

In [2]:
# Directory holding the per-coin price CSVs.  Built with join() instead of a
# hard-coded "\" separator so the same notebook also resolves the path on
# macOS/Linux; on Windows the resulting string is unchanged.
pricing_dir = join('..', 'PricingData')

# Regular files only.  ".DS_Store" (macOS Finder metadata) is skipped, matching
# the filter already applied when the Reddit files are listed.
pricing_files = [
    f for f in listdir(pricing_dir)
    if f != ".DS_Store" and isfile(join(pricing_dir, f))
]

# The coin name is whatever precedes "Historical" in each file name.
coin_names = [f.split('Historical')[0] for f in pricing_files]

In [3]:
# Number of consecutive rows in each rolling-volatility window.  It does not
# depend on the coin, so it is set once instead of on every loop iteration.
vol_window = 7

# Wide table keyed by 'Date' with <coin>_Close / <coin>_Return / <coin>_Vol
# columns, built up by outer-merging one coin at a time.
data = None
for p in pricing_files:
    curr_coin = p.split('Historical')[0]
    close_col = curr_coin + "_Close"

    # The path is assembled with join(): writing it as a plain (non-raw)
    # literal with "\P" and "\{" relies on invalid escape sequences, which
    # Python 3.12+ reports as a SyntaxWarning.
    curr_pricing = (
        pd.read_csv(join('..', 'PricingData', p))[['Date', 'Close/Last']]
        .rename(columns={'Close/Last': close_col})
    )

    # Returns and volatility are computed on the row-reversed series.
    # NOTE(review): this presumes the CSV rows are ordered newest-first, so the
    # reversed order is chronological - confirm against the raw files.
    reversed_close = curr_pricing[close_col][::-1]
    reversed_return = reversed_close.pct_change()

    # Column assignment aligns on the index, so the reversed Series land on
    # their original rows without having to be reversed back.
    curr_pricing[curr_coin + "_Return"] = reversed_return
    curr_pricing[curr_coin + "_Vol"] = reversed_return.rolling(vol_window).std()

    if data is None:
        data = curr_pricing
    else:
        # Outer merge keeps dates that are present for only some of the coins.
        data = pd.merge(data, curr_pricing, on="Date", how="outer")

In [4]:
# Persist the merged coin metrics (the DataFrame index is written as the first
# column, as before).  The path is assembled with join() because "\C" and "\c"
# inside a plain string literal are invalid escape sequences (SyntaxWarning on
# Python 3.12+) and the "\" separator only resolves on Windows.
data.to_csv(join('..', 'CleanData', 'coin_metrics.csv'))

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric
# column of the merged table; the lower counts for *_Return and *_Vol reflect
# the NaNs left at the start of the pct_change / rolling windows.
data.describe()

Unnamed: 0,Bitcoin_Close,Bitcoin_Return,Bitcoin_Vol,BNB_Close,BNB_Return,BNB_Vol,Doge_Close,Doge_Return,Doge_Vol,Ethereum_Close,Ethereum_Return,Ethereum_Vol
count,1796.0,1795.0,1789.0,1796.0,1795.0,1789.0,1796.0,1795.0,1789.0,1796.0,1795.0,1789.0
mean,30931.687472,0.002664,0.033313,264.632409,0.003445,0.040894,0.09098,0.006067,0.055533,1692.004827,0.002953,0.041218
std,20348.284242,0.04762,0.033888,208.187858,0.0542,0.035091,0.096016,0.099559,0.080772,1250.498425,0.051914,0.031381
min,3864.9,-0.332928,0.001617,8.6031,-0.445273,0.001505,0.00154,-0.282809,0.002733,116.18,-0.321036,0.003395
25%,10937.55,-0.014881,0.019733,28.130125,-0.01794,0.022958,0.003072,-0.023336,0.026044,341.175,-0.019441,0.025599
50%,26944.65,0.001674,0.027882,283.65,0.000988,0.033019,0.07196,0.0,0.038044,1670.91,0.002065,0.036207
75%,47144.45,0.017819,0.038489,416.0125,0.021868,0.047191,0.139289,0.021308,0.05906,2653.755,0.02329,0.049413
max,90015.6,1.275501,0.481298,704.09,0.938827,0.354579,0.702987,2.813023,1.0839,4796.78,1.102635,0.416667


# TEXT CLEANING

In [19]:
import re
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax

In [20]:
# Load the tokenizer and the sequence-classification model for the checkpoint
# named below.  from_pretrained() fetches the checkpoint files (config, vocab,
# merges, weights) when they are not cached yet, hence the download progress
# bars in this cell's output.
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [29]:
# Root folder of the Reddit exports.  join() replaces the "..\RedditData\\"
# literal, whose "\R" is an invalid escape sequence (SyntaxWarning on Python
# 3.12+) and whose "\" separator only resolves on Windows.
reddit_root = join('..', 'RedditData')

# Every entry of the root that is not a regular file, i.e. the sub-folders.
# ("entry" rather than "dir", which would shadow the builtin.)
reddit_dirs = [
    join(reddit_root, entry)
    for entry in listdir(reddit_root)
    if not isfile(join(reddit_root, entry))
]

# Despite its name, reddit_files holds the *loaded DataFrames*, one per file
# found in the sub-folders, not the file paths.
reddit_files = []
for subdir in reddit_dirs:
    for filename in listdir(subdir):
        file_path = join(subdir, filename)
        # ".DS_Store" is macOS Finder metadata, not a data file.
        if filename != ".DS_Store" and isfile(file_path):
            reddit_files.append(pd.read_csv(file_path))

In [30]:
# Maps an emoji to the plain-text word or phrase that emoji_sentiment()
# substitutes for it before the text is scored.  Keys are matched as exact
# substrings, so an emoji written without the variation selector / joiner
# characters used in a key below is not replaced.
emoji_dict = {
    "🚀": "bullish",          # Rocket
    "🔥": "trending",         # Fire
    "💎": "strong hands",     # Diamonds
    "🙌": "positive",         # Raising hands
    "📈": "price increase",   # Upward chart
    "🙂": "happy",            # Slightly Smiling face
    "😊": "happy",            # Smiling face
    "😁": "excited",          # Grinning face
    "👍": "thumbs up",        # Thumbs up
    "👏": "applause",         # Clapping hands
    "🤩": "amazing",          # Star-struck face
    "🎉": "celebration",      # Party popper
    "💰": "profit",           # Money bag
    "🥳": "success",          # Party face
    "😎": "cool",             # Smiling face with sunglasses
    "🏆": "achievement",      # Trophy
    "📉": "bearish",          # Downward chart
    "😢": "sad",              # Crying face
    "😡": "angry",            # Angry face
    "💔": "heartbroken",      # Broken heart
    "👎": "thumbs down",      # Thumbs down
    "😭": "very sad",         # Loudly crying face
    "🤬": "furious",          # Swearing face
    "😨": "fear",             # Fearful face
    "😤": "frustrated",       # Face with steam
    "⚠️": "warning",          # Warning sign
    "🛑": "stop",             # Stop sign
    "😱": "shocked",          # Face screaming in fear
    "🙁": "disappointed",     # Slightly frowning face
    "😞": "disheartened",     # Disappointed face
    "🤔": "thinking",         # Thinking face
    "😐": "neutral",          # Neutral face
    "🤷": "unsure",           # Person shrugging
    "🔍": "analysis",         # Magnifying glass
    "🤑": "money-focused",    # Money-mouth face
    "🏦": "bank",             # Bank building
    "✋": "pause",            # Raised hand
    "📊": "data",             # Bar chart
    "❓": "question",         # Question mark
    "❗": "important",        # Exclamation mark
    "🦍": "ape",              # Strong/bullish holder
    "🌕": "to the moon",      # Moon
    "🐻": "bearish",          # Bear
    "🐂": "bullish",          # Bull
    "🏴‍☠️": "risk",             # Pirate flag (risky move)
    "🪙": "crypto",           # Coin
    "🔒": "secure",           # Lock
    "⚡": "volatility",       # Lightning bolt
}


In [31]:
def emoji_sentiment(text, mapping=None):
    """Replace every known emoji in ``text`` with its plain-text description.

    The replacement words are separated from neighbouring characters and from
    each other by a single space, so "moon🚀🚀" becomes "moon bullish bullish"
    rather than the fused "moonbullishbullish", which the tokenizer cannot
    read as words.  Whitespace already present in the text is left untouched.

    Parameters
    ----------
    text : str or any
        Text to clean.  Anything that is not a ``str`` (NaN, None, numbers)
        is returned unchanged, so missing values stay missing.
    mapping : dict[str, str], optional
        Emoji -> replacement text.  Defaults to the notebook-level
        ``emoji_dict``.

    Returns
    -------
    The cleaned string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text
    if mapping is None:
        mapping = emoji_dict

    # Longest keys first so a multi-codepoint emoji is preferred over any
    # shorter key that happens to be a prefix of it.
    keys = sorted((e for e in mapping if e), key=len, reverse=True)
    if not keys:
        return text
    alternation = "|".join(re.escape(e) for e in keys)
    one_emoji = re.compile(alternation)
    emoji_run = re.compile(f"(?:{alternation})+")

    def _describe(match):
        # One match is a run of adjacent emoji: join their words with spaces
        # and pad the ends only where the neighbour is not already whitespace.
        words = " ".join(mapping[e] for e in one_emoji.findall(match.group(0)))
        source, start, end = match.string, match.start(), match.end()
        lead = " " if start > 0 and not source[start - 1].isspace() else ""
        trail = " " if end < len(source) and not source[end].isspace() else ""
        return f"{lead}{words}{trail}"

    return emoji_run.sub(_describe, text)

# Upper bound on the number of tokens per input handed to the tokenizer in
# predict_sentiment(); longer texts are truncated there.
# NOTE(review): 512 is presumably the model's maximum sequence length - confirm.
max_length = 512

# Function to predict sentiment
def predict_sentiment(text):
    """Classify one text as 'negative', 'neutral' or 'positive'.

    Parameters
    ----------
    text : str
        Text to score.  It is truncated to ``max_length`` tokens.

    Returns
    -------
    (sentiment, confidence) : tuple
        The label with the highest softmax probability and that probability.
        For input that is not a string (e.g. the NaN of a missing comment
        body) there is nothing to score, so ``(None, nan)`` is returned
        instead of letting the tokenizer raise.
    """
    if not isinstance(text, str):
        return None, float("nan")

    # Only one text is encoded per call, so no padding is requested: padding
    # to max_length would make every inference run over a full-length sequence
    # of mostly pad tokens without changing which tokens are attended to.
    inputs = tokenizer(
        text,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"  # Return PyTorch tensors
    )
    # Inference only - no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)
    # outputs[0] holds the logits; [0] selects the single item of the batch.
    scores = softmax(outputs[0][0].numpy())
    # Class order assumed for the model's three outputs (index 0, 1, 2).
    # NOTE(review): verify against the checkpoint's model card.
    labels = ['negative', 'neutral', 'positive']
    max_score_index = scores.argmax()
    sentiment = labels[max_score_index]
    confidence = scores[max_score_index]
    return sentiment, confidence


In [34]:
# Inspect the first loaded Reddit frame (raw columns, before any sentiment
# columns are added).
reddit_files[0]

Unnamed: 0,post_id,comment_id,comment_body,comment_score,comment_created,comment_timestamp,subreddit
0,1gthjwp,lxpebqk,"If we're in a parábola, wouldn't it stay at th...",2,2024-11-17 21:59:46,1.731902e+09,altcoin
1,1gthjwp,lxngblp,I’m seeing 82 on CMC,1,2024-11-17 15:03:35,1.731877e+09,altcoin
2,1gthjwp,lxnposm,"Sorry, what is CMC? Newbie here :)",1,2024-11-17 15:54:38,1.731880e+09,altcoin
3,1gthjwp,lxoman0,Coin market cap website,1,2024-11-17 19:02:16,1.731892e+09,altcoin
4,1gtcyw0,lxo9tsf,Check out https://coinmarketcap.com/currencies...,1,2024-11-17 17:49:42,1.731887e+09,altcoin
...,...,...,...,...,...,...,...
16662,1e2xdry,ld47vo9,seed is the secret.\n\nThey address helps if t...,1,2024-07-14 04:32:17,1.720950e+09,BitcoinBeginners
16663,1e2xdry,ld4ddnr,Seed contains private keys and all your wallet...,1,2024-07-14 05:35:46,1.720953e+09,BitcoinBeginners
16664,1e2xdry,ld5uvhm,The seed phrase (12 or 24 random words) is all...,1,2024-07-14 11:55:58,1.720976e+09,BitcoinBeginners
16665,1e2xdry,ld9odgk,The seed phrase gets the hacker full access. T...,1,2024-07-15 04:53:12,1.721037e+09,BitcoinBeginners


In [35]:
# Trial run of the sentiment pipeline on the first loaded frame only.  The
# frame is modified in place: 'text', 'sentiment', 'confidence', 'weight' and
# 'date' columns are added to it.
i = reddit_files[0]

# Frames with a 'title' column are handled as posts (title/body/score/
# num_comments/created); all others as comments (comment_* columns).
is_post_frame = 'title' in i.columns

if is_post_frame:
    # A missing body (NaN) must not turn the whole text into NaN, and title
    # and body need a separator so their last/first words are not fused.
    title = i['title'].fillna('')
    body = i['body'].fillna('')
    raw_text = title.where(body.eq(''), title + ". " + body)
else:
    raw_text = i['comment_body']

i['text'] = raw_text.apply(emoji_sentiment)

# Score each text once and split the (sentiment, confidence) pairs into two
# columns, without building a throw-away pd.Series per row.
scored = [predict_sentiment(text) for text in i['text']]
i['sentiment'] = [label for label, _ in scored]
i['confidence'] = [confidence for _, confidence in scored]

if is_post_frame:
    # Numeric weight.  The earlier `score + ". " + num_comments` added a
    # string to numeric columns; the ". " presumably belonged to the text.
    i['weight'] = i['score'] + i['num_comments']
    i['date'] = pd.to_datetime(i['created']).dt.date
else:
    i['weight'] = i['comment_score']
    i['date'] = pd.to_datetime(i['comment_created']).dt.date

KeyboardInterrupt: 

In [None]:
def add_sentiment_columns(frame):
    """Add 'text', 'sentiment', 'confidence', 'weight' and 'date' to ``frame``.

    The frame is modified in place and also returned.  Frames with a 'title'
    column are handled as posts (title/body/score/num_comments/created); all
    others as comments (comment_body/comment_score/comment_created).
    """
    is_post_frame = 'title' in frame.columns

    if is_post_frame:
        # A missing body (NaN) must not turn the whole text into NaN, and
        # title and body need a separator so their words are not fused.
        title = frame['title'].fillna('')
        body = frame['body'].fillna('')
        raw_text = title.where(body.eq(''), title + ". " + body)
    else:
        raw_text = frame['comment_body']

    frame['text'] = raw_text.apply(emoji_sentiment)

    # Score each text once and split the (sentiment, confidence) pairs into
    # two columns, without building a throw-away pd.Series per row.
    scored = [predict_sentiment(text) for text in frame['text']]
    frame['sentiment'] = [label for label, _ in scored]
    frame['confidence'] = [confidence for _, confidence in scored]

    if is_post_frame:
        # Numeric weight.  The earlier `score + ". " + num_comments` added a
        # string to numeric columns; the ". " presumably belonged to the text.
        frame['weight'] = frame['score'] + frame['num_comments']
        frame['date'] = pd.to_datetime(frame['created']).dt.date
    else:
        frame['weight'] = frame['comment_score']
        frame['date'] = pd.to_datetime(frame['comment_created']).dt.date
    return frame


# Enrich every loaded Reddit frame in place.
for reddit_frame in reddit_files:
    add_sentiment_columns(reddit_frame)

post_id


AttributeError: 'str' object has no attribute 'columns'

In [None]:
# Re-inspect the first frame to check the columns added by the processing
# cell above.
reddit_files[0]

# COMBINE PRICING AND TEXT