In [1]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt
from scipy.ndimage import gaussian_filter1d

In [3]:
# read files
# NOTE(review): read_pickle can run arbitrary code while unpickling; only load
# files produced by this project's own pipeline.
df_money = pd.read_pickle("../data/cleaned/by_id_verified_money_2020-2021.pkl")
df_tweet = pd.read_pickle("../data/cleaned/tweets_verified_2020-2021_cleaned.pkl")
df_bitcoin = pd.read_csv("../data/raw/bitcoin_value_2020-2021.csv", parse_dates=["datetime"])

# diff() subtracts the previous *row*, so the rows must be in chronological
# order for the result to be a change over time. The stable sort leaves an
# already-ordered file untouched.
df_bitcoin = df_bitcoin.sort_values("datetime", kind="stable").reset_index(drop=True)

# get differential of bitcoin value (the first row has no predecessor and stays NaN)
df_bitcoin['diff'] = df_bitcoin['close_bitcoin'].diff()

In [None]:
# merge tweets with the per-id 'money' values; tweets without a match get money = 0
df_tweet_money = pd.merge(df_tweet, df_money, how='left', on='id')
df_tweet_money['money'] = df_tweet_money['money'].fillna(0)

# sum metrics: total engagement per tweet (replies + retweets + likes)
df_tweet_money['sum_metrics'] = df_tweet_money['replies_count'] + \
                                    df_tweet_money['retweets_count'] + \
                                    df_tweet_money['likes_count']

# get weights: count how many tweets share each sum_metrics value
df_freq = df_tweet_money.groupby(['sum_metrics']).agg({'id':'count'}).reset_index()
df_freq = df_freq.rename(columns={'id': 'freq_tweets'})

freq_max = df_freq['freq_tweets'].max()
freq_min = df_freq['freq_tweets'].min()
freq_range = freq_max - freq_min

# min-max normalise the frequencies to [0, 1]; when every sum_metrics value is
# equally frequent the range is 0 and the division would yield NaN for every
# weight, so fall back to a normalised frequency of 0 (i.e. weight 1)
if freq_range > 0:
    df_freq['freq_tweets_norm'] = (df_freq['freq_tweets'] - freq_min) / freq_range
else:
    df_freq['freq_tweets_norm'] = 0.0

# rarer sum_metrics values get weights closer to 1, the most common one gets 0
df_freq['weights'] = 1 - df_freq['freq_tweets_norm']

display(df_freq)

In [None]:
# Overlay normalised frequency and the derived weight for the first 50 rows of df_freq.
fig, ax = plt.subplots(figsize=(10, 7))
first_rows = df_freq.iloc[:50]

# the legend label of each bar series is the name of the column it shows
for column in ('freq_tweets_norm', 'weights'):
    ax.bar(first_rows['sum_metrics'], first_rows[column], label=column, alpha=0.5)

ax.legend()
ax.set_xlabel("Sums of Metrics")
ax.set_ylabel("Frequency of Tweets (Normalized)")

fig.savefig("figures/sum_metr_vs_freq.svg", format='svg')
plt.show()

In [None]:
# attach a weight to every tweet by looking up its sum_metrics value in df_freq
weight_lookup = df_freq[['sum_metrics', 'weights']]
df = df_tweet_money.merge(weight_lookup, how='left', on='sum_metrics')

# scale each tweet's money value by its weight
df['weighted_money'] = df['money'].mul(df['weights'])

display(df[['money', 'sum_metrics', 'weights', 'weighted_money']].head())

In [None]:
# average weighted_money over all tweets created on the same calendar date
tweet_day = df['created_at'].dt.date
df_by_date = df.groupby(tweet_day)['weighted_money'].mean().reset_index()

display(df_by_date.head())

In [None]:
# clean data before export: the group keys are plain date objects (from .dt.date),
# so convert the column back to a datetime dtype
df_by_date["created_at"] = pd.to_datetime(df_by_date["created_at"])
# The former rename of 'created_at' -> 'datetime' was applied to df_freq, which
# has no 'created_at' column, so it silently did nothing. It is removed here;
# df_by_date keeps 'created_at' because the plotting function reads that column.

# export as pickle
df_by_date.to_pickle("../data/cleaned/by_date_verified_weighted_money_2020-2021.pkl")

In [None]:
# round-trip check: reload the exported pickle, then show its dtypes and first rows
df_test = pd.read_pickle("../data/cleaned/by_date_verified_weighted_money_2020-2021.pkl")

display(df_test.dtypes, df_test.head())

In [None]:
def _smoothed(frame, column, sigma):
    """Return (rows of `frame` where `column` is not NaN, Gaussian-smoothed values of `column`)."""
    valid = frame.dropna(subset=[column])
    values = valid[column].to_numpy(dtype=float)
    if values.size == 0:
        # nothing to smooth; hand back the empty array so plotting draws nothing
        return valid, values
    return valid, gaussian_filter1d(values, sigma)


def plot_sent_vs_bit(from_date, to_date, sigma, save_to=None):
    """Plot smoothed weighted money against the smoothed Bitcoin value differential.

    Reads the notebook-level frames ``df_by_date`` (columns ``created_at`` and
    ``weighted_money``) and ``df_bitcoin`` (columns ``datetime`` and ``diff``).
    Each series is Gaussian-smoothed over its full length and drawn against
    time on its own y-axis (left: weighted money, right: Bitcoin differential).
    The date arguments only set the visible x-range; they do not subset the
    data before smoothing.

    Parameters
    ----------
    from_date, to_date
        Left and right x-axis limits; each is passed to ``pd.Timestamp``.
    sigma
        Standard deviation of the Gaussian kernel, passed to
        ``gaussian_filter1d`` (measured in rows of the series).
    save_to : optional
        If not None, the figure is also saved to this path in SVG format.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # diff() leaves NaN in the first row of df_bitcoin['diff'], and the Gaussian
    # filter turns every output sample whose kernel window touches a NaN into
    # NaN, which blanks the start of the curve. Drop NaN rows before smoothing.
    money, money_smooth = _smoothed(df_by_date, 'weighted_money', sigma)
    bitcoin, bitcoin_smooth = _smoothed(df_bitcoin, 'diff', sigma)

    fig, ax1 = plt.subplots(figsize=(10, 7))

    ax1.set_xlabel("Time (Month)")
    lns1 = ax1.plot(money['created_at'],
                    money_smooth,
                    color='blue',
                    label='Weighted Money')
    ax1.set_ylabel("Weighted Money")

    # second y-axis sharing the same x-axis, since the two series have different scales
    ax2 = ax1.twinx()
    lns2 = ax2.plot(bitcoin['datetime'],
                    bitcoin_smooth,
                    color='orange',
                    label='Diff. Bitcoin')
    ax2.set_ylabel("Diff. Bitcoin Value (Dollar)")

    # build one combined legend from the lines of both axes
    lns = lns1 + lns2
    labs = [line.get_label() for line in lns]
    ax1.legend(lns, labs, loc=3)
    ax1.set_xlim(pd.Timestamp(from_date), pd.Timestamp(to_date))

    if save_to is not None:
        # save through the figure object rather than pyplot's "current figure"
        fig.savefig(save_to, format='svg')
    plt.show()

In [None]:
# weighted money vs. diff. bitcoin value, x-axis from 2020-01-01 to 2021-12-31
plot_sent_vs_bit(from_date='2020-01-01', to_date='2021-12-31', sigma=5)

In [None]:
# same plot, x-axis restricted to 2021 (2021-01-01 to 2021-12-31)
plot_sent_vs_bit(from_date='2021-01-01', to_date='2021-12-31', sigma=5)