# Import the libraries

In [2]:
import numpy as np
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
import pandas as pd
from scipy import integrate
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import logging



# Load price data, tweets and tweets metadata

In [3]:
# Load the price data from a local CSV file (the path is relative to the
# notebook's working directory).

prices_df = pd.read_csv("Downloads/pricesfeb-apr15_usdt.csv", low_memory=True)

In [4]:
# Load the tweets and their metadata from local CSV files (paths are relative
# to the notebook's working directory).

#prices_df = pd.read_csv("Downloads/pricesfeb-apr15_usdt.csv", low_memory=True)

# Metadata file; tweets_with_metadata() below reads its id, fetched_at and
# like/reply/retweet/quote count columns.
tweets_metadata_df = pd.read_csv("Downloads/tweets_metadata10052021.csv")

# Tweets file; the aggregation helpers below read its created_at and score columns.
tweets_df = pd.read_csv("Downloads/tweets_filled_gaps.csv", low_memory=True)

# Functions for computing vader score

In [14]:
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import logging

# Root logger (getLogger() called without a name). It is not referenced by
# any of the code visible in this notebook.
logger = logging.getLogger()
# Shared analyzer instance; stays None until vader_sentiment_score() builds it
# on first use via init_sent_model().
sent_analyzer = None


# Text Cleaning Methods
# Source: https://medium.com/@datamonsters/text-preprocessing-in-python-steps-tools-and-examples-bf025f872908
# To-Lower-Case-method not suitable --> Vader needs cap letters for score balancing
# Remove-Punctuation-method not suitable --> Vader needs punctuation for score balancing
# Tokenization etc. not suitable --> Vader needs sentence structure for score balancing

def replace_url(text):
    """Replace every URL in ``text`` with the token "url".

    A URL is any whitespace-delimited run that starts with ``www.``,
    ``http://`` or ``https://``. The leading ``#`` of every hashtag is also
    stripped here (same substitution as remove_hashtag_in_front_of_word).

    Args:
        text: The message to clean (str).

    Returns:
        The cleaned string.
    """
    # Raw string literal: the previous non-raw pattern relied on the invalid
    # escape sequences "\." and "\s", which newer Python versions warn about.
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'url', text)
    text = re.sub(r'#([^\s]+)', r'\1', text)
    return text


def replace_at_user(text):
    """Replace every "@user" mention in ``text`` with the token "atUser".

    A mention is "@" followed by one or more non-whitespace characters, so
    punctuation glued to the handle is replaced along with it.

    Args:
        text: The message to clean (str).

    Returns:
        The cleaned string.
    """
    # Raw string literal: the previous non-raw pattern relied on the invalid
    # escape sequence "\s".
    text = re.sub(r'@[^\s]+', 'atUser', text)
    return text


def remove_unicode(text):
    """Strip unicode content from a tweet.

    Two passes: first drop literal backslash-u escape sequences (a backslash,
    "u" and the hex digits that follow), then drop every character outside
    the 7-bit ASCII range.
    """
    without_escapes = re.sub(r'(\\u[0-9A-Fa-f]+)', r'', text)
    ascii_only = re.sub(r'[^\x00-\x7f]', r'', without_escapes)
    return ascii_only


def remove_hashtag_in_front_of_word(text):
    """Drop the "#" sign in front of each hashtag, keeping the word itself."""
    # "#word" -> "word"; the captured group is everything up to the next whitespace.
    return re.sub(r'#([^\s]+)', r'\1', text)


def leave_hash_tag_only(text):
    """Return the hashtags found in a tweet.

    The result is a list of tag names without the leading "#", in order of
    appearance (not a string, unlike the other cleaning helpers).
    """
    hashtags = re.findall(r"#(\w+)", text)
    return hashtags


def remove_numbers(text):
    """Return ``text`` with every digit character (per ``str.isdigit``) removed."""
    kept_chars = (ch for ch in text if not ch.isdigit())
    return ''.join(kept_chars)


def custom_stop_words(text):
    """Remove the custom stop-words "url" and "nan" and flatten newlines.

    The input stays a string (no tokenisation). Compared with the previous
    implementation:

    * "url" and "nan" are removed only as whole words. Plain substring
      removal used to mangle ordinary words ("Binance" -> "Bice",
      "finance" -> "fice").
    * Newlines are replaced by a space instead of being deleted, so the words
      on either side of a line break are no longer glued together.

    Args:
        text: The message to clean (str).

    Returns:
        The cleaned string.
    """
    text = re.sub(r'\burl\b', r'', text)
    text = re.sub(r'\bnan\b', r'', text)
    text = re.sub(r'\n', r' ', text)
    return text


def init_sent_model():
    """Build a VADER analyzer whose lexicon is extended with the custom terms.

    The extra terms come from get_vader_dict(); entries with the same key as
    an existing lexicon entry overwrite it (plain ``dict.update``).
    """
    lexicon_overrides = get_vader_dict()
    model = SentimentIntensityAnalyzer()
    model.lexicon.update(lexicon_overrides)
    return model


def get_vader_dict():
    """Read the custom lexicon from "vader_config.txt" in the working directory.

    Each non-blank line has the form ``term:score``. The line is split at the
    LAST colon, so terms that themselves contain a colon (e.g. the emoticon
    ":)") are supported. Blank lines are skipped and whitespace around the
    term is stripped.

    Returns:
        dict mapping each term (str) to its score (float).

    Raises:
        ValueError: if a non-blank line has no colon or its score is not a number.
        OSError: if the file cannot be opened.
    """
    crypto_words = {}
    with open("vader_config.txt", "r", encoding="utf-8") as f:
        for line_number, line in enumerate(f, start=1):
            entry = line.strip()
            if not entry:
                # Tolerate blank lines (typically a trailing newline at end of file).
                continue
            key, separator, val = entry.rpartition(':')
            if not separator:
                raise ValueError(
                    "vader_config.txt line %d is not of the form 'term:score': %r"
                    % (line_number, entry)
                )
            crypto_words[key.strip()] = float(val)

    return crypto_words


# VADER sentiment analysis
def vader_sentiment_score(text):
    """Score ``text`` with VADER.

    VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social
    Media Text. Source: https://github.com/cjhutto/vaderSentiment

    The module-level analyzer is created on the first call and reused afterwards.

    Returns:
        A tuple ``(detail_score, score)``: the full dict returned by
        ``polarity_scores`` and its "compound" entry.
    """
    global sent_analyzer
    if sent_analyzer is None:
        sent_analyzer = init_sent_model()
    polarity = sent_analyzer.polarity_scores(text)
    return polarity, polarity["compound"]


def calc_sentiment(text):
    """Clean a tweet / message and return its VADER compound score.

    remove_unicode is deliberately not part of the pipeline so that emoticons
    are kept for scoring.
    """
    # Cleaning steps, applied in this exact order.
    cleaning_steps = (
        replace_url,
        replace_at_user,
        remove_hashtag_in_front_of_word,
        remove_numbers,
        custom_stop_words,
    )
    for clean in cleaning_steps:
        text = clean(text)

    # Only the compound score is returned; the detailed breakdown is discarded.
    _, compound = vader_sentiment_score(text)
    return compound





# Calculate the different sentiment score functions

In [187]:
# function that sums the score column of tweets_df per hour (using only the vader score)

def aggr_score(tweets_df, year=2021):
    """Sum the ``score`` column per hour.

    Timestamps in ``created_at`` are parsed as UTC and made timezone-naive;
    only rows whose timestamp falls in ``year`` are kept.

    Unlike the previous implementation, the caller's frame is NOT modified
    (``created_at`` used to be overwritten in place).

    Args:
        tweets_df: DataFrame with at least the columns ``created_at`` and ``score``.
        year: Calendar year to keep (default 2021, the previously hard-coded value).

    Returns:
        Series named ``score`` indexed by the start of each hour (index name
        ``created_at``). Hours without tweets inside the covered range sum to 0.
    """
    # tz_localize(None) drops the UTC offset directly; the old code converted
    # to str and sliced off the trailing "+00:00".
    created_at = pd.to_datetime(tweets_df["created_at"], utc=True).dt.tz_localize(None)
    scored = pd.DataFrame({"created_at": created_at, "score": tweets_df["score"]})
    scored = scored[scored["created_at"].dt.year == year]
    # An explicit Hour offset avoids the deprecated 'H' frequency alias.
    hourly = pd.Grouper(key="created_at", freq=pd.offsets.Hour())
    return scored.groupby(hourly).score.sum()

# function that sums only the negative values of the score column of tweets_df per hour (using only the vader score)

def aggr_neg_score(tweets_df, year=2021):
    """Sum only the negative ``score`` values per hour.

    Timestamps in ``created_at`` are parsed as UTC and made timezone-naive;
    only rows in ``year`` with ``score < 0`` contribute.

    Unlike the previous implementation, the caller's frame is NOT modified
    (``created_at`` used to be overwritten in place).

    Args:
        tweets_df: DataFrame with at least the columns ``created_at`` and ``score``.
        year: Calendar year to keep (default 2021, the previously hard-coded value).

    Returns:
        Series named ``score`` indexed by the start of each hour (index name
        ``created_at``). Hours without negative tweets inside the covered
        range sum to 0.
    """
    # tz_localize(None) drops the UTC offset directly; the old code converted
    # to str and sliced off the trailing "+00:00".
    created_at = pd.to_datetime(tweets_df["created_at"], utc=True).dt.tz_localize(None)
    scored = pd.DataFrame({"created_at": created_at, "score": tweets_df["score"]})
    negative = scored[(scored["created_at"].dt.year == year) & (scored["score"] < 0)]
    # An explicit Hour offset avoids the deprecated 'H' frequency alias.
    hourly = pd.Grouper(key="created_at", freq=pd.offsets.Hour())
    return negative.groupby(hourly).score.sum()

# function that joins each tweet with its most recently fetched metadata, producing a single dataframe with both the tweet text and the metadata

def tweets_with_metadata(tweets_df, tweets_metadata_df):
    """Attach engagement counts to each tweet.

    For every tweet ``id`` only the metadata row with the latest
    ``fetched_at`` is used. The join is a left join, so tweets without
    metadata are kept and get NaN counts.

    Returns:
        ``tweets_df`` plus the columns like_count, reply_count, retweet_count
        and quote_count.
    """
    engagement_columns = ["id", "like_count", "reply_count", "retweet_count", "quote_count"]
    latest_snapshot = (
        tweets_metadata_df
        .sort_values('fetched_at')
        .groupby('id')
        .tail(1)
    )
    return tweets_df.merge(latest_snapshot[engagement_columns], on='id', how="left")



#function that includes likes, retweets and quotes in the calculation of the sentiment score
#that adds in the new column score2

def sent_score_metadata(tweets_df_merge):
    """Weight each tweet's score by its engagement and store it as ``score2``.

    For rows where at least one of like_count / retweet_count / quote_count is
    present, ``score2 = score * (likes + retweets + quotes)``; missing counts
    are treated as 0. Rows without any metadata keep ``score2 = score``.

    The previous implementation used the same "at least one present" test but
    summed the raw counts, so a row with only some counts present became NaN.

    Side effect: the ``score2`` column is added to (or overwritten on) the
    given frame, as before.

    NOTE(review): a tweet that has metadata but zero likes, retweets and
    quotes gets score2 == 0 - confirm this is intended.

    Args:
        tweets_df_merge: DataFrame with the columns score, like_count,
            retweet_count and quote_count.

    Returns:
        The ``score2`` column as a Series.
    """
    counts = tweets_df_merge[["like_count", "retweet_count", "quote_count"]]
    has_metadata = counts.notna().any(axis=1)
    engagement = counts.fillna(0).sum(axis=1)
    tweets_df_merge["score2"] = np.where(
        has_metadata,
        tweets_df_merge.score * engagement,
        tweets_df_merge.score,
    )
    return tweets_df_merge["score2"]


# function that finds the sum of the sent_score_metadata function per hour
def aggr_score_metadata(tweets_df_merge, year=2021):
    """Sum the engagement-weighted score (``score2``) per hour.

    ``score2`` is (re)computed with sent_score_metadata() and written to the
    given frame, as before. Timestamps in ``created_at`` are parsed as UTC and
    made timezone-naive; only rows in ``year`` are kept.

    Unlike the previous implementation, the ``created_at`` column of the
    caller's frame is no longer overwritten.

    Args:
        tweets_df_merge: Output of tweets_with_metadata(); needs the columns
            created_at and score plus the like/retweet/quote counts.
        year: Calendar year to keep (default 2021, the previously hard-coded value).

    Returns:
        Series named ``score2`` indexed by the start of each hour (index name
        ``created_at``).
    """
    tweets_df_merge["score2"] = sent_score_metadata(tweets_df_merge)
    # tz_localize(None) drops the UTC offset directly; the old code converted
    # to str and sliced off the trailing "+00:00".
    created_at = pd.to_datetime(tweets_df_merge["created_at"], utc=True).dt.tz_localize(None)
    weighted = pd.DataFrame({"created_at": created_at, "score2": tweets_df_merge["score2"]})
    weighted = weighted[weighted["created_at"].dt.year == year]
    # An explicit Hour offset avoids the deprecated 'H' frequency alias.
    hourly = pd.Grouper(key="created_at", freq=pd.offsets.Hour())
    return weighted.groupby(hourly).score2.sum()

# function that uses tweets_with_metadata, sent_score_metadata, aggr_score_metadata functions
def final_sentiment_score_with_metadata(tweets_df, tweets_metadata_df):
    """Merge tweets with their metadata and return the hourly weighted score.

    Chains tweets_with_metadata(), sent_score_metadata() and
    aggr_score_metadata().

    Note: a later cell of this notebook defines a function with the same name
    and two extra parameters; once that cell has run it replaces this one.

    Returns:
        The hourly ``score2`` sums produced by aggr_score_metadata().
    """
    tweets_df_merge = tweets_with_metadata(tweets_df, tweets_metadata_df)
    # The column key must be the string "score2"; the bare name score2 used
    # previously is undefined in this scope and raised NameError.
    tweets_df_merge["score2"] = sent_score_metadata(tweets_df_merge)
    h8 = aggr_score_metadata(tweets_df_merge)
    return h8
    
# function that changes the name of created_at(datetime) column of tweets_df to E, so we are compatible with price data
# and resets index
def process_score(x1, score):
    """Turn an hourly score series into a frame compatible with the price data.

    The index is moved into a column and ``created_at`` is renamed to ``E``
    (the timestamp column name used by the price data). The result has a
    default integer index.

    The previous implementation also built an unused, ``E``-indexed copy; its
    only effect was to raise KeyError when a column was missing. That check is
    now explicit.

    Args:
        x1: Series/DataFrame indexed by ``created_at``.
        score: Name of the score column expected in the result.

    Returns:
        DataFrame with ``E`` and the score as ordinary columns.

    Raises:
        KeyError: if ``E`` or ``score`` is not a column after the reset.
    """
    x1 = x1.reset_index().rename(columns={'created_at': 'E'})
    missing = [name for name in ("E", score) if name not in x1.columns]
    if missing:
        raise KeyError("%s not in columns" % missing)
    return x1
    

In [195]:
#h = aggr_score(tweets_df)
#h2 = aggr_neg_score(tweets_df)
#
#h8 = aggr_score_metadata(tweets_df_merge)
#h_new = process_score(h8)

# calculate the sentiment score in case that we don't use any metadata
def sentiment_score_calc(function, tweets_df, score):
    """Compute the hourly sentiment score without metadata and make it stationary.

    Note: a later cell of this notebook defines a function with the same name
    and an extra ``lag_num`` parameter; once that cell has run it replaces
    this one.

    Args:
        function: Aggregation callable applied to ``tweets_df``
            (e.g. aggr_score or aggr_neg_score).
        tweets_df: Tweets frame passed to ``function``.
        score: Name of the score column produced by ``function``.

    Returns:
        The processed frame. time_series_stationary() modifies it in place
        when differencing is needed.
    """
    h = function(tweets_df)
    h_new = process_score(h, score)
    # Use the caller-supplied column name; "score" used to be hard-coded here
    # even though process_score() already used the argument.
    time_series_stationary(h_new, score)
    return h_new

#def final_sentiment_score_with_metadata(tweets_df, tweets_metadata_df):
#    tweets_df_merge =tweets_with_metadata(tweets_df, tweets_metadata_df)
#    tweets_df_merge["score"] =sent_score_metadata(tweets_df_merge)
#    h8 = aggr_score_metadata(tweets_df_merge)
#    h_new = process_score(h8)
#    return h_new

In [32]:
#h = sentiment_score_calc(aggr_neg_score, tweets_df)

In [33]:
#h2 = final_sentiment_score_with_metadata(tweets_df, tweets_metadata_df)

# Working with Price Data

In [222]:
# this function selects the cryptocurrencies in the given list, cleans the price data (e.g. removes data from years other than 2021), and creates the columns for the first, min, max and last price per hour
def process_prices(prices_df, coins, year=2021):
    """Aggregate raw price ticks to hourly first/min/max/last prices per coin.

    Unlike the previous implementation, the caller's frame is NOT modified
    (its ``E`` column used to be overwritten in place), and the discarded
    ``set_index`` call, which had no effect, is gone.

    Args:
        prices_df: DataFrame with the columns ``E`` (parsed with unit='ms'),
            ``s`` (symbol) and ``c`` (price).
        coins: Symbols to keep.
        year: Calendar year to keep (default 2021, the previously hard-coded value).

    Returns:
        DataFrame with the columns E (start of hour), s, c_first, c_min,
        c_max and c_last, one row per observed (hour, symbol) pair.
    """
    # Filter first so that only the selected rows are copied and converted.
    selected = prices_df.loc[prices_df["s"].isin(coins), ["E", "s", "c"]].copy()
    selected["E"] = pd.to_datetime(selected["E"], unit='ms')
    selected = selected[selected["E"].dt.year == year]

    # An explicit Hour offset avoids the deprecated 'H' frequency alias.
    hourly_keys = [pd.Grouper(key='E', freq=pd.offsets.Hour()), "s"]
    y = selected.groupby(hourly_keys)["c"].agg(['first', 'min', 'max', 'last'])

    # rename columns
    y.columns = ['c_first', 'c_min', 'c_max', 'c_last']

    # reset index to get grouped columns back
    return y.reset_index()

# this function select a specific coin
def select_coin(y, coin_name):
    """Return the hourly closing price of a single coin.

    Args:
        y: Frame with the columns ``E``, ``s`` and ``c_last``.
        coin_name: Symbol to select from column ``s``.

    Returns:
        DataFrame indexed by ``E`` with the single column ``c_last``.
    """
    is_selected = y["s"] == coin_name
    closing_prices = y[is_selected][["E", "c_last"]]
    return closing_prices.set_index("E")

# check the stationarity of the cryptocurrency time series

def adafuller(time_series):
    """Run the augmented Dickey-Fuller test and return its p-value.

    The p-value is also printed. Callers in this notebook treat a p-value
    below 0.05 as "stationary".
    """
    p_value = adfuller(time_series)[1]
    print('p-value: %f' % p_value)
    return p_value



# Fractional differencing

In [None]:
# function for finding the weights
    # return the weights from the series expansion of the differencing operator
    # for real orders d and up to lags coefficients
def getWeights(d,lags):
    """Return the first ``lags`` coefficients of the differencing operator of order ``d``.

    The coefficients follow the recurrence w_0 = 1,
    w_k = -w_{k-1} * (d - k + 1) / k.

    Returns:
        Column vector (numpy array of shape (n, 1)); n is ``lags``, but never
        fewer than one row because w_0 is always included.
    """
    coeffs = [1]
    k = 1
    while k < lags:
        coeffs.append(-coeffs[-1]*((d-k+1))/k)
        k += 1
    return np.array(coeffs).reshape(-1,1)
def plotWeights(dRange, lags, numberPlots):
    """Plot the lag coefficients for several orders of differencing.

    Args:
        dRange: Pair (lowest order, highest order).
        lags: Number of coefficients computed per order.
        numberPlots: Number of evenly spaced orders taken from ``dRange``.

    Returns:
        None; the figure is shown with ``plt.show()``.
    """
    # `plt` was used here without ever being imported in this notebook, so the
    # function raised NameError. Imported locally to keep the fix self-contained.
    import matplotlib.pyplot as plt

    weights = pd.DataFrame(np.zeros((lags, numberPlots)))
    interval = np.linspace(dRange[0], dRange[1], numberPlots)
    for i, diff_order in enumerate(interval):
        # getWeights returns a column vector; flatten it to fill one column.
        weights[i] = getWeights(diff_order, lags).ravel()
    weights.columns = [round(x, 2) for x in interval]
    ax = weights.plot(figsize=(15, 6))
    ax.legend(title='Order of differencing')
    ax.set_title('Lag coefficients for various orders of differencing')
    ax.set_xlabel('lag coefficients')
    plt.show()
    
# return the time series resulting from (fractional) differencing
    # for real orders order up to lag_cutoff coefficients
        
def ts_differencing(series, order, lag_cutoff):
    """Return ``series`` (fractionally) differenced with order ``order``.

    The result is the weighted sum of the series shifted by 0..lag_cutoff-1
    steps, with missing values of each shifted copy filled with 0. The first
    ``lag_cutoff`` rows of the result are dropped.
    """
    coeffs = getWeights(order, lag_cutoff)
    weighted_lags = (
        coeffs[lag] * series.shift(lag).fillna(0)
        for lag in range(lag_cutoff)
    )
    differenced = sum(weighted_lags)
    return differenced[lag_cutoff:]

def cutoff_find(order,cutoff,start_lags):
    """Find the lag count at which the differencing weights become negligible.

    Starting from ``start_lags``, the lag count is increased until the last
    weight of the expansion is at most ``cutoff`` in absolute value. As in the
    previous implementation, the returned value is one more than the lag count
    at which that happened.

    The weights follow the same recurrence as getWeights()
    (w_0 = 1, w_k = -w_{k-1} * (order - k + 1) / k). The last weight is now
    advanced one step per iteration; the whole weight vector used to be
    rebuilt on every iteration, which made the search quadratic.

    Args:
        order: Differencing order d.
        cutoff: Threshold on the absolute weight (1e-3 in this notebook).
        start_lags: Lag count at which the search starts; a larger value
            skips the early iterations.

    Returns:
        int lag count.
    """
    lags = start_lags
    # Last weight of an expansion with `lags` coefficients, i.e. w_{lags-1}.
    weight = 1.0
    for k in range(1, lags):
        weight = -weight * (order - k + 1) / k
    while abs(weight) > cutoff:
        if lags >= 1:
            # Advance from w_{lags-1} to w_{lags}.
            weight = -weight * (order - lags + 1) / lags
        # For lags < 1 the expansion still consists of w_0 only.
        lags += 1
    return lags + 1

def ts_differencing_tau(series, order, tau):
    """Return ``series`` (fractionally) differenced, with the lag count derived from ``tau``.

    The number of lags comes from cutoff_find(order, tau, 1). The series is
    then combined exactly as in ts_differencing(): a weighted sum of shifted,
    zero-filled copies, with the first ``lag count`` rows dropped.
    """
    n_lags = cutoff_find(order, tau, 1)  # lag cutoff implied by tau
    coeffs = getWeights(order, n_lags)
    differenced = 0
    lag = 0
    while lag < n_lags:
        differenced = differenced + coeffs[lag] * series.shift(lag).fillna(0)
        lag += 1
    return differenced[n_lags:]

# function that finds the order d of the fractional differencing

def find_fractional_d(y1b):
    """Return the smallest order d in 0.01..0.99 that makes ``y1b`` stationary.

    Each candidate is tried in increasing order: the series is differenced
    with ts_differencing_tau (tau = 1e-3) and tested with adfuller. The first
    d with a p-value <= 0.05 is printed together with that p-value and
    returned. If no candidate passes, the last candidate (0.99) is returned.

    This can be slow (the original author measured about 20 minutes).
    """
    tau = 1e-3
    candidates = np.arange(1, 100) / 100
    chosen = None
    for d in candidates:
        chosen = d
        p_value = adfuller(ts_differencing_tau(y1b, d, tau))[1]
        if p_value <= 0.05:
            print(d, p_value)
            break

    return chosen

#here we get 1e-3 as default, make the time series stationary
def time_series_stationary(time_series,col):
    """Make column ``col`` of ``time_series`` stationary if it is not already.

    When the ADF p-value of the column is >= 0.05, the column is replaced in
    place by its fractionally differenced version (tau = 1e-3, order from
    find_fractional_d). The differenced series is shorter than the original.
    Otherwise the frame is left untouched.

    Returns:
        The same ``time_series`` object.
    """
    needs_differencing = adafuller(time_series[col]) >= 0.05
    if needs_differencing:
        d = find_fractional_d(time_series[col])
        differenced = ts_differencing_tau(time_series[col], d, 1e-3)
        time_series[col] = differenced
    return time_series



# Apply linear regression on random samples for evaluating the sentiment score relevance to the prices data

In [None]:
# function that finds the lagged values for the sentiment score
def create_lagged_values_df(time_series, lag_num, score):
    """Add lagged copies of the ``score`` column to ``time_series`` in place.

    Columns score_lag1 .. score_lag{lag_num - 1} are added, so ``lag_num - 1``
    lags are created. The new column names always start with "score_lag",
    whatever the name of the source column.

    Returns:
        The same ``time_series`` object.
    """
    lag = 1
    while lag < lag_num:
        time_series["score_lag" + str(lag)] = time_series[score].shift(lag)
        lag += 1
    return time_series


# this function get a random sample from our prices and sentiment time series

def get_random_sample(prices_ts, fract, sentiment_ts, random_state=None):
    """Draw a random sample of the prices and join it with the sentiment series.

    Args:
        prices_ts: Price frame carrying the join key ``E``.
        fract: Fraction of price rows to sample (passed to ``DataFrame.sample``).
        sentiment_ts: Sentiment frame carrying the join key ``E``.
        random_state: Optional seed passed to ``DataFrame.sample`` so the
            sample, and everything computed from it, can be reproduced. The
            default None keeps the previous unseeded behaviour.

    Returns:
        Inner join of the sampled prices with ``sentiment_ts`` on ``E``, with
        rows containing NaN dropped.
    """
    # sample() already returns a new frame, so the input is not modified.
    prices_ts_sample = prices_ts.sample(frac=fract, random_state=random_state)
    price_merge_sent = pd.merge(prices_ts_sample, sentiment_ts, on='E', how="inner")
    return price_merge_sent.dropna()


# Split the data in train and test and perform normalisation in our data
def split_the_data(price_merge_sent, nobs):
    """Split into train/test sets and scale the features to the range (-1, 1).

    The first column of ``price_merge_sent`` is dropped before scaling. The
    last ``nobs`` rows form the test set and everything before them the
    training set.

    The scaler is fitted on the training set only and then applied to the
    test set. The previous implementation re-fitted it on the test set, which
    scaled the two sets inconsistently and used test-set statistics. Scaled
    test values may therefore fall outside (-1, 1).

    Args:
        price_merge_sent: Merged price/sentiment frame.
        nobs: Number of trailing rows used as the test set.

    Returns:
        Tuple ``(train, test)`` of scaled DataFrames with default integer indexes.
    """
    scaler = MinMaxScaler(feature_range=(-1, 1))
    features = price_merge_sent.iloc[:, 1:]
    train_raw, test_raw = features[0:-nobs], features[-nobs:]

    price_merge_sent_train = pd.DataFrame(
        scaler.fit_transform(train_raw), columns=train_raw.columns
    )
    # transform(), not fit_transform(): reuse the ranges learnt on the training set.
    price_merge_sent_test = pd.DataFrame(
        scaler.transform(test_raw), columns=test_raw.columns
    )

    return price_merge_sent_train, price_merge_sent_test



def linear_regression(price_merge_sent_train, price_merge_sent_test):
    """Fit a scikit-learn linear regression on the train set and score it on the test set.

    The target is the column at position 1 and the regressors are the columns
    from position 2 onwards.

    NOTE(review): with the frames built in this notebook, position 1 appears
    to be the sentiment score rather than the price - confirm the intended
    target.

    The coefficients, mean squared error and R^2 are printed. The previous
    implementation returned the metric *functions* (``mean_squared_error``,
    ``r2_score``) instead of the computed values, and printed the entire
    training frame.

    Returns:
        Tuple ``(predictions, mse, r2)``: the test-set predictions, the mean
        squared error and the coefficient of determination.
    """
    regr = linear_model.LinearRegression()

    X_train = price_merge_sent_train.iloc[:, 2:]
    y_train = price_merge_sent_train.iloc[:, 1]
    X_test = price_merge_sent_test.iloc[:, 2:]
    y_test = price_merge_sent_test.iloc[:, 1]

    # Train the model using the training set
    regr.fit(X_train, y_train)

    # Make predictions using the testing set
    prices_y_pred = regr.predict(X_test)
    mse = mean_squared_error(y_test, prices_y_pred)
    r2 = r2_score(y_test, prices_y_pred)

    # The coefficients
    print('Coefficients: \n', regr.coef_)
    # The mean squared error
    print('Mean squared error: %.2f' % mse)
    # The coefficient of determination: 1 is perfect prediction
    print('Coefficient of determination: %.2f' % r2)
    return prices_y_pred, mse, r2


# this function performs linear regression on the random sampled price data
def linear_regression1(price_merge_sent_train, price_merge_sent_test):
    """Fit a statsmodels OLS model on the training set and return its summary.

    The target is the column at position 1 and the regressors are the columns
    from position 2 onwards. No constant term is added.

    NOTE(review): the summaries printed in this notebook list the sentiment
    score as the dependent variable, not the price - confirm the intended
    target.
    """
    target = price_merge_sent_train.iloc[:, 1]
    regressors = price_merge_sent_train.iloc[:, 2:]
    fitted = sm.OLS(endog=target, exog=regressors).fit()
    # Test-set predictions are computed but not used or returned.
    fitted.predict(price_merge_sent_test.iloc[:, 2:])
    return fitted.summary()

In [173]:
# function that does some preprocessing on price data like cleaning the specific coin and make the time series stationary

def preprocess_coin(prices_df, coins, coin_name, col):
    """Prepare the stationary hourly price series of one coin.

    Steps: aggregate the raw prices per hour (process_prices), keep
    ``coin_name`` only (select_coin), then make column ``col`` stationary
    (time_series_stationary).
    """
    hourly_prices = process_prices(prices_df, coins)
    coin_prices = select_coin(hourly_prices, coin_name)
    return time_series_stationary(coin_prices, col)


In [None]:
#y1_diff = preprocess_coin(prices_df, ["BTCUSDT", "ETHUSDT", "ADAUSDT", "BNBUSDT", "USDTUSDT", "DOTUSDT", "XRPUSDT", "LTCUSDT", "LINKUSDT", "BCHUSDT", "XLMUSDT"], "BTCUSDT", "c_last")

In [223]:
# calculate the sentiment score without metadata
def sentiment_score_calc(function, tweets_df, lag_num, score):
    """Compute the hourly sentiment score (no metadata) with its lagged columns.

    This definition replaces the three-argument sentiment_score_calc defined
    in an earlier cell.

    Args:
        function: Aggregation callable applied to ``tweets_df``
            (e.g. aggr_score or aggr_neg_score).
        tweets_df: Tweets frame passed to ``function``.
        lag_num: Passed to create_lagged_values_df().
        score: Name of the score column produced by ``function``.

    Returns:
        Frame with ``E``, the (stationary) score and its lagged columns.
    """
    h = function(tweets_df)
    h_new = process_score(h, score)
    # Use the caller-supplied column name; "score" used to be hard-coded here
    # although every other step already used the argument.
    x1b_diff = time_series_stationary(h_new, score)
    x1b_with_lags = create_lagged_values_df(x1b_diff, lag_num, score)
    return x1b_with_lags


# calculate the sentiment score with metadata
def final_sentiment_score_with_metadata(tweets_df, tweets_metadata_df, lag_num, score2):
    """Compute the hourly engagement-weighted sentiment score with its lagged columns.

    This definition replaces the two-argument
    final_sentiment_score_with_metadata defined in an earlier cell.

    Args:
        tweets_df: Tweets frame.
        tweets_metadata_df: Metadata frame joined on ``id``.
        lag_num: Passed to create_lagged_values_df().
        score2: Name of the weighted score column. aggr_score_metadata()
            always names it "score2", so that is the only value that works.

    Returns:
        Frame with ``E``, the (stationary) weighted score and its lagged columns.
    """
    tweets_df_merge = tweets_with_metadata(tweets_df, tweets_metadata_df)
    # aggr_score_metadata() computes the score2 column itself, so the separate
    # sent_score_metadata() call made here previously was redundant.
    h8 = aggr_score_metadata(tweets_df_merge)
    h_new = process_score(h8, score2)
    # Use the argument rather than a hard-coded "score2", consistent with the
    # calls before and after this one.
    x1b_diff = time_series_stationary(h_new, score2)
    x1b_with_lags = create_lagged_values_df(x1b_diff, lag_num, score2)
    return x1b_with_lags
    


In [53]:
#x1b_with_lags = sentiment_score_calc(aggr_score, tweets_df, 3)
#x1b_with_lags2 = final_sentiment_score_with_metadata(tweets_df, tweets_metadata_df,3)

p-value: 0.024918
p-value: 0.000000


In [224]:
# the function for evaluating with linear regression

def evaluation_sentiment(prices_ts, sentiment_ts, fract, nobs):
    """Evaluate the sentiment series with an OLS regression on a random sample.

    Steps: sample and merge (get_random_sample), split and scale
    (split_the_data), then fit (linear_regression1).

    Returns:
        The OLS summary returned by linear_regression1().
    """
    merged = get_random_sample(prices_ts, fract, sentiment_ts)
    train, test = split_the_data(merged, nobs)
    return linear_regression1(train, test)

In [None]:
#evaluation_sentiment(y1_diff, x1b_with_lags, 0.6, 100)

In [226]:
# the function that runs the full pipeline with sentiment score without metadata
def sent_price_eval1(prices_df,coins, coin_name, col, function, tweets_df, lag_num, fract, nobs, score):
    """Run the full pipeline using the sentiment score without metadata.

    Prepares the coin's price series, builds the lagged sentiment features
    with ``function`` and returns the regression summary.
    """
    price_series = preprocess_coin(prices_df, coins, coin_name, col)
    sentiment_features = sentiment_score_calc(function, tweets_df, lag_num, score)
    return evaluation_sentiment(price_series, sentiment_features, fract, nobs)

In [227]:
# the function that runs the full pipeline with sentiment score with metadata
def sent_price_eval2(prices_df,coins, coin_name, col, tweets_df,tweets_metadata_df, lag_num, fract, nobs, score2):
    """Run the full pipeline using the engagement-weighted sentiment score.

    Prepares the coin's price series, builds the lagged weighted-sentiment
    features from the tweets and their metadata and returns the regression
    summary.
    """
    price_series = preprocess_coin(prices_df, coins, coin_name, col)
    sentiment_features = final_sentiment_score_with_metadata(
        tweets_df, tweets_metadata_df, lag_num, score2
    )
    return evaluation_sentiment(price_series, sentiment_features, fract, nobs)

In [230]:
# Call sent_price_eval1: full pipeline for XLMUSDT using the hourly sum of
# negative VADER scores (aggr_neg_score), lag_num=4, a 60% random sample and
# the last 100 rows as the test set.
# NOTE(review): the summary printed below lists "score" as the dependent
# variable, i.e. the sentiment score is regressed on its own lags - confirm
# that the price was not meant to be the target.
sent_price_eval1(prices_df, ["BTCUSDT", "ETHUSDT", "ADAUSDT", "BNBUSDT", "USDTUSDT", "DOTUSDT", "XRPUSDT", "LTCUSDT", "LINKUSDT", "BCHUSDT", "XLMUSDT"], "XLMUSDT", "c_last", aggr_neg_score, tweets_df, 4, 0.6, 100, "score")

p-value: 0.678634
0.35 0.04852813177069784
p-value: 0.020131


0,1,2,3
Dep. Variable:,score,R-squared (uncentered):,0.945
Model:,OLS,Adj. R-squared (uncentered):,0.945
Method:,Least Squares,F-statistic:,3617.0
Date:,"Sun, 13 Jun 2021",Prob (F-statistic):,0.0
Time:,22:03:30,Log-Likelihood:,201.46
No. Observations:,631,AIC:,-396.9
Df Residuals:,628,BIC:,-383.6
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
score_lag1,1.2463,0.058,21.379,0.000,1.132,1.361
score_lag2,-0.5231,0.050,-10.391,0.000,-0.622,-0.424
score_lag3,0.2043,0.049,4.173,0.000,0.108,0.300

0,1,2,3
Omnibus:,244.65,Durbin-Watson:,1.964
Prob(Omnibus):,0.0,Jarque-Bera (JB):,33905.907
Skew:,0.593,Prob(JB):,0.0
Kurtosis:,38.892,Cond. No.,14.8


In [231]:
# Call sent_price_eval2: full pipeline for XLMUSDT using the
# engagement-weighted score (score2), lag_num=4, a 60% random sample and the
# last 100 rows as the test set.
# NOTE(review): the summary printed below lists "score2" as the dependent
# variable, i.e. the weighted score is regressed on its own lags - confirm
# that the price was not meant to be the target.
sent_price_eval2(prices_df, ["BTCUSDT", "ETHUSDT", "ADAUSDT", "BNBUSDT", "USDTUSDT", "DOTUSDT", "XRPUSDT", "LTCUSDT", "LINKUSDT", "BCHUSDT", "XLMUSDT"], "XLMUSDT", "c_last", tweets_df, tweets_metadata_df, 4, 0.6, 100, "score2")

p-value: 0.678634
0.35 0.04852813177069784
p-value: 0.000000


0,1,2,3
Dep. Variable:,score2,R-squared (uncentered):,0.606
Model:,OLS,Adj. R-squared (uncentered):,0.604
Method:,Least Squares,F-statistic:,319.8
Date:,"Sun, 13 Jun 2021",Prob (F-statistic):,1.2500000000000002e-125
Time:,22:08:08,Log-Likelihood:,709.23
No. Observations:,626,AIC:,-1412.0
Df Residuals:,623,BIC:,-1399.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
score_lag1,0.1419,0.037,3.791,0.000,0.068,0.215
score_lag2,0.0764,0.043,1.764,0.078,-0.009,0.161
score_lag3,0.1649,0.010,16.910,0.000,0.146,0.184

0,1,2,3
Omnibus:,632.494,Durbin-Watson:,2.007
Prob(Omnibus):,0.0,Jarque-Bera (JB):,285290.359
Skew:,3.697,Prob(JB):,0.0
Kurtosis:,107.321,Cond. No.,7.34
