<h1>Sentiment extraction from Topic data</h1>
<h3>This notebook is used to conduct sentiment score preparation</h3>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
import yfinance as yh
import matplotlib.pyplot as plt
import copy

In [None]:
data1 = "./data/archive/the-reddit-ethereum-dataset-posts.csv"
df1 = pd.read_csv(data1)
df1 = df1[['created_utc', 'title']]
df1_h = copy.deepcopy(df1)

df1

Unnamed: 0,created_utc,title
0,1635811168,Ethereum Extasy SOMEONE PLEASE SOLVE THIS FOR ...
1,1635810613,Anyone have any insight on the Davis Schwartz ...
2,1635810303,How And Where To Buy Ethereum Meta (ETHM) - St...
3,1635810172,Ethereum Name Service Is Launching a Governanc...
4,1635810155,"Red Laser Eyes for BTC, what about Ethereum?"
...,...,...
479255,1388799771,Welcome to Ethereum!
479256,1388343026,Ethereum: The Ultimate Smart Contract and Dece...
479257,1382296873,Welcome to Ethereum (or Etherium)
479258,1354285750,[Cloth] Elfling in a full priest t3 set (origi...


<h2>Convert timestamps from utc to understandable format</h2>

In [27]:
#daily observations

df1['created_utc'] = pd.to_datetime(df1['created_utc'],unit='s')
df1['created_utc'] = pd.to_datetime(df1['created_utc']).dt.date
df1 = df1.sort_values(by='created_utc', ascending=True)

print(df1.head())
print(df1.dtypes)

       created_utc                                              title
479259  2011-07-03  [Ethereum Prison Key] what was that again? oh ...
479258  2012-11-30  [Cloth] Elfling in a full priest t3 set (origi...
479257  2013-10-20                  Welcome to Ethereum (or Etherium)
479256  2013-12-29  Ethereum: The Ultimate Smart Contract and Dece...
479255  2014-01-04                               Welcome to Ethereum!
created_utc    object
title          object
dtype: object


In [28]:
#hourly

df1_h['created_utc'] = pd.to_datetime(df1_h['created_utc'],unit='s')
df1_h['created_utc'] = pd.to_datetime(df1_h['created_utc']).dt.strftime(r'%Y-%m-%d-%H')
df1_h = df1_h.sort_values(by='created_utc', ascending=True)

print(df1_h.dtypes)

created_utc    object
title          object
dtype: object


In [29]:
print(df1_h.tail(8))

      created_utc                                              title
19  2021-11-01-23  Got a question about blockchains for a collect...
21  2021-11-01-23  Over the weekend I became the first content cr...
22  2021-11-01-23  Institutional Investors Are Pouring Capital In...
23  2021-11-01-23  Burger King, Robinhood serving up 'side of cry...
24  2021-11-01-23  What is the difference between Layer 1 and Lay...
25  2021-11-01-23  Day of the Dead NFTs -- utilizes Ethereum &amp...
13  2021-11-01-23                   ETHEREUM GIVEAWAYS. UPTO $50,000
0   2021-11-01-23  Ethereum Extasy SOMEONE PLEASE SOLVE THIS FOR ...


In [30]:
df1['created_utc'] = pd.to_datetime(df1['created_utc'])
df1_h['created_utc'] = pd.to_datetime(df1_h['created_utc'])

In [31]:
df1['created_utc'].dtype

dtype('<M8[ns]')

<h2>Masking time series</h2>

In [32]:
start_date = '2021-01-01'
end_date = '2022-01-01'

mask = (df1['created_utc'] >= start_date) & (df1['created_utc'] < end_date)
mask_2 = (df1_h['created_utc'] >= start_date) & (df1_h['created_utc'] < end_date)

df1 = df1.loc[mask]
df1_h = df1_h.loc[mask_2]

In [33]:
df1.head(5)

Unnamed: 0,created_utc,title
159046,2021-01-01,Superb project and I have been working with th...
159045,2021-01-01,Doubt Ethereum 2.0 and ERC20 Protocol
159044,2021-01-01,ANALISE BITCOIN &amp; ETHEREUM - FELIZ ANO NOV...
159043,2021-01-01,Zumo free £10 with of Ethereum
159042,2021-01-01,Doubt Ethereum 2.0 and ERC20 Protocol


In [34]:
df1_h.head(5)

Unnamed: 0,created_utc,title
159136,2021-01-01 00:00:00,[ CryptoCurrency ] Stratis InterFlux Release -...
159135,2021-01-01 00:00:00,"Happy New Year Of Bitcoin, Ethereum, Alts, DeF..."
159134,2021-01-01 00:00:00,Could having to pay gas fees prevent mass adop...
159123,2021-01-01 01:00:00,Ethereum Adopts Enjin’s ERC-1155 Token Standar...
159124,2021-01-01 01:00:00,Help please with Ethereum on ARM setup


<h2>Second dataset</h2>

In [35]:
data2 = "./archive/the-reddit-ethereum-dataset-comments.csv"
df2 = pd.read_csv(data2)
df2 = df2[['created_utc', 'body']]
df2 = df2.rename(columns={"body" : "title"})
df2_h = copy.deepcopy(df2)

df2

Unnamed: 0,created_utc,title
0,1635811192,Surely you must be aware of network effect.\n\...
1,1635811154,"For real man, the gas fees are more than the t..."
2,1635811138,What happens with bitcoin has implications for...
3,1635811120,Not exactly. Ethereum addresses that made dep...
4,1635811036,Is the babydoge on the ethereum network the sa...
...,...,...
1132326,1314463399,Excellent list. So many awesome staves I wish ...
1132327,1314435612,[Frostscythe of Lord Ahune](http://www.wowhead...
1132328,1309736431,I'm guessing since the doing away with of keyr...
1132329,1308069759,I'd love to see more group quests and top-leve...


daily

In [36]:
df2['created_utc'] = pd.to_datetime(df2['created_utc'], unit='s')
df2['created_utc'] = pd.to_datetime(df2['created_utc']).dt.date
df2 = df2.sort_values(by='created_utc', ascending=True)

print(df2.head())
print(df2.dtypes)

        created_utc                                              title
1132330  2011-05-25  They are removing most of the keys.\n\n\n&gt;A...
1132329  2011-06-14  I'd love to see more group quests and top-leve...
1132328  2011-07-03  I'm guessing since the doing away with of keyr...
1132327  2011-08-27  [Frostscythe of Lord Ahune](http://www.wowhead...
1132326  2011-08-27  Excellent list. So many awesome staves I wish ...
created_utc    object
title          object
dtype: object


In [37]:
df2['created_utc'] = pd.to_datetime(df2['created_utc'])

In [38]:
start_date = '2021-01-01'
end_date = '2022-01-01'


mask = (df2['created_utc'] >= start_date) & (df2['created_utc'] < end_date)

df2.loc[mask]

Unnamed: 0,created_utc,title
430361,2021-01-01,Very interesting story! I also want to pay a...
430359,2021-01-01,"It's all about fundamentals, and your time pre..."
430362,2021-01-01,Boas :)\nSão investimentos mais voláteis mas a...
430363,2021-01-01,"Ethereum can’t get their shit together, proof ..."
430364,2021-01-01,"Happy New Year, everybody!\n\nLots to report h..."
...,...,...
1064,2021-11-01,That's because it was a coinbase internal tran...
1065,2021-11-01,"IOTA zero rates.\nEthereum rates are up 2,300%..."
1066,2021-11-01,Yes it is. Those Ethereum are never going to g...
1068,2021-11-01,"RGB seems like it's the most ""maxi"", but it is..."


hourly

In [39]:
df2_h = copy.deepcopy(df2)

df2_h['created_utc'] = pd.to_datetime(df2_h['created_utc'], unit='s')
df2_h['created_utc'] = pd.to_datetime(df2_h['created_utc']).dt.strftime(r'%Y-%m-%d-%H')
df2_h = df2_h.sort_values(by='created_utc', ascending=True)

print(df2_h.head())
print(df2_h.dtypes)

           created_utc                                              title
1132330  2011-05-25-00  They are removing most of the keys.\n\n\n&gt;A...
1132329  2011-06-14-00  I'd love to see more group quests and top-leve...
1132328  2011-07-03-00  I'm guessing since the doing away with of keyr...
1132327  2011-08-27-00  [Frostscythe of Lord Ahune](http://www.wowhead...
1132326  2011-08-27-00  Excellent list. So many awesome staves I wish ...
created_utc    object
title          object
dtype: object


In [40]:
mask_2 = (df2_h['created_utc'] >= start_date) & (df2_h['created_utc'] < end_date)

df2_h.loc[mask_2]

Unnamed: 0,created_utc,title
430635,2021-01-01-00,cryptocurrency is absolutely valuable as a con...
430636,2021-01-01-00,Rollups will make dapp transactions free on Et...
430433,2021-01-01-00,Are there cheaper ways to send money? Definite...
430633,2021-01-01-00,"Oh boy, here you are again spouting baseless, ..."
430632,2021-01-01-00,But in the meantime you're just sitting there ...
...,...,...
4,2021-11-01-00,Is the babydoge on the ethereum network the sa...
3,2021-11-01-00,Not exactly. Ethereum addresses that made dep...
2,2021-11-01-00,What happens with bitcoin has implications for...
22,2021-11-01-00,Bitcoin and ethereum


In [41]:
#final daily observations

df3 = pd.concat([df1, df2], sort=False)
df3

Unnamed: 0,created_utc,title
159046,2021-01-01,Superb project and I have been working with th...
159045,2021-01-01,Doubt Ethereum 2.0 and ERC20 Protocol
159044,2021-01-01,ANALISE BITCOIN &amp; ETHEREUM - FELIZ ANO NOV...
159043,2021-01-01,Zumo free £10 with of Ethereum
159042,2021-01-01,Doubt Ethereum 2.0 and ERC20 Protocol
...,...,...
1064,2021-11-01,That's because it was a coinbase internal tran...
1065,2021-11-01,"IOTA zero rates.\nEthereum rates are up 2,300%..."
1066,2021-11-01,Yes it is. Those Ethereum are never going to g...
1068,2021-11-01,"RGB seems like it's the most ""maxi"", but it is..."


In [42]:
#final hourly observations

df3_h = pd.concat([df1_h, df2_h], sort=False)
df3_h

Unnamed: 0,created_utc,title
159136,2021-01-01 00:00:00,[ CryptoCurrency ] Stratis InterFlux Release -...
159135,2021-01-01 00:00:00,"Happy New Year Of Bitcoin, Ethereum, Alts, DeF..."
159134,2021-01-01 00:00:00,Could having to pay gas fees prevent mass adop...
159123,2021-01-01 01:00:00,Ethereum Adopts Enjin’s ERC-1155 Token Standar...
159124,2021-01-01 01:00:00,Help please with Ethereum on ARM setup
...,...,...
4,2021-11-01-00,Is the babydoge on the ethereum network the sa...
3,2021-11-01-00,Not exactly. Ethereum addresses that made dep...
2,2021-11-01-00,What happens with bitcoin has implications for...
22,2021-11-01-00,Bitcoin and ethereum


In [None]:
print(df3.dtypes)
df3.describe()

Unnamed: 0,created_utc
count,1291468
mean,2019-09-09 10:49:44.257914368
min,2011-05-25 00:00:00
25%,2017-12-28 00:00:00
50%,2020-06-29 00:00:00
75%,2021-05-18 00:00:00
max,2021-11-01 00:00:00


In [45]:
df3 = df3.rename(columns={"created_utc" : "publication date", "title" : "post"})
df3_h = df3_h.rename(columns={"created_utc" : "publication date", "title" : "post"})

df3.head(5)
df3_h.head(5)

Unnamed: 0,publication date,post
159136,2021-01-01 00:00:00,[ CryptoCurrency ] Stratis InterFlux Release -...
159135,2021-01-01 00:00:00,"Happy New Year Of Bitcoin, Ethereum, Alts, DeF..."
159134,2021-01-01 00:00:00,Could having to pay gas fees prevent mass adop...
159123,2021-01-01 01:00:00,Ethereum Adopts Enjin’s ERC-1155 Token Standar...
159124,2021-01-01 01:00:00,Help please with Ethereum on ARM setup


<h2>Calculating sentiment scores</h2>

In [None]:
import numpy as np 
%matplotlib inline 

import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

analyser = SentimentIntensityAnalyzer()

df3.head()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\X\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
def print_sentiment_scores(sentence):
    snt = analyser.polarity_scores(sentence)
    print("{:-<40} {}".format(sentence, str(snt)))

arr = []
arr_h = []

i = 0
"""  Leave it be 
while (i<len(df3)):

    k = analyser.polarity_scores(df3.iloc[i]['post'])
    arr.append(k['compound'])
    
    i = i+1
"""
i = 0

while (i<len(df3_h)):

    k = analyser.polarity_scores(df3_h.iloc[i]['post'])
    arr_h.append(k['compound'])
    
    i = i+1

In [73]:
print('Size of daily dataset:', df3.shape, len(arr))
print('Size of hourly dataset:', df3_h.shape, len(arr_h))

Size of daily dataset: (305, 3) 0
Size of hourly dataset: (1291468, 2) 1291468


In [75]:
df3['VADER score'] = arr
df3_h['VADER score'] = arr_h

df3.head(5)

Unnamed: 0,publication date,mean_VADER,trade_sign
2573,2021-01-01,0.041249,1
2574,2021-01-02,0.107192,1
2575,2021-01-03,-0.386431,-1
2576,2021-01-04,-0.602764,-1
2577,2021-01-05,-0.070846,-1


In [None]:
df3 = df3[df3['VADER score'] != 0]
df3 = df3.groupby('publication date') \
       .agg(mean_VADER=('VADER score', 'mean')) \
       .reset_index()

df3_h = df3_h[df3_h['VADER score'] != 0]
df3_h = df3_h.groupby('publication date') \
       .agg(mean_VADER=('VADER score', 'mean')) \
       .reset_index()
df3_h

Unnamed: 0,publication date,mean_VADER
0,2021-01-01 00:00:00,0.267100
1,2021-01-01 01:00:00,0.419967
2,2021-01-01 02:00:00,0.680800
3,2021-01-01 04:00:00,0.488271
4,2021-01-01 05:00:00,0.340000
...,...,...
10141,2021-10-28-00,0.349099
10142,2021-10-29-00,0.388504
10143,2021-10-30-00,0.336406
10144,2021-10-31-00,0.357086


In [None]:
df3['mean_VADER'] = (df3['mean_VADER'] - df3['mean_VADER'].mean()) / df3['mean_VADER'].std() 
df3_h['mean_VADER'] = (df3_h['mean_VADER'] - df3_h['mean_VADER'].mean()) / df3_h['mean_VADER'].std() 

In [None]:
df3.rename({'sentiment': 'mean_VADER'})
df3_h.rename({'sentiment': 'mean_VADER'})

Unnamed: 0,publication date,mean_VADER
0,2021-01-01 00:00:00,-0.122054
1,2021-01-01 01:00:00,0.619884
2,2021-01-01 02:00:00,1.885840
3,2021-01-01 04:00:00,0.951402
4,2021-01-01 05:00:00,0.231766
...,...,...
10141,2021-10-28-00,0.275930
10142,2021-10-29-00,0.467179
10143,2021-10-30-00,0.214325
10144,2021-10-31-00,0.314691


<h3>Deep copy to fix the existing issue (applied once)</h3>

In [87]:
df3_h_c = copy.deepcopy(df3_h)

In [93]:
df3_h_c['publication date'] = pd.to_datetime(df3_h_c['publication date'],unit='s')
df3_h_c['publication date'] = pd.to_datetime(df3_h_c['publication date']).dt.strftime(r'%Y-%m-%d-%H')
df3_h_c = df3_h_c.sort_values(by='publication date', ascending=True)

print(df3_h_c.dtypes)

publication date     object
mean_VADER          float64
trade_sign            int64
dtype: object


Sentiment conversion - we turn negative sentiment to -1 and positive sentiment to 1.

In [None]:
df3['trade_sign'] = (df3['mean_VADER'] > 0) * 1 + (df3['mean_VADER'] < 0) * -1
df3_h['trade_sign'] = (df3_h['mean_VADER'] > 0) * 1 + (df3_h['mean_VADER'] < 0) * -1

In [81]:
df3['trade_sign'].unique()

array([ 1, -1])

In [None]:
start_date = '2021-01-01'
end_date = '2022-01-01'

mask = (df3['publication date'] >= start_date) & (df3['publication date'] < end_date)
#mask_2 = (df3_h_c['publication date'] >= start_date) & (df3_h_c['publication date'] < end_date)

df3 = df3.loc[mask]
#df3_h_c = df3_h_c.loc[mask_2]

In [110]:
df3_h.shape

(10146, 3)

In [None]:
df3.to_csv('./data/sentiment_processed/daily_sentiment.csv', index=False)
df3_h.to_csv('./data/sentiment_processed/hourly_sentiment.csv', index=False)