In [1]:
# imports

import re
import spacy
import en_core_web_sm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from IPython.core.interactiveshell import InteractiveShell
from tqdm.notebook import tqdm
from scipy.ndimage import gaussian_filter1d

In [2]:
# import configurations

sp = spacy.load('en_core_web_sm')
nlp = en_core_web_sm.load()
InteractiveShell.ast_node_interactivity = "all"
tqdm.pandas()

In [3]:
# read file

df = pd.read_pickle("../data/cleaned/tweets_verified_2020-2021_cleaned.pkl")
display(df.head())

Unnamed: 0,created_at,id,conversation_id,user_id,username,name,tweet,language,mentions,urls,...,retweets_count,likes_count,hashtags,cashtags,link,quote_url,video,thumbnail,reply_to,tweet_clean
0,2020-01-01 00:08:28-05:00,1212239143687741440,1212239143687741440,20646945,dumbfoundead,dumbfoundead.eth,Bitcoin the worst decision i made this decade,en,[],[],...,15,409,[],[],https://twitter.com/dumbfoundead/status/121223...,,False,,[],worst decision made decade
1,2020-01-01 00:54:35-05:00,1212250748815208448,1212250748815208448,636023721,neuroecology,Adam J Calhoun,If you ask someone what they'd do if they went...,en,[],[],...,0,4,[],[],https://twitter.com/neuroecology/status/121225...,,False,,[],ask someone went back ten years use informatio...
2,2020-01-01 02:00:25-05:00,1212267316789952512,1212267316789952512,631810714,cnbctv18news,CNBC-TV18,#bitcoin rally begun in 2013 and it reached a ...,en,[],['https://www.cnbctv18.com/market/currency/bit...,...,0,4,['bitcoin'],[],https://twitter.com/CNBCTV18News/status/121226...,,False,,[],rally begun reached peak
3,2020-01-01 02:18:48-05:00,1212271940502638593,1212271940502638593,14654494,excellion,Samson Mow,Wishing all #Bitcoin Ultra Enthusiasts a very ...,en,"[{'screen_name': 'blockstream', 'name': 'block...",[],...,17,158,"['bitcoin', 'liquidnetwork']",[],https://twitter.com/Excellion/status/121227194...,,True,https://pbs.twimg.com/media/ENLa5toU0AAvhC1.jpg,[],wishing ultra enthusiasts happy new year much ...
4,2020-01-01 02:39:29-05:00,1212277146401402880,1212277146401402880,1066972567943053312,hindustantimes,Hindustan Times,"Breaking down Bitcoin’s 9,000,000% rise in las...",en,[],['http://www.hindustantimes.com/tech/breaking-...,...,0,0,[],[],https://twitter.com/HindustanTimes/status/1212...,,False,,[],breaking rise last decade left skeptics aghast


In [4]:
sent_obj = SentimentIntensityAnalyzer()
sent_data = df['tweet']

pest_words = {
    'up': 2.0,
    'down': -2.0,
    'green' : 2.0,
    'red' : -2.0,
    'bull' : 2.0,
    'bear' : -2.0,
    'buy' : 2.0,
    'bought' : 2.0,
    'sell' : -2.0,
    'sold' : -2.0,
    'moon': 2
}

new_si = SentimentIntensityAnalyzer()
new_si.lexicon.update(pest_words)

sentiment_update = []
for i in tqdm(sent_data):
    sentiment_update.append(sent_obj.polarity_scores(i))

  0%|          | 0/312204 [00:00<?, ?it/s]

In [5]:
sentiment_update[:4]
display(len(sentiment_update))

[{'neg': 0.369, 'neu': 0.631, 'pos': 0.0, 'compound': -0.6249},
 {'neg': 0.0, 'neu': 0.936, 'pos': 0.064, 'compound': 0.4329},
 {'neg': 0.0, 'neu': 0.909, 'pos': 0.091, 'compound': 0.1027},
 {'neg': 0.0, 'neu': 0.705, 'pos': 0.295, 'compound': 0.9482}]

312204

In [6]:
all_compound = [one_tok['compound'] for one_tok in sentiment_update]
df['sentiment'] = pd.DataFrame(all_compound)
display(df[['tweet', 'sentiment']])

Unnamed: 0,tweet,sentiment
0,Bitcoin the worst decision i made this decade,-0.6249
1,If you ask someone what they'd do if they went...,0.4329
2,#bitcoin rally begun in 2013 and it reached a ...,0.1027
3,Wishing all #Bitcoin Ultra Enthusiasts a very ...,0.9482
4,"Breaking down Bitcoin’s 9,000,000% rise in las...",-0.5106
...,...,...
312199,@Bitcoin__art Fair enough 😅😅😅,0.8316
312200,Happy New Year #Bitcoin https://t.co/HdUQNDKUUA,0.5719
312201,"@rubiconcapital_ Buy bitcoin &amp; chill, defu...",0.0000
312202,Bitcoin faces uncertain 2022 after record year...,-0.2960


In [7]:
# export as pickle
df[['id', 'sentiment']].to_pickle("../data/cleaned/by_id_verified_sentiment_2020-2021.pkl")

In [8]:
# try reading pickle
df_test = pd.read_pickle("../data/cleaned/by_id_verified_sentiment_2020-2021.pkl")

display(df_test.dtypes)
display(df_test)

id             int64
sentiment    float64
dtype: object

Unnamed: 0,id,sentiment
0,1212239143687741440,-0.6249
1,1212250748815208448,0.4329
2,1212267316789952512,0.1027
3,1212271940502638593,0.9482
4,1212277146401402880,-0.5106
...,...,...
312199,1477132523826147330,0.8316
312200,1477134677148323841,0.5719
312201,1477137028894670855,0.0000
312202,1477138318982725633,-0.2960
