### Bag of words sentiment analysis on the financial tweets

In [1]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Fix the global NumPy seed so the randomly picked example tweet is reproducible.
np.random.seed(0)

FNAME = "./data/btc_year_full.csv"

# Load the companion "<name>_cleaned.csv" file. The dot is escaped and the
# pattern is anchored at the end: an unescaped '.' matches ANY character, so
# '.csv' would also rewrite e.g. "_csv" or "/csv" elsewhere in the path.
dataset = pd.read_csv(re.sub(r'\.csv$', '_cleaned.csv', FNAME))
dataset.head()

Unnamed: 0,date,text,likes,retweets,cleaned_text
0,2018-12-01 15:13:48,Mainstream media is screaming that \xe2\x80\x9...,373,106,mainstream media scream bitcoin dead twitter t...
1,2018-12-01 22:59:36,My friends had no problem buying #bitcoin at $...,684,101,friend problem buy bitcoin $7k way $20k panic ...
2,2018-12-01 15:19:30,If you pay your Ohio taxes using #bitcoin then...,518,52,pay ohio tax use bitcoin owe even tax btc incr...
3,2018-12-01 18:57:06,Bearableguy123 screen shot on #Reddit \n\nBina...,100,39,bearableguy123 screen shot reddit binari code ...
4,2018-12-01 13:57:04,$BTC $BTCUSD #Bitcoin \n\nBulls taking profit ...,66,8,$btc $btcusd bitcoin bull take profit everi sm...


**Finding Polarity**

To find the polarity we use the **SentimentIntensityAnalyzer** from **nltk.sentiment.vader**

In [2]:
#nltk.download('vader_lexicon')

from nltk.sentiment.vader import SentimentIntensityAnalyzer

# A single analyzer instance serves the whole preview.
sia = SentimentIntensityAnalyzer()

cleaned_tweets = dataset['cleaned_text']

# Show every VADER score (keys in alphabetical order) for the first three
# cleaned tweets, one score per line, with a blank line between tweets.
for tweet in cleaned_tweets[:3]:
    print('"' + tweet + '"')
    s = sia.polarity_scores(tweet)
    for k, score in sorted(s.items()):
        print('\t{0}: {1}, '.format(k, score))
    print()

"mainstream media scream bitcoin dead twitter troll take victori lap perceiv demis ponzi scheme fortun noth could truth see link adopt chart $btc fundament strengthen"
	compound: -0.4019, 
	neg: 0.21, 
	neu: 0.601, 
	pos: 0.189, 

"friend problem buy bitcoin $7k way $20k panic sold loss dare invest $4k someth tell cycl repeat $40k $80k per $btc"
	compound: -0.6249, 
	neg: 0.291, 
	neu: 0.596, 
	pos: 0.112, 

"pay ohio tax use bitcoin owe even tax btc increas valu use btc pay tax pay bad money fiat first keep sound money"
	compound: -0.6908, 
	neg: 0.288, 
	neu: 0.712, 
	pos: 0.0, 



Now, based on the 'compound' polarity score and our knowledge of the data, we can assign each tweet to one of the categories Positive (1), Negative (-1) or Neutral (0).

In [9]:
def findpolarity(data, neutrality=False, threshold=0.2, analyzer=None):
    """Map a text to a sentiment label using VADER's 'compound' score.

    Parameters
    ----------
    data : str
        Text to score.
    neutrality : bool, default False
        If False, return a binary label: 1 when compound > 0, otherwise -1.
        Note that a compound score of exactly 0 is therefore labelled -1.
        If True, return 1 when compound >= threshold, -1 when
        compound <= -threshold, and 0 (neutral) in between.
    threshold : float, default 0.2
        Half-width of the neutral band; only used when ``neutrality`` is True.
    analyzer : object, optional
        Any object exposing ``polarity_scores(text)`` that returns a mapping
        with a 'compound' entry. When omitted, one shared
        ``SentimentIntensityAnalyzer`` is created on first use and reused.

    Returns
    -------
    int
        1 (positive), -1 (negative) or, with ``neutrality=True``, 0 (neutral).
    """
    if analyzer is None:
        # Constructing the analyzer on every call is wasteful when labelling
        # tens of thousands of tweets, so build it once and cache it on the
        # function object.
        analyzer = getattr(findpolarity, "_analyzer", None)
        if analyzer is None:
            analyzer = SentimentIntensityAnalyzer()
            findpolarity._analyzer = analyzer

    compound = analyzer.polarity_scores(data)['compound']

    if not neutrality:
        return 1 if compound > 0 else -1

    if compound >= threshold:
        return 1
    if compound <= -threshold:
        return -1
    return 0

*Example*

In [10]:
raw_tweets = dataset['text']
idx = np.random.randint(0, len(raw_tweets))

# Print the same randomly chosen tweet, with its polarity, before and after
# cleaning; the two reports are separated by a blank line.
for report_no, (report_title, report_source) in enumerate(
    (("Raw: ", raw_tweets), ("Cleaned: ", cleaned_tweets))
):
    if report_no > 0:
        print()
    print(report_title)
    print("Tweet: ", report_source[idx])
    print("Polarity: ", findpolarity(report_source[idx]))

Raw: 
Tweet:  The 16th MAFIACASH Distribution has been completed! https://mafiawars.io/news/mafiacash-distribution-16 \xe2\x80\xa6 Check out the Mafia Wars game at https://mafiawars.io  to start earning MAFIACASH today! #MafiaWars #MafiaCash #Bitcoin #BTC #Counterparty #XCP #BlockchainGamingpic.twitter.com/eE1skh0qHS
Polarity:  -1

Cleaned: 
Tweet:  16th mafiacash distribut complet distribut 16 check mafia war game start earn mafiacash today mafiawar mafiacash bitcoin btc counterparti xcp blockchaingamingp twitter com ee1skh0qh
Polarity:  -1


Find polarity for cleaned tweets

In [6]:
from tqdm import tqdm

# Polarity label of every cleaned tweet, in dataframe order (with progress bar).
sentiment_cleaned = [
    findpolarity(cleaned_tweets[i]) for i in tqdm(range(len(cleaned_tweets)))
]

100%|██████████| 38266/38266 [06:50<00:00, 93.12it/s] 


Find polarity for raw tweets

In [7]:
# Polarity label of every raw (uncleaned) tweet, in dataframe order.
sentiment_raw = [
    findpolarity(raw_tweets[i]) for i in tqdm(range(len(raw_tweets)))
]

100%|██████████| 38266/38266 [07:41<00:00, 82.98it/s] 


In [8]:
assert len(cleaned_tweets) == len(dataset['text'])
_n = len(raw_tweets)
# The comparison below pairs the two label lists element by element, so these
# are the lengths that actually have to agree (the check above only compares
# two columns of the same dataframe).
assert len(sentiment_cleaned) == _n, "sentiment_cleaned does not cover every tweet"
assert len(sentiment_raw) == _n, "sentiment_raw does not cover every tweet"

# Number of tweets whose label changes between the cleaned and the raw text.
_s = sum(
    1
    for label_cleaned, label_raw in zip(sentiment_cleaned, sentiment_raw)
    if label_cleaned != label_raw
)
# Guard the ratio so an empty dataset reports 0.0 instead of dividing by zero.
_ratio = round(_s/_n, 3) if _n else 0.0
print("Total {} sentiments ain't equal from {}. The ratio is: {}".format(_s, _n, _ratio))

Total 7623 sentiments ain't equal from 38266. The ratio is: 0.199


We add the polarities computed from the cleaned and the raw tweets as two new columns of the existing dataframe and save the result to a .csv file.

In [9]:
# Attach both label lists as columns; assigning by column name makes this cell
# safe to re-run.
dataset['sentiment_cl'] = sentiment_cleaned
dataset['sentiment_raw'] = sentiment_raw

# Write "<name>_sentiments.csv" next to the input file. The dot is escaped and
# the pattern anchored so that only the literal ".csv" extension is replaced
# (an unescaped '.' would match any character before "csv").
dataset.to_csv(re.sub(r'\.csv$', '_sentiments.csv', FNAME), header=True, index=False, encoding='utf-8')
dataset.head()

Unnamed: 0,date,text,likes,retweets,cleaned_text,sentiment_cl,sentiment_raw
0,2018-12-01 15:13:48,Mainstream media is screaming that \xe2\x80\x9...,373,106,mainstream media scream bitcoin dead twitter t...,-1,-1
1,2018-12-01 22:59:36,My friends had no problem buying #bitcoin at $...,684,101,friend problem buy bitcoin $7k way $20k panic ...,-1,-1
2,2018-12-01 15:19:30,If you pay your Ohio taxes using #bitcoin then...,518,52,pay ohio tax use bitcoin owe even tax btc incr...,-1,-1
3,2018-12-01 18:57:06,Bearableguy123 screen shot on #Reddit \n\nBina...,100,39,bearableguy123 screen shot reddit binari code ...,0,0
4,2018-12-01 13:57:04,$BTC $BTCUSD #Bitcoin \n\nBulls taking profit ...,66,8,$btc $btcusd bitcoin bull take profit everi sm...,1,1
