In [50]:
import pandas as pd
sent_tweets = pd.read_csv("data/sent-tweets.csv").drop(columns=['user_location', 'user_description', 
                                                           'user_followers', 'user_friends',])
sent_tweets.head(5)

Unnamed: 0,date,tweets,score
0,2021-02-05 10:52:04,AT_USER AT_USER AT_USER right here w/ AT_USER ...,0.0
1,2021-02-05 10:52:04,AT_USER AT_USER please donate bitcoin19 donate...,0.6597
2,2021-02-05 10:52:06,$sos market cap is 308 million. if they’re min...,0.0
3,2021-02-05 10:52:07,"bitcoin btc current price (gbp): £34,880 like ...",0.3612
4,2021-02-05 10:52:26,AT_USER right here w/ AT_USER URL referral cod...,0.0


linear svc
mb naive bayes
random forest

In [61]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer, TweetTokenizer

tweets = list(zip(sent_tweets["tweets"], sent_tweets["score"]))

regextk = RegexpTokenizer('\s+', gaps=True)
tweettk = TweetTokenizer()

tokens = [(regextk.tokenize(tweet), sentiment) for (tweet, sentiment) in tweets if type(tweet) == str]

filtered = []
for tweet in tokens:
    new = []
    for tok in tweet[0]:
        if tok != "AT_USER" and tok != "URL":
            new.append(tok)
            
    filtered.append((new, tweet[1]))

tagged = [(nltk.pos_tag(tweet), sentiment) for tweet, sentiment in filtered]

tagged[3]

([('bitcoin', 'NN'),
  ('btc', 'NN'),
  ('current', 'JJ'),
  ('price', 'NN'),
  ('(gbp):', 'NN'),
  ('£34,880', 'IN'),
  ('like', 'IN'),
  ('my', 'PRP$'),
  ('updates?', 'NN'),
  ('you', 'PRP'),
  ('can', 'MD'),
  ('tip', 'VB'),
  ('me', 'PRP'),
  ('at', 'IN'),
  ('3l9dztlqrcxnpn89v6gfnrbaz95uq5vmrz', 'CD')],
 0.3612)

In [62]:
import string
from nltk.corpus import wordnet as wn

def wn_pos(tag):
    "converts treebank tags into wordbank tags for lemmatization"
    if tag.startswith('J'):
        return wn.ADJ
    if tag.startswith('V'):
        return wn.VERB
    if tag.startswith('N'):
        return wn.NOUN
    if tag.startswith('R'):
        return wn.ADV
    return None

lem_tweets = []
lem = WordNetLemmatizer()

for tweet in tagged:
    lemmas = []
    
    for word, tag in tweet[0]:
        wn_tag = wn_pos(tag)
        
        if word[-1] in string.punctuation:
                word = word[:-1]

        if wn_pos(tag) is not None:
            lemmas.append(lem.lemmatize(word, wn_tag))
        else:
            lemmas.append(lem.lemmatize(word))
                
    lem_tweets.append((lemmas, tweet[1]))

lemmas = [lem for tweet in lem_tweets for lem in tweet]

lem_tweets[3]

(['bitcoin',
  'btc',
  'current',
  'price',
  '(gbp)',
  '£34,880',
  'like',
  'my',
  'update',
  'you',
  'can',
  'tip',
  'me',
  'at',
  '3l9dztlqrcxnpn89v6gfnrbaz95uq5vmrz'],
 0.3612)

In [64]:
pd.DataFrame(lem_tweets).to_csv("lem_tweets.csv")

# Above preprocessing to be run only once

In [66]:
pd.read_csv("lem_tweets.csv")

Unnamed: 0.1,Unnamed: 0,0,1
0,0,"['right', 'here', 'w', 'referral', 'code', '35...",0.0000
1,1,"['please', 'donate', 'bitcoin19', 'donate', 'c...",0.6597
2,2,"['$sos', 'market', 'cap', 'be', '308', 'millio...",0.0000
3,3,"['bitcoin', 'btc', 'current', 'price', '(gbp)'...",0.3612
4,4,"['right', 'here', 'w', 'referral', 'code', '71...",0.0000
...,...,...,...
9995,9995,"['elon', 'musk', 'again', '', 'bitcoin', 'hit'...",0.0000
9996,9996,"['crazy', 'when', 'binance', 'freeze', 'bybit'...",-0.4003
9997,9997,"['bitcoin', 'hit', '$44,000', 'bitcoin', 'btc'...",0.0000
9998,9998,"['how', 'many', 'short', 'be', 'liquidated', '...",0.0000


In [None]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

count = CountVectorizer()
train_counts = count.fit_transform(train["Tweet"])

tfidf_transformer = TfidfTransformer()
train_tfidf = tfidf_transformer.fit_transform(train_counts)

In [None]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB

MNB = MultinomialNB().fit(train_tfidf, train["Sentiment"])

val_counts = count.transform(val["Tweet"])
val_tfidf = tfidf_transformer.transform(val_counts)

mnb_pred = MNB.predict(val_tfidf)

MNB_df = pd.DataFrame(zip(val["Tweet"], mnb_pred, val["Sentiment"]), columns=["Tweet", "Predicted", "Actual"])

display(MNB_df)

print("Sentiment prediction accuracy:", np.mean(mnb_pred == val["Sentiment"]))