In [1]:
import pandas as pd
import numpy as np
import re
import pickle

import nltk
from nltk.corpus import twitter_samples
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

In [2]:
# Load the pre-tokenized positive and negative tweets from NLTK's
# twitter_samples corpus (positive file first, then negative).
positive_tokens, negative_tokens = (
    twitter_samples.tokenized(corpus_file)
    for corpus_file in ('positive_tweets.json', 'negative_tweets.json')
)

In [3]:
def clean_tokens(tokens):
    """Normalise, lemmatise and filter the tokens of a single tweet.

    Every token is lower-cased, run through a fixed sequence of regex
    substitutions (mentions, hashtags, ``&`` entities, selected punctuation,
    digits, ``rt`` markers and URLs) and lemmatised with WordNet, using a
    part of speech derived from the tag that ``pos_tag`` assigned to the raw
    token. English stop words, empty strings and the bare ``':'`` token are
    removed from the result.

    Parameters
    ----------
    tokens : list of str
        The tokens of one tweet, in order.

    Returns
    -------
    list of str
        The cleaned lemmas, in their original relative order.
    """
    lemmatizer = WordNetLemmatizer()
    # A set makes the membership test in the final filter constant-time.
    stop_words = set(stopwords.words('english'))

    # Applied strictly in this order: each pattern sees the output of the
    # previous substitutions (e.g. URLs are matched after '.' was stripped).
    # NOTE(review): r'rt+' removes every "rt" run, including inside ordinary
    # words (the recorded notebook output contains 'suppoer'). It is left
    # unchanged here because the pickled classifier/vectorizer were presumably
    # fitted on text cleaned the same way -- confirm before tightening it.
    substitutions = (
        r'@[a-z0-9_]\S+',
        r'#[a-z0-9_]\S+',
        r'&[a-z0-9_]\S+',
        r'[?!.+,;$£%&"]+',
        r'rt[\s]+',
        r'\d+',
        r'\$',
        r'rt+',
        r'https?:?\/\/\S+',
    )

    # Tag prefix -> WordNet part of speech; any other tag is treated as a noun.
    tag_prefix_to_pos = (('NN', 'n'), ('VB', 'v'), ('RB', 'r'), ('JJ', 'a'))

    lemmas = []
    for token, tag in pos_tag(tokens):
        token = token.lower()
        for pattern in substitutions:
            token = re.sub(pattern, '', token)

        position = next(
            (pos for prefix, pos in tag_prefix_to_pos if tag.startswith(prefix)),
            'n',
        )
        lemmas.append(lemmatizer.lemmatize(token, pos = position))

    # Filter once after the loop instead of re-filtering the whole list on
    # every iteration; the surviving elements and their order are the same.
    return [
        lemma for lemma in lemmas
        if lemma not in stop_words and len(lemma) > 0 and lemma != ':'
    ]

In [4]:
def data_prepare(tokens, status):
    """Pair every tweet in ``tokens`` with the same ``status`` label.

    Parameters
    ----------
    tokens : iterable
        The tweets (each one is used as-is as the first tuple element).
    status : object
        The label attached to every tweet.

    Returns
    -------
    list of tuple
        One ``(tweet, status)`` pair per input tweet, in input order.
    """
    labelled = []
    for tweet in tokens:
        labelled.append((tweet, status))
    return labelled

In [5]:
def featureset_prepare():
    """Clean both tweet corpora and return them as parallel lists.

    Reads the module-level ``positive_tokens`` and ``negative_tokens``, cleans
    every tweet with ``clean_tokens`` and labels it ``'Positive'`` or
    ``'Negative'`` via ``data_prepare``. Positive tweets come first.

    Returns
    -------
    tuple of (list, list)
        ``(features, labels)`` where ``features[i]`` is the cleaned token list
        of a tweet and ``labels[i]`` is its label.
    """
    labelled_tweets = (
        data_prepare([clean_tokens(tweet) for tweet in positive_tokens], 'Positive')
        + data_prepare([clean_tokens(tweet) for tweet in negative_tokens], 'Negative')
    )

    # Split the (tokens, label) pairs back into two aligned lists.
    features = [pair[0] for pair in labelled_tweets]
    labels = [pair[1] for pair in labelled_tweets]
    return features, labels

In [6]:
# Lazily created, shared analyzer: constructing SentimentIntensityAnalyzer for
# every tweet repeats its set-up work once per row when used with .apply().
_VADER_ANALYZER = None


def vader_compound_score(tweet):
    """Label a tweet using VADER's compound polarity score.

    Parameters
    ----------
    tweet : str
        The text to score.

    Returns
    -------
    str
        ``'Positive'`` when the compound score is ``>= 0``, otherwise
        ``'Negative'``.
    """
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()

    # NOTE(review): a score of exactly 0 counts as Positive here, whereas
    # textblob_sentiment treats a polarity of 0 as Negative -- confirm this
    # asymmetry is intended when comparing the two approaches.
    if _VADER_ANALYZER.polarity_scores(tweet)['compound'] >= 0:
        return 'Positive'
    return 'Negative'

In [7]:
def textblob_sentiment(tweet):
    """Label a tweet using TextBlob's sentiment polarity.

    Parameters
    ----------
    tweet : str
        The text to score.

    Returns
    -------
    str
        ``'Positive'`` when the polarity is strictly greater than 0,
        otherwise ``'Negative'``.
    """
    polarity = TextBlob(tweet).sentiment.polarity
    return 'Positive' if polarity > 0 else 'Negative'

In [8]:
def rf_sentiment(tweet):
    """Predict labels with the pickled classifier and vectorizer.

    Loads ``rf_classifier`` and ``rf_vectorizer`` from the current working
    directory on every call, vectorizes the input and returns the
    classifier's predictions.

    Parameters
    ----------
    tweet : iterable
        The documents to classify. It is materialised with ``list(tweet)``,
        so a single ``str`` would be split into characters; pass a collection
        of documents instead.

    Returns
    -------
    object
        Whatever ``classifier.predict`` returns for the vectorized input.
    """
    # NOTE: unpickling can execute arbitrary code -- only load model files
    # that come from a trusted source.
    # Context managers make sure both file handles are closed after loading.
    with open('rf_classifier', 'rb') as classifier_file:
        classifier = pickle.load(classifier_file)
    with open('rf_vectorizer', 'rb') as vectorizer_file:
        vectorizer = pickle.load(vectorizer_file)

    return classifier.predict(vectorizer.transform(list(tweet)))

In [9]:
# Build the feature set once: every featureset_prepare() call re-cleans both
# tweet corpora, so calling it separately for each element doubles the work.
features, labels = featureset_prepare()

# Spot-check one early (positive) and the last (negative) example.
print(features[1])
print(labels[1])
print(features[-1])
print(labels[-1])

['hey', 'james', 'odd', ':/', 'please', 'call', 'contact', 'centre', 'able', 'assist', ':)', 'many', 'thanks']
Positive
['hull', 'suppoer', 'expect', 'misserable', 'week', ':-(']
Negative


In [12]:
# Assemble the evaluation frame: one row per tweet, holding its cleaned tokens
# and its reference label.
df = pd.DataFrame(columns = ['Tweet', 'Real_sent'])
for column_name, column_data in (('Tweet', features), ('Real_sent', labels)):
    df[column_name] = pd.Series(column_data).values
df

Unnamed: 0,Tweet,Real_sent
0,"[top, engage, member, community, week, :)]",Positive
1,"[hey, james, odd, :/, please, call, contact, c...",Positive
2,"[listen, last, night, :), bleed, amazing, trac...",Positive
3,"[congrats, :)]",Positive
4,"[yeaaaah, yippppy, accnt, verify, rqst, succee...",Positive
...,...,...
9995,"[wanna, change, avi, usanele, :(]",Negative
9996,"[puppy, broke, foot, :(]",Negative
9997,"[where's, jaebum, baby, picture, :(, (]",Negative
9998,"[mr, ahmad, maslan, cook, :(]",Negative


In [13]:
# Read the token lists from `features` rather than from df['Tweet']: this cell
# overwrites df['Tweet'] with joined strings, so using the column as its own
# input would break a second run (the classifier would receive strings and the
# join would iterate over characters). With `features` the cell is idempotent.
df['RandomForest'] = rf_sentiment(features)
df['Tweet'] = [', '.join(map(str, tokens)) for tokens in features]
df['Vader'] = df['Tweet'].apply(vader_compound_score)
df['TextBlob'] = df['Tweet'].apply(textblob_sentiment)

# 1 where an approach's prediction matches the reference label, otherwise 0.
for accuracy_column, prediction_column in (
    ('TB_accuracy', 'TextBlob'),
    ('VA_accuracy', 'Vader'),
    ('RF_accuracy', 'RandomForest'),
):
    df[accuracy_column] = np.where(df['Real_sent'] == df[prediction_column], 1, 0)
df

Unnamed: 0,Tweet,Real_sent,RandomForest,Vader,TextBlob,TB_accuracy,VA_accuracy,RF_accuracy
0,"top, engage, member, community, week, :)",Positive,Positive,Positive,Positive,1,1,1
1,"hey, james, odd, :/, please, call, contact, ce...",Positive,Positive,Positive,Positive,1,1,1
2,"listen, last, night, :), bleed, amazing, track...",Positive,Positive,Positive,Positive,1,1,1
3,"congrats, :)",Positive,Positive,Positive,Positive,1,1,1
4,"yeaaaah, yippppy, accnt, verify, rqst, succeed...",Positive,Positive,Positive,Positive,1,1,1
...,...,...,...,...,...,...,...,...
9995,"wanna, change, avi, usanele, :(",Negative,Negative,Negative,Negative,1,1,1
9996,"puppy, broke, foot, :(",Negative,Negative,Negative,Negative,1,1,1
9997,"where's, jaebum, baby, picture, :(, (",Negative,Negative,Positive,Negative,1,0,1
9998,"mr, ahmad, maslan, cook, :(",Negative,Negative,Negative,Negative,1,1,1


In [14]:
# Report, per approach, the percentage of rows whose prediction matched the
# reference label (rows flagged 1 in the corresponding *_accuracy column).
for message, accuracy_column in (
    ('Accuracy of TextBlob approach is:', 'TB_accuracy'),
    ('Accuracy of Vader approach is:', 'VA_accuracy'),
    ('Accuracy of Random Forest approach is:', 'RF_accuracy'),
):
    correct_rows = len(df[df[accuracy_column] == 1])
    print(message, (correct_rows / len(df)) * 100, '%')

Accuracy of TextBlob approach is: 93.19 %
Accuracy of Vader approach is: 79.86999999999999 %
Accuracy of Random Forest approach is: 92.67999999999999 %
