In [None]:
"""
1. dataset of sentences to overall sentiment
2. create a model from dataset of word to sentiment
2.5 gramatically break down sentence instead of word by word
3. use model for our own sentence input to output overall sentiment of sentence
3.5 if a word is not known --> do sentiment analysis on its dictionary definition or neutralize
"""

In [2]:
from nltk.corpus import wordnet
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
import re
from sklearn.model_selection import train_test_split

lm = WordNetLemmatizer()

In [5]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/fdwraith/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
col_names = ["sentiment", "ids", "date", "flag", "user", "text"]
df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    header=None, encoding="ISO-8859-1", names=col_names)

In [4]:
NEGATION_WORDS = ['not', 'no']
STOP_WORDS = stopwords.words('english')

for word in NEGATION_WORDS:
    STOP_WORDS.remove(word)

URL_PATTERN = r'((https://[^ ]*|(https://)[^ ]*|( www\.)[^ ]*))'
USER_PATTERN = '@[^\s]+'
PUNCTUATIONS = ['!', '?']

processed = []
for sentiment, tweet in zip(df['sentiment'], df['text']):
    tweet = tweet.lower()
    tweet = re.sub(URL_PATTERN, '', tweet)
    tweet = re.sub(USER_PATTERN, '', tweet)
    for p in PUNCTUATIONS:
        tweet = tweet.replace(p, '')
    for sw in STOP_WORDS:
        tweet = re.sub(r'\b{0}\b'.format(sw), '', tweet)
    for w in NEGATION_WORDS:
        tweet = re.sub(r'\b{0} \b'.format(w), '{0}_'.format(w), tweet)
        
    processed.append((sentiment, tweet))    
    
df = pd.DataFrame(data=processed, columns=['sentiment', 'tweet'])

df

Unnamed: 0,sentiment,tweet
0,0,"http://twitpic.com/2y1zl - awww, ' bummer. ..."
1,0,upset ' update facebook texting ... migh...
2,0,dived many times ball. managed save 50% ...
3,0,whole body feels itchy like fire
4,0,"no, ' not_behaving . ' mad. ' see ."
...,...,...
1599995,4,woke . no_school best feeling ever
1599996,4,thewdb.com - cool hear old walt interviews ...
1599997,4,ready mojo makeover ask details
1599998,4,happy 38th birthday boo alll time tupac ama...


In [5]:
X_train, X_test, y_train, y_test = train_test_split(df, df['sentiment'], test_size = 0.05, random_state=0)

model = {}
for sentiment, tweet in zip(X_train['sentiment'], X_train['tweet']):
    for word in tweet.split():
        word = lm.lemmatize(word)
        if word in model:
            count, avg = model[word]
            model[word] = (count+1, (count*avg+sentiment) / (count+1))
        else:
            model[word] = (1, sentiment)
                        
model

{'working': (12816, 1.5767790262172305),
 'double': (1316, 1.8085106382978717),
 'texas': (781, 1.772087067861715),
 'roadhouse': (27, 2.3703703703703702),
 'envy': (330, 1.5636363636363644),
 'everyone': (13206, 2.5957897925185445),
 'thats': (8601, 2.18858272293918),
 'not_working': (1366, 0.4158125915080529),
 'today.l': (1, 0),
 'nice': (19363, 2.9466508289004767),
 'day': (79907, 2.1176617818213757),
 'hehe,': (367, 3.4550408719346035),
 'thanks': (33405, 3.4724143092351416),
 'class': (5577, 1.515510130894745),
 'laughing': (853, 2.869871043376322),
 '.': (127615, 1.8608157348274081),
 'woot': (1077, 3.3611884865366757),
 'haha': (20783, 2.7226098253380027),
 'gonna': (21308, 1.8143420311620055),
 'go': (69619, 1.5360174664962138),
 'movie': (12675, 2.6104930966469437),
 "'": (498154, 1.749185994692401),
 'uncle': (690, 1.889855072463767),
 'iphone': (5673, 1.0618720253833986),
 'pissed': (1401, 0.5596002855103495),
 'use': (7059, 2.1068139963167547),
 'crappy': (1049, 0.65204957

In [9]:
def estimator(tweet):
    words = [lm.lemmatize(word) for word in tweet.split()]
    vals = [model[word][1] for word in words if word in model]
    
    if len(vals) == 0:
        return 2
    
    return sum(vals)/len(vals)

sentiment_pairs = []
for sentiment, tweet in zip(X_test['sentiment'], X_test['tweet']):
    estimated_sentiment = estimator(tweet)
    sentiment_pairs.append((sentiment, estimated_sentiment))
    
sentiment_pairs

[(0, 1.5839900727834506),
 (0, 1.5721726601580768),
 (0, 2.1396497664770506),
 (0, 1.6460139491615688),
 (4, 1.982592375292591),
 (0, 1.88795464805599),
 (4, 2.256543347769165),
 (4, 2.3322536784989154),
 (4, 1.9010275211968737),
 (0, 1.922437225440779),
 (4, 2.3074437296757626),
 (4, 1.9509461967257447),
 (4, 2.411185775436351),
 (4, 2.3311937128903115),
 (4, 2.3851317270011436),
 (0, 1.7662076819677497),
 (4, 2.3084115388123108),
 (4, 1.9854424972936946),
 (0, 2.6352219108586996),
 (0, 1.9555163777165645),
 (0, 1.4456268116108597),
 (4, 2.1148473996692805),
 (0, 1.9365477144265337),
 (4, 2),
 (4, 1.549972072275418),
 (0, 1.4786560550384784),
 (4, 1.9496209657950134),
 (4, 1.9220501868659907),
 (4, 2.21033806217282),
 (0, 1.5195641691473964),
 (4, 2.561419982646696),
 (0, 1.6259258935998013),
 (4, 3.2),
 (4, 1.8691001568520982),
 (0, 2.1093668118645024),
 (0, 1.5459708365310831),
 (0, 1.3905712973422995),
 (0, 2.2037020991555156),
 (0, 1.014029451134387),
 (4, 2),
 (4, 2.6830175777580

In [13]:
def compute_errors(sentiment_pairs):
    errors = [abs(est - sent) for (sent, est) in sentiment_pairs]
    avg_error = sum(errors)/len(errors)
    var_values = [(error - avg_error)**2 for error in errors]
    variance = sum(var_values)/len(var_values)

    return (avg_error, variance)
    

avg_error, variance = compute_errors(sentiment_pairs)
    
print('Average Error:', avg_error)
print('Variance:', variance)    

Average Error: 1.7595626170093652
Variance: 0.13392011178817942


In [15]:
rounded_sentiment_pairs = [(sent, 0 if est < 2 else 4) for (sent, est) in sentiment_pairs]

accuracy = len([1 for (sent, est) in rounded_sentiment_pairs if sent == est]) / len(rounded_sentiment_pairs)

avg_error, variance = compute_errors(rounded_sentiment_pairs)

print('Accuracy:', accuracy)
print('Average Error:', avg_error)
print('Variance:', variance)    

Accuracy: 0.7632375
Average Error: 0.94705
Variance: 2.8912962974979712
