In [None]:
"""
1. dataset of sentences to overall sentiment
2. create a model from dataset of word to sentiment
2.5 grammatically break down sentence instead of word by word
3. use model for our own sentence input to output overall sentiment of sentence
3.5 if a word is not known --> do sentiment analysis on its dictionary definition or neutralize
"""

In [1]:
from nltk.corpus import wordnet
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
import re
from sklearn.model_selection import train_test_split
import nltk.data
import spacy
import json
import random
# need to run first 'python -m spacy download en_core_web_sm'
# `sp` is the shared spaCy pipeline; later cells call it to get part-of-speech tags.
sp = spacy.load('en_core_web_sm')

# `lm` is the shared WordNet lemmatizer that later cells use to lemmatize tokens.
lm = WordNetLemmatizer()

def normalize(d):
    """Scale the values of ``d`` in place so that they sum to 1.

    Args:
        d: dict mapping keys to numeric weights or counts.

    Returns:
        None. ``d`` is modified in place.

    A dict that is empty, or whose values sum to zero, is left untouched:
    there is no mass to redistribute, and dividing by the zero total would
    raise ZeroDivisionError.
    """
    total = sum(d.values())
    if total == 0:
        return

    mult = 1.0 / total
    for key in d:
        d[key] *= mult

In [2]:
# Fetch the NLTK resources this notebook relies on (a no-op when up to date):
#   stopwords -> stop-word list used while cleaning tweets
#   punkt     -> sentence tokenizer loaded for the paragraph analysis
#   wordnet   -> corpus behind WordNetLemmatizer (`lm`); without it the first
#                lm.lemmatize() call fails with LookupError on a fresh machine
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lindseychin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lindseychin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
# The CSV has no header row, so the column names are supplied explicitly.
col_names = ["sentiment", "ids", "date", "flag", "user", "text"]
df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    names=col_names,
    header=None,
    encoding="ISO-8859-1",
)

# Map label 4 to 1 so that sentiment values lie in the 0-1 range.
df['sentiment'] = df['sentiment'].replace({4: 1})

# Work on the 10% hold-out slice of a seeded split rather than the full file.
X_train, X_test, y_train, y_test = train_test_split(
    df, df['sentiment'], test_size=0.10, random_state=0
)
df = X_test

In [11]:
NEGATION_WORDS = ['not', 'no']
STOP_WORDS = stopwords.words('english')

# Negation words are kept in the text (they are glued to the following word
# below), so they must not be stripped as stop words.
for word in NEGATION_WORDS:
    STOP_WORDS.remove(word)

URL_PATTERN = r'((https://[^ ]*|(http://)[^ ]*|( www\.)[^ ]*))'
# Raw string: '\s' inside a plain string literal is an invalid escape sequence.
USER_PATTERN = r'@[^\s]+'
PUNCTUATIONS = ['!', '?', '&quot;']

# Compile every pattern once up front instead of rebuilding them per tweet.
_URL_RE = re.compile(URL_PATTERN)
_USER_RE = re.compile(USER_PATTERN)
# One whole-word pattern per stop word, applied in list order; re.escape keeps
# any regex metacharacter in a stop word literal.
_STOP_WORD_RES = [re.compile(r'\b{0}\b'.format(re.escape(sw))) for sw in STOP_WORDS]
# "not good" -> "not_good": attach each negation to the word that follows it.
_NEGATION_SUBS = [(re.compile(r'\b{0} \b'.format(w)), '{0}_'.format(w)) for w in NEGATION_WORDS]

processed = []

# NOTE TO SELF: REMOVE PUNCTUATION BC SPACY
for sentiment, tweet in zip(df['sentiment'], df['text']):
    tweet = tweet.lower()
    tweet = _URL_RE.sub('', tweet)
    tweet = _USER_RE.sub('', tweet)
    for p in PUNCTUATIONS:
        tweet = tweet.replace(p, '')
    for stop_word_re in _STOP_WORD_RES:
        tweet = stop_word_re.sub('', tweet)
    for negation_re, joined in _NEGATION_SUBS:
        tweet = negation_re.sub(joined, tweet)

    processed.append((sentiment, tweet))

# From here on `df` holds the cleaned tweets: columns 'sentiment' and 'tweet'.
df = pd.DataFrame(data=processed, columns=['sentiment', 'tweet'])

In [13]:
"""NAIVE MODEL"""
X_train, X_test, y_train, y_test = train_test_split(df, df['sentiment'], test_size = 0.05, random_state=0)

# model maps lemma -> (number of occurrences, running mean of the sentiment
# labels of the training tweets the lemma appeared in).
model = {}
for label, text in zip(X_train['sentiment'], X_train['tweet']):
    for lemma in map(lm.lemmatize, text.split()):
        if lemma not in model:
            # First sighting: the mean is just this tweet's label.
            model[lemma] = (1, label)
            continue
        seen, mean = model[lemma]
        # Incremental mean update: fold the new label into the old average.
        model[lemma] = (seen + 1, (seen * mean + label) / (seen + 1))

In [None]:
"""SUPERVISED TRANSITON + EMISSION MATRICES"""
pos_tweets = X_train[X_train["sentiment"] == 1]

# Hidden states: nouns, plus verbs and adjectives split by sentiment polarity.
_HMM_STATES = ('NOUN', 'POS_VERB', 'POS_ADJ', 'NEG_VERB', 'NEG_ADJ')

# transition[prev][cur] starts as a zeroed count for every ordered state pair.
transition = {src: {dst: 0 for dst in _HMM_STATES} for src in _HMM_STATES}

# emission[state] will map word -> count; starts empty for every state.
emission = {state: {} for state in _HMM_STATES}

# spaCy coarse POS tags that are kept; every other token is dropped.
VALID_POS = {'NOUN', 'ADJ', 'VERB', 'PRON', 'PROPN'}

#name pending
def remap_pos(pos, val):
    """Translate a POS tag and a sentiment value into an HMM state name.

    Args:
        pos: part-of-speech tag string.
        val: sentiment value compared against the 0.5 threshold.

    Returns:
        'NOUN' for the tags 'NOUN', 'PRON' and 'PROPN' (``val`` is ignored);
        otherwise the tag prefixed with 'POS_' when ``val >= 0.5`` and with
        'NEG_' when it is not.
    """
    if pos in ('NOUN', 'PRON', 'PROPN'):
        return 'NOUN'
    if val >= 0.5:
        return 'POS_' + pos
    return 'NEG_' + pos

# Count state transitions and word emissions over the positive tweets.
for tweet in pos_tweets['tweet']:
    sentence = sp(tweet)
    cleaned_sentence = [(lm.lemmatize(word.text), word.pos_) for word in sentence if word.pos_ in VALID_POS]
    for i, (word, pos) in enumerate(cleaned_sentence):
        # Words without a learned sentiment cannot be assigned a state.
        if word not in model:
            continue
        pos = remap_pos(pos, model[word][1])

        # Record the emission before looking at the predecessor: a word's
        # emission count does not depend on whether the previous word is
        # known, so it must not be skipped by the `continue`s below.
        emission[pos][word] = emission[pos].get(word, 0) + 1

        if i == 0:
            continue
        prev_word, prev_pos = cleaned_sentence[i-1]
        if prev_word not in model:
            continue
        prev_pos = remap_pos(prev_pos, model[prev_word][1])
        transition[prev_pos][pos] += 1

# Convert counts to probabilities. A state that was never observed has no
# mass to normalise (dividing by its zero total would fail), so it is left
# as all-zero / empty.
for key in transition:
    if sum(transition[key].values()) != 0:
        normalize(transition[key])

for key in emission:
    if sum(emission[key].values()) != 0:
        normalize(emission[key])

transition

In [None]:
"""Baum-Welch approximation for Matrices"""

def distribute(values, num_partitions):
    """Assign a partition index to every element of ``values``.

    When there are at least ``num_partitions`` values they are cut into
    ``num_partitions`` contiguous chunks with ``np.array_split`` and each
    element receives the index of its chunk. With fewer values than
    partitions, each element gets a randomly chosen partition index.

    Returns:
        A list of ints, one per element of ``values``, in the same order.
    """
    if num_partitions > len(values):
        return [random.choice(range(num_partitions)) for _ in values]

    labels = []
    chunks = np.array_split(np.array(values), num_partitions)
    for index, chunk in enumerate(chunks):
        labels.extend([index] * len(chunk))
    return labels

def forward(values, tr, em, initial_dist):
    """Forward pass of the HMM forward-backward algorithm.

    Args:
        values: sequence of observations (words).
        tr: transition table, ``tr[prev_state][state]``.
        em: emission table, ``em[state][observation]``.
        initial_dist: ``initial_dist[state]``, probability of starting in state.

    Returns:
        dict ``alpha`` keyed by time step ``t`` (0-based int); ``alpha[t][s]``
        is the joint probability of the first ``t + 1`` observations and of
        being in state ``s`` at step ``t``. Empty dict for empty ``values``.

    Raises:
        KeyError: if an observation is missing from an emission row.
    """
    alpha = { t: {} for t in range(len(values)) }
    if len(values) == 0:
        return alpha

    states = list(tr.keys())

    for s in states:
        alpha[0][s] = initial_dist[s] * em[s][values[0]]

    for t in range(1, len(values)):
        for s in states:
            # Sum over every predecessor s_p: the probability of having been
            # in s_p at t-1 (alpha[t-1][s_p], not alpha[t-1][s]) times the
            # probability of moving s_p -> s, then emit the observation from s.
            arrive = sum(alpha[t-1][s_p] * tr[s_p][s] for s_p in states)
            alpha[t][s] = arrive * em[s][values[t]]

    return alpha

def backward(values, tr, em):
    """Backward pass of the HMM forward-backward algorithm.

    Args:
        values: sequence of observations (words); must be non-empty.
        tr: transition table, ``tr[state][next_state]``.
        em: emission table, ``em[state][observation]``.

    Returns:
        list ``beta`` with one dict per time step; ``beta[t][s]`` is the
        probability of the observations after step ``t`` given state ``s``
        at step ``t``. The last entry is 1 for every state.
    """
    horizon = len(values)
    beta = [{} for _ in range(horizon)]
    beta[-1] = {state: 1 for state in tr.keys()}

    # Walk from the second-to-last step back to the first.
    t = horizon - 2
    while t >= 0:
        upcoming = values[t + 1]
        for state in tr.keys():
            total = 0
            for successor in tr.keys():
                total += beta[t + 1][successor] * tr[state][successor] * em[successor][upcoming]
            beta[t][state] = total
        t -= 1
    return beta

def normalize_tr_em(tr, em):
    """Print ``tr``, then normalise every row of ``tr`` and ``em`` in place.

    Each row (the inner dict of a state) is passed to ``normalize``; the
    transition rows are processed first, then the emission rows.
    """
    print(tr)
    for table in (tr, em):
        for state in table.keys():
            normalize(table[state])

def bw(tweets, num_states=4):
    """Run a single Baum-Welch (EM) re-estimation pass over ``tweets``.

    Steps:
      1. Initialise: random row-normalised transitions, a uniform initial
         distribution, and emission counts obtained by spreading each
         tweet's words over the states with ``distribute``.
      2. E step: for every non-empty tweet compute ``alpha``/``beta`` with
         ``forward``/``backward`` and derive ``gamma`` (state posteriors)
         and ``xi`` (state-pair posteriors).
      3. M step: re-estimate transitions from ``xi`` and emissions from
         ``gamma``, then normalise both with ``normalize_tr_em``.

    Only one E/M iteration is performed; there is no convergence loop.

    Args:
        tweets: iterable of whitespace-tokenisable strings.
        num_states: number of hidden states; they are named 's_0', 's_1', ...

    Returns:
        Tuple ``(tr, em)``: ``tr[s1][s2]`` transition probabilities and
        ``em[s][word]`` emission probabilities.
    """
    states = ['s_{0}'.format(i) for i in range(num_states)]

    # initialize matrices
    tr = {}
    em = {}
    initial_distribution = {}
    for s_i in states:
        # random transition row, scaled to sum to 1
        tr[s_i] = {s_j: random.random() for s_j in states}
        normalize(tr[s_i])

        em[s_i] = {}
        initial_distribution[s_i] = 1.0 / num_states

    # Lemmatise every tweet once; both the initialisation and the E step
    # work on the same token sequences.
    sequences = [[lm.lemmatize(word) for word in tweet.split()] for tweet in tweets]

    # Initial emission counts: each word occurrence is credited to the state
    # chosen by distribute(); a newly seen word gets a zero entry in every
    # state so that all emission rows share the same vocabulary.
    for words in sequences:
        for word, i in zip(words, distribute(words, num_states)):
            s_i = states[i]
            if word not in em[s_i]:
                for s_j in states:
                    em[s_j][word] = 0
            em[s_i][word] += 1

    for s in em:
        normalize(em[s])

    # improve tr and em
    R = []
    for words in sequences:
        if len(words) == 0:
            continue

        # estimation
        # (alpha is a dict keyed by t, beta a list indexed by t; the forward
        # function might be changed to match the beta format)
        alpha = forward(words, tr, em, initial_distribution)
        beta = backward(words, tr, em)

        # E step
        xi = [ { s1: { s2: 0 for s2 in states } for s1 in states } for _ in range(len(words)) ]
        gamma = [ { s1: 0 for s1 in states } for _ in range(len(words)) ]
        for t in range(len(words)):
            gamma_denominator = sum([alpha[t][s] * beta[t][s] for s in states])
            for s1 in states:
                gamma[t][s1] = alpha[t][s1] * beta[t][s1] / gamma_denominator if gamma_denominator != 0 else 0

            if t == len(words) - 1:
                # The last position has no successor, so its xi stays zero.
                continue

            xi_denominator = sum([alpha[t][s1] * tr[s1][s2] * em[s2][words[t+1]] * beta[t+1][s2]
                                  for s1 in states for s2 in states])

            for s1 in states:
                for s2 in states:
                    xi[t][s1][s2] = alpha[t][s1] * tr[s1][s2] * em[s2][words[t+1]] * beta[t+1][s2] / xi_denominator \
                    if xi_denominator != 0 else 0

        R.append((words, gamma, xi))

    # M step
    for s1 in states:
        tr_denominator = sum([xi[t][s1][s] for s in states for (words, _, xi) in R for t in range(len(words)-1)])
        for s2 in states:
            tr_numerator = sum([xi[t][s1][s2] for (words, _, xi) in R for t in range(len(words)-1) ])
            tr[s1][s2] = tr_numerator / tr_denominator if tr_denominator != 0 else 0

        em_denominator = sum([gamma[t][s1] for (words, gamma, _) in R for t in range(len(words))])

        # Accumulate the per-word gamma mass in one pass over the corpus
        # instead of rescanning every token for every vocabulary word.
        em_numerators = {}
        for (seq_words, seq_gamma, _) in R:
            for t, w in enumerate(seq_words):
                em_numerators[w] = em_numerators.get(w, 0) + seq_gamma[t][s1]

        for w in em[s1]:
            if em_denominator == 0:
                em[s1][w] = 0
            else:
                em[s1][w] = em_numerators.get(w, 0) / em_denominator

    normalize_tr_em(tr, em)
    return (tr, em)

# Run the Baum-Welch pass on the tweets of the training rows labelled
# positive (sentiment == 1) and keep the resulting transition / emission tables.
pos_tweets = X_train[X_train["sentiment"] == 1]
bw_tr, bw_em = bw(pos_tweets['tweet'])

In [None]:
"""OLD HMM MODEL"""

# Two-state (pos / neg) tables. hmm_estimator() reads `transition_prob` and
# `emission_prob` as globals, so they have to be built before it is called;
# leaving this code commented out makes the evaluation cell fail with
# NameError on a fresh kernel.
transition_prob = {
    'neg': { 'pos': 0, 'neg': 0 },
    'pos': { 'pos': 0, 'neg': 0 }
}
emission_prob = {
    'neg': { },
    'pos': { }
}

def state_to_num(s):
    """Return 1 for the state name 'pos' and 0 for anything else."""
    return 1 if s == 'pos' else 0

def num_to_state(n):
    """Return 'pos' when ``n >= 0.5`` and 'neg' otherwise."""
    return 'pos' if n >= 0.5 else 'neg'

for sentiment, tweet in zip(X_train['sentiment'], X_train['tweet']):
    words = [lm.lemmatize(word) for word in tweet.split()]
    actual_state = num_to_state(sentiment)

    for word in words:
        if word not in model:
            continue

        # Alternative that was tried: count transitions between the states of
        # consecutive words (previous word's state -> current word's state).

        word_state = num_to_state(model[word][1])

        # transition stuff: state implied by the word -> state of the tweet label
        transition_prob[word_state][actual_state] += 1

        # emission stuff: a newly seen word gets a zero entry in both states
        if word not in emission_prob[actual_state]:
            emission_prob['pos'][word] = 0
            emission_prob['neg'][word] = 0
        emission_prob[actual_state][word] += 1

# Ideas noted for later:
# normalize -- subtract mean and divide by std
# min-max scaling
# add-one smoothing for naive bayes

# Normalize the rows of both tables; a row with no observations has nothing
# to scale and is left untouched.
for table in (transition_prob, emission_prob):
    for state in ('pos', 'neg'):
        if sum(table[state].values()) != 0:
            normalize(table[state])

# Display only the small transition table; emission_prob has one entry per word.
transition_prob

In [None]:
# ratios maps (share of positive-leaning words in a tweet, rounded to 3
# decimals) -> (number of tweets with that share, running mean of their labels).
ratios = { }
for label, text in zip(X_train['sentiment'], X_train['tweet']):
    scores = [model[lemma][1] for lemma in map(lm.lemmatize, text.split()) if lemma in model]

    n_pos = sum(1 for v in scores if v >= 0.5)
    n_neg = sum(1 for v in scores if v < 0.5)
    n_all = n_pos + n_neg

    # No scored words at all: treat the tweet as evenly split.
    if n_all == 0:
        ratio = 0.5
    else:
        ratio = round(n_pos * 1.0 / n_all, 3)

    if ratio not in ratios:
        ratios[ratio] = (1, label)
        continue
    seen, mean = ratios[ratio]
    # Incremental mean update for this ratio bucket.
    ratios[ratio] = (seen + 1, (seen * mean + label) / (seen + 1))

ratios

In [None]:
def estimator(tweet):
    """Estimate a tweet's sentiment as the mean score of its known words.

    Each whitespace-separated token is lemmatised with ``lm``; tokens whose
    lemma is present in ``model`` contribute their stored average sentiment.

    Returns:
        The arithmetic mean of the contributing scores, or 0.5 when no token
        of the tweet is known to the model.
    """
    scores = []
    for token in tweet.split():
        lemma = lm.lemmatize(token)
        if lemma in model:
            scores.append(model[lemma][1])

    return sum(scores) / len(scores) if scores else 0.5

def approximate_ratio(ratio):
    """Estimate the sentiment for a ``ratio`` that has no entry in ``ratios``.

    The known ratios are sorted; the result is the stored average of the
    nearest known ratio when ``ratio`` lies outside their range, and the
    midpoint of the two neighbouring known ratios (the closest one below and
    the closest one above) when it lies between them.

    Args:
        ratio: share of positive words, as computed by ``ratio_estimator``.

    Returns:
        The approximated average sentiment; 0.5 when ``ratios`` is empty.
    """
    from bisect import bisect_left

    rkeys = sorted(ratios.keys())

    # Nothing learned yet: fall back to the neutral value.
    if not rkeys:
        return 0.5

    if ratio <= rkeys[0]:
        return ratios[rkeys[0]][1]

    if ratio >= rkeys[-1]:
        return ratios[rkeys[-1]][1]

    # Index of the first known ratio that is >= ratio; since ratio is strictly
    # inside the range, both upper - 1 and upper are valid indices.
    upper = bisect_left(rkeys, ratio)
    if rkeys[upper] == ratio:
        return ratios[rkeys[upper]][1]

    return (ratios[rkeys[upper - 1]][1] + ratios[rkeys[upper]][1]) / 2

def ratio_estimator(tweet):
    """Estimate a tweet's sentiment from its share of positive-leaning words.

    Known words (lemmas present in ``model``) are split into those scoring
    ``>= 0.5`` and those scoring ``< 0.5``. The share of the first group,
    rounded to 3 decimals (0.5 when the tweet has no known word), is looked
    up in ``ratios``.

    Returns:
        The average sentiment stored for that share, or the result of
        ``approximate_ratio`` when the share has no entry in ``ratios``.
    """
    scores = [model[lemma][1] for lemma in map(lm.lemmatize, tweet.split()) if lemma in model]

    n_pos = sum(1 for v in scores if v >= 0.5)
    n_neg = sum(1 for v in scores if v < 0.5)
    n_all = n_pos + n_neg

    ratio = round(n_pos * 1.0 / n_all, 3) if n_all != 0 else 0.5

    if ratio in ratios:
        return ratios[ratio][1]

    # The approximation is returned without being cached in `ratios`
    # (caching was considered: ratios[ratio] = (1, approximate_ratio(ratio))).
    return approximate_ratio(ratio)

def hmm_estimator(tweet):
    """Estimate a tweet's sentiment with the two-state (pos / neg) HMM.

    Starting from a 50/50 belief, every known word updates the belief: the
    belief is first propagated through ``transition_prob`` and then weighted
    by the word's emission probability in each state; the result is
    re-normalised after every word.

    Words are skipped when they are not in ``model`` or have no entry in the
    emission tables, and also when both states give them zero likelihood
    (there is nothing to normalise in that case).

    Returns:
        The final belief that the tweet is positive (0.5 if no word applied).
    """
    words = [lm.lemmatize(word) for word in tweet.split()]

    tr = transition_prob
    em = emission_prob

    gl = { 'pos': 0.5, 'neg': 0.5 }

    for word in words:
        if word not in model or word not in em['pos'] or word not in em['neg']:
            continue

        # Predict: push the current belief through the transition table.
        pr = {
            'pos': tr['pos']['pos'] * gl['pos'] + tr['neg']['pos'] * gl['neg'],
            'neg': tr['pos']['neg'] * gl['pos'] + tr['neg']['neg'] * gl['neg']
        }

        # Update: weight the prediction by how likely each state emits the word.
        likelihood_pos = em['pos'][word] * pr['pos']
        likelihood_neg = em['neg'][word] * pr['neg']

        # Zero total mass carries no information and cannot be normalised;
        # keep the previous belief.
        if likelihood_pos + likelihood_neg == 0:
            continue

        gl['pos'] = likelihood_pos
        gl['neg'] = likelihood_neg

        # Alternative noted by the author (reported ~69% accuracy, correctness
        # unverified): scale gl directly by the transition row of the word's
        # own state, tr[word_state][...], instead of using the emissions.

        normalize(gl)

    return gl['pos']

def _paired_with_truth(predict):
    """Return [(true sentiment, predict(tweet)), ...] over the test split."""
    return [(sentiment, predict(tweet))
            for sentiment, tweet in zip(X_test['sentiment'], X_test['tweet'])]

# One (truth, estimate) list per estimator, computed in the same order as before.
sentiment_pairs = _paired_with_truth(estimator)
ratio_sentiment_pairs = _paired_with_truth(ratio_estimator)
hmm_sentiment_pairs = _paired_with_truth(hmm_estimator)

# The cells below evaluate whatever `sentiment_pairs` points at: the HMM results.
sentiment_pairs = hmm_sentiment_pairs

sentiment_pairs

In [None]:
def compute_errors(sentiment_pairs):
    """Summarise the absolute estimation errors of (truth, estimate) pairs.

    Args:
        sentiment_pairs: non-empty iterable of ``(sentiment, estimate)`` tuples.

    Returns:
        Tuple ``(avg_error, variance)``: the mean of ``|estimate - sentiment|``
        and the population variance of those absolute errors.
    """
    errors = []
    for actual, predicted in sentiment_pairs:
        errors.append(abs(predicted - actual))

    count = len(errors)
    avg_error = sum(errors) / count
    variance = sum([(err - avg_error) ** 2 for err in errors]) / count

    return (avg_error, variance)
    

# Error statistics of the raw (unrounded) estimates currently held in
# `sentiment_pairs`.
avg_error, variance = compute_errors(sentiment_pairs)
    
print('Average Error:', avg_error)
print('Variance:', variance)    

In [None]:
# Threshold every estimate at 0.5 to obtain a hard 0 / 1 prediction.
rounded_sentiment_pairs = [(sent, 0 if est < 0.5 else 1) for (sent, est) in sentiment_pairs]

# Classification accuracy: share of pairs whose hard prediction equals the label.
n_correct = sum(1 for (sent, est) in rounded_sentiment_pairs if sent == est)
accuracy = n_correct / len(rounded_sentiment_pairs)

avg_error, variance = compute_errors(rounded_sentiment_pairs)

print('Accuracy:', accuracy)
print('Average Error:', avg_error)
print('Variance:', variance)

In [None]:
import nltk.data

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Read the text through a context manager so the file handle is closed as
# soon as the contents are in memory.
with open("test.txt") as fp:
    data = fp.read()

# Paragraphs are separated by blank lines.
paragraphs = data.split('\n\n')

# One list of sentences per paragraph. The join/split round trip means a
# paragraph for which the tokenizer yields nothing still produces [''] (one
# empty sentence) rather than an empty list.
paragraph_sentences = [ '\n-----\n'.join(tokenizer.tokenize(paragraph)).split('\n-----\n') for paragraph in paragraphs ]

paragraph_sentences

In [None]:
# Score every sentence with the naive estimator, then average per paragraph.
paragraph_values = [list(map(estimator, paragraph)) for paragraph in paragraph_sentences]
averaged_values = [sum(scores) / len(scores) for scores in paragraph_values]

averaged_values

In [None]:
import matplotlib.pyplot as plt

# Average sentiment per paragraph, in document order.
fig, ax = plt.subplots(figsize=(20,10))
ax.plot(range(len(averaged_values)), averaged_values)
plt.show()

In [None]:
# gaussian_filter1d is part of the public scipy.ndimage namespace; the
# scipy.ndimage.filters submodule path is deprecated.
from scipy.ndimage import gaussian_filter1d

# Same per-paragraph curve as above, smoothed with a Gaussian kernel (sigma=1).
ysmoothed = gaussian_filter1d(averaged_values, sigma=1)
plt.figure(figsize=(20,10))
plt.plot(range(len(averaged_values)), ysmoothed)
plt.show()

In [None]:
# Quick check of the spaCy pipeline: print one POS tag per line for a sample
# sentence, then display its first token.
s = sp("Did it work?")
print(*(token.pos_ for token in s), sep='\n')

s[0]