# Load Spacy Model

In [1]:
import spacy
spacy.tokens.Token.set_extension("data", default=False, force=True)

nlp = spacy.load('en_core_web_lg')

## Import the wordlists


In [2]:
from typing import Dict, List

def load_wordlist(path: str) -> list:
    """Loads a list of words
    
    Loads a list of words from a supplied path into a list and returns it.
    It expects the wordlist to be a txt file in which every new line contains 
    one word. The function removes all new line characters ('\n') and ignores
    lines that start with a semicolon (;) or lines that are empty.
    
    Args:
        path: The path of the file that should be opened as a str
        
    Returns:
        A list of str that contains all words from the wordlist file without any
        new lines characters.
        For example:
        
        ['abrasive', 'abrupt', 'abruptly', 'abscond']
   
    Raises:
        FileNotFoundError: If the supplied path does not exist
    """
    lines: list
    with open(path) as f:
        lines = [line.replace('\n', '') for line in f.readlines() 
                  if not line.startswith(';') and not line.startswith('\n')]

    return lines

def load_lexicon() -> Dict[str, spacy.tokens.doc.Doc]:
    """Loads both sentiment's wordlists
    
    Loads the wordslists for both the negative and positive sentiment into a 
    dict with the sentiment as key. After the lists are retrieved from the files
    they are loaded into a spacy.tokens.doc.Doc.
    
    Returns:
        A dict mapping the negative and positive sentiment to a 
        spacy.tokens.doc.Doc.
    """
    negative_list = load_wordlist('data/sentiments/negative-words.txt')
    positive_list = load_wordlist('data/sentiments/positive-words.txt')
    return {
        'negative': nlp(' '.join(negative_list)),
        'positive': nlp(' '.join(positive_list))
    }

lexicon: Dict[str, spacy.tokens.doc.Doc] = load_lexicon()

In [3]:
from spacy.tokens import Token


def load_list():
    result = {}
    with open("data/sentiments/subjectivity.ttf") as f:
        for line in f.readlines():
            line_result = {}
            for entry in line.split(" "):
                temp = entry.split("=")
                if len(temp) == 2:
                    line_result[temp[0]] = temp[1].replace("\n", "")
            result[line_result['word1']] = line_result
    
    doc = nlp(' '.join([x for x in result.keys() if not '-' in x]))
    for token in doc:
        token._.data = result[token.text]

    return doc
lexicon = load_list()

In [4]:
def review_token(token: spacy.tokens.Token) -> bool:
    if token.is_punct: return False
    
    if token.pos_ in ['NOUN', 'ADJ', 'VERB']: return True
    
    if token.dep_ == 'neg': return True

def review_wordlist(sentence: str) -> str:
    """Removes stopwords and punctuation characters from a string of words
    
    Loads a string into a spacy.tokens.doc.Doc and removes all stopwords and
    punctuation characters. The remaining words are lemmatized.
    
    For example: 'Everything was fresh and delicious!' turns into 'fresh 
    delicious'.
    
    Args:
        sentence: The sentence that should be filtered
    
    Returns:
        The supplied str without any stopwords or punctuation characters.
    """
    doc = nlp(sentence)
    filtered_words = [token.lemma_ for token in doc if review_token(token)]
    return ' '.join(filtered_words)

print(review_wordlist('Crust is not good'))

crust be not good


In [5]:
def is_pos_matching(token, word):
    token_pos = token._.data['pos1']
    #print(token_pos, word.pos_)
    if token_pos == 'anypos':
        return True

    return token_pos.lower() == word.pos_.lower()

def most_similar(word: spacy.tokens.token.Token):
    result = {}
    if word.vector_norm:
        queries = {
            'negative': [],
            'positive': []
        }
        for token in lexicon:
            #print(is_pos_matching(token, word))
            if token.prob > -15 and token.vector_norm and is_pos_matching(token, word):
                sentiment = token._.data['priorpolarity']
                similarity = word.similarity(token)
                if sentiment == 'both':
                    queries['negative'].append(similarity)
                    queries['positive'].append(similarity)
                elif sentiment != 'neutral':
                    queries[sentiment].append(similarity)

        for sentiment, result_list in queries.items():
            
            result[sentiment] = sorted(result_list, key=lambda w: w, reverse=True)[:10]
    return result

print(most_similar(nlp("good")[0]))

{'negative': [0.7373284, 0.73550904, 0.6328432, 0.61780876, 0.60963994, 0.6093343, 0.59364223, 0.5919895, 0.5822323, 0.57866997], 'positive': [1.0, 0.84167075, 0.8176646, 0.79754514, 0.7774079, 0.77341545, 0.7254699, 0.71151584, 0.7089218, 0.70461506]}


# TODO

- Cleanup texts of markdown quotes (they mess with the actual text being written)
- Add some emphasize on where in the sentence a word is (subj, verb, idk?)
- ....

In [6]:
import numpy as np

def analyze(word_list):
    result = {
        'negative': [],
        'positive': []
    }
    for word in nlp(review_wordlist(word_list)):
        #print(word, word.pos_)
        
        similar_words = most_similar(word)
        #print(similar_words)
        if len(similar_words) > 0:
            for sentiment, s_word in similar_words.items():
                result[sentiment].extend(s_word)
    
    
    #print(result)
    
    negative = np.mean(sorted(result['negative'], reverse=True)[:10])
    positive = np.mean(sorted(result['positive'], reverse=True)[:10])
    
    #print(negative)
    #print(positive)
    
    if negative > positive:
        return 0
    elif negative < positive:
        return 1
    else:
        # Both are 0 or otherwise the same (very unlikely)
        return -999

#print(analyze('The food was bad'))
# Special cases: 
# - Would not go back. (gets completetly filtered by review_wordlist)

In [7]:
labelled = []
with open('data/yelp_labelled.txt') as f:
    for line in f.readlines():
        parts = line.split('\t')
        labelled.append({
            'sentence': parts[0],
            'original_sentiment': int(parts[1].replace('\n', ''))
        })

In [1]:
sentence = "Not tasty and the texture was just nasty."

print(analyze(sentence))

NameError: name 'analyze' is not defined

In [9]:
import random
counter = 0
limit = 50
extracted = []
random.shuffle(labelled)
for entry in labelled[:limit]:
    counter+=1
    sentence = entry['sentence']
    result = analyze(sentence)
    print(counter, sentence, result)
    entry['computed_sentiment'] = result
    if result is not entry['original_sentiment']:
        extracted.append(sentence)

1 The only thing I wasn't too crazy about was their guacamole as I don't like it puréed. 0
2 Main thing I didn't enjoy is that the crowd is of older crowd, around mid 30s and up. 1
3 Overall, I like this place a lot. 1
4 My sashimi was poor quality being soggy and tasteless. 0
5 Awful service. 0
6 The desserts were a bit strange. 0
7 We could not believe how dirty the oysters were! 0
8 Couldn't ask for a more satisfying meal. 1
9 I was disgusted because I was pretty sure that was human hair. 0
10 Overall, a great experience. 1
11 This hole in the wall has great Mexican street tacos, and friendly staff. 1
12 These were so good we ordered them twice. 1
13 Ordered a double cheeseburger & got a single patty that was falling apart (picture uploaded) Yeah, still sucks. 1
14 It's close to my house, it's low-key, non-fancy, affordable prices, good food. 1
15 The best place to go for a tasty bowl of Pho! 1
16 What SHOULD have been a hilarious, yummy Christmas Eve dinner to remember was the bigg

In [10]:
#print(extracted)
#for ex in extracted:
    #doc_dep = nlp(ex)
    #spacy.displacy.render(doc_dep, style='dep')

do = nlp("Crust was not good")

for chunk in do.noun_chunks:
    print(chunk)
    print(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text)


Crust
Crust Crust nsubj was


In [11]:
from bokeh.plotting import figure, show, output_notebook
import pandas as pd

output_notebook()

x = {
    'correct': 0,
    'wrong': 0,
    'not': 0
}

for entry in labelled:
    if 'computed_sentiment' in entry:
        if entry['computed_sentiment'] == -999:
            x['not'] +=1
        elif entry['computed_sentiment'] == entry['original_sentiment']:
            x['correct']+=1
        elif entry['computed_sentiment'] != entry['original_sentiment']:
            x['wrong']+=1

s = sum(x.values())
print(x['correct']/s)
print(x['wrong']/s)
print(x['not']/s)

            
data = pd.Series(x).reset_index(name='value')
p = figure(x_range=list(x.keys()))

p.vbar(source=data, width=.9, x ='index', top='value')
show(p)

0.68
0.32
0.0


In [68]:
# Evaluation

tn = 0
tp = 0
fn = 0
fp = 0

for entry in labelled:
    if 'computed_sentiment' in entry:
        original = entry['original_sentiment'] # 1 = Pos; 0 = Neg
        computed = entry['computed_sentiment']
        if original == 0:
            if computed == 0:
                tn+=1
            elif computed == 1:
                fp+=1
        elif original == 1:
            if computed == 0:
                fn+=1
            elif computed == 1:
                tp+=1

precision = (tp/(tp+fp))
recall = (tp/(tp+fn))
accuracy = (tp+tn)/(tp+tn+fp+fn)
f1 = 2 * (precision * recall)/(precision + recall)
    

eval_df = pd.DataFrame(columns=['self', 'text_blob'], index=['precision', 'recall', 'accuracy', 'f1'])



eval_df.at['precision', 'self'] = precision
eval_df.at['recall', 'self'] = recall
eval_df.at['accuracy', 'self'] = accuracy
eval_df.at['f1', 'self'] = f1

eval_df.at['precision', 'text_blob'] = 0
eval_df.at['recall', 'text_blob'] = 0
eval_df.at['accuracy', 'text_blob'] = 0
eval_df.at['f1', 'text_blob'] = 0





In [73]:
from bokeh.models import ColumnDataSource, NumeralTickFormatter, LabelSet, FactorRange
from bokeh.io import export_png

# score_type | self | text_blob
# precision  | ...  | ...
# recall     | ...  | ...
# accuracy   | ...  | ...
# f1         | ...  | ...
# ...

scores = ['precision', 'recall', 'accuracy', 'f1']
evals = ['self', 'text_blob'] # ...

x = [(score, result) for score in scores for result in evals]
counts = sum([(x, y) for x, y in eval_df.values], ())

source = ColumnDataSource(data=dict(x=x, counts=counts))

plot_eval = figure(x_range=FactorRange(*x))

plot_eval.vbar(x='x', top='counts', source=source, width=.8)

show(plot_eval)
export_png(plot_eval, 'presentation/src/eval_results.png')

RuntimeError: PhantomJS is not present in PATH or BOKEH_PHANTOMJS_PATH. Try "conda install phantomjs" or             "npm install -g phantomjs-prebuilt"