In [31]:
import senta
# Load the sentiment analyzer once; later cells call analyzer.analyze(sentence) on it.
# NOTE(review): presumably this loads a model and may be slow — confirm against senta.
analyzer = senta.load()

In [28]:
import pandas as pd
import random
import time

def load_labelled(path, encoding=None):
    """Read one ``sentence<TAB>label`` file into a list of records.

    Args:
        path: Path to a tab-separated text file with one sample per line.
        encoding: Text encoding passed to ``open``. ``None`` keeps the
            platform default, which is what the notebook used before.

    Returns:
        A list of dicts with the keys ``'sentence'`` (str) and
        ``'original_sentiment'`` (int, parsed from the second column).

    Raises:
        ValueError: If a non-blank line has no tab-separated label or the
            label is not an integer.
    """
    records = []
    with open(path, encoding=encoding) as f:
        # Iterate the file lazily instead of materialising it with readlines().
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                # A blank (e.g. trailing) line carries no sample; skip it
                # instead of crashing on the missing label column.
                continue
            parts = line.rstrip('\n').split('\t')
            try:
                label = int(parts[1])
            except (IndexError, ValueError) as exc:
                raise ValueError(
                    f"{path}:{line_no}: expected 'sentence<TAB>label', got {line!r}"
                ) from exc
            records.append({
                'sentence': parts[0],
                'original_sentiment': label
            })
    return records


# The three labelled corpora, concatenated in this order.
LABELLED_FILES = (
    'data/yelp_labelled.txt',
    'data/amazon_cells_labelled.txt',
    'data/imdb_labelled.txt',
)

labelled = [
    record
    for labelled_file in LABELLED_FILES
    for record in load_labelled(labelled_file)
]

# Result table: one row per metric, one column per analyzer; filled in by later cells.
eval_df = pd.DataFrame(columns=['self', 'text_blob'], index=['precision', 'recall', 'accuracy', 'f1'])


In [33]:
from IPython.display import clear_output

# Run the Senta analyzer over the first `limit` labelled sentences, storing the
# result on each entry under 'computed_sentiment', and time the whole run.
counter = 0
limit = 10
extracted = []  # not used in this cell; kept so the name stays defined
#random.shuffle(labelled)

batch = labelled[:limit]
# The slice can be shorter than `limit`; measure progress against what is
# actually processed so the display reaches 100%.
total = len(batch)
last_reported = None

# perf_counter is monotonic, so the elapsed time cannot be skewed by clock adjustments.
start = time.perf_counter()
for counter, entry in enumerate(batch, start=1):
    entry['computed_sentiment'] = analyzer.analyze(entry['sentence'])
    # Integer arithmetic avoids float truncation (e.g. 29/100*100 -> 28).
    progress = counter * 100 // total
    # Report each 10% step once, not on every iteration that falls in the same bucket.
    if progress % 10 == 0 and progress != last_reported:
        last_reported = progress
        clear_output()
        print(f"{progress}%")

time_senta = time.perf_counter() - start
print(f"elapsed: {time_senta}")


100%
elapsed: 3172.0695326328278


In [98]:
# Evaluation
# Build the confusion matrix for the Senta results and derive precision,
# recall, accuracy and F1 from it. Only entries that already carry a
# 'computed_sentiment' (set by the analysis cell) are counted.

tn = 0
tp = 0
fn = 0
fp = 0

for entry in labelled:
    if 'computed_sentiment' not in entry:
        continue
    original = entry['original_sentiment'] # 1 = Pos; 0 = Neg
    computed = entry['computed_sentiment']
    # Pairs where either value is neither 0 nor 1 are left uncounted.
    if original == 0:
        if computed == 0:
            tn+=1
        elif computed == 1:
            fp+=1
    elif original == 1:
        if computed == 0:
            fn+=1
        elif computed == 1:
            tp+=1

# A metric whose denominator is zero (e.g. nothing was predicted positive, or
# nothing has been analysed yet) is reported as 0.0 instead of raising
# ZeroDivisionError.
evaluated = tp + tn + fp + fn
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
accuracy = (tp + tn) / evaluated if evaluated else 0.0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0

# Recreated here so this cell always starts from an empty table; the TextBlob
# cell has to run afterwards to fill its column.
eval_df = pd.DataFrame(columns=['self', 'text_blob'], index=['precision', 'recall', 'accuracy', 'f1'])

print(precision)

eval_df.at['precision', 'self'] = precision
eval_df.at['recall', 'self'] = recall
eval_df.at['accuracy', 'self'] = accuracy
eval_df.at['f1', 'self'] = f1


0.6907059421025902


In [100]:
# TextBlob eval
from textblob import TextBlob

# Confusion matrix and metrics for TextBlob over the labelled sentences,
# written to the 'text_blob' column of eval_df.
tn_tb = 0
tp_tb = 0
fn_tb = 0
fp_tb = 0

# perf_counter is monotonic, so the elapsed time cannot be skewed by clock adjustments.
start = time.perf_counter()
for entry in labelled:
    # NOTE(review): every entry built by the loading cell has this key, so this
    # evaluates the whole dataset, whereas the Senta cell only covers the first
    # `limit` entries — confirm the two runs are meant to be compared.
    if 'original_sentiment' in entry:
        blob = TextBlob(entry['sentence'])
        original = entry['original_sentiment'] # 1 = Pos; 0 = Neg
        # Polarity of exactly 0 is classified as positive.
        computed = 0 if blob.sentiment.polarity < 0 else 1
        if original == 0:
            if computed == 0:
                tn_tb+=1
            elif computed == 1:
                fp_tb+=1
        elif original == 1:
            if computed == 0:
                fn_tb+=1
            elif computed == 1:
                tp_tb+=1
time_tb = time.perf_counter() - start
print(f"elapsed: {time_tb}")

# A metric whose denominator is zero (e.g. an empty dataset or no positive
# predictions) is reported as 0.0 instead of raising ZeroDivisionError.
evaluated_tb = tp_tb + tn_tb + fp_tb + fn_tb
precision_tb = tp_tb / (tp_tb + fp_tb) if (tp_tb + fp_tb) else 0.0
recall_tb = tp_tb / (tp_tb + fn_tb) if (tp_tb + fn_tb) else 0.0
accuracy_tb = (tp_tb + tn_tb) / evaluated_tb if evaluated_tb else 0.0
f1_tb = 2 * (precision_tb * recall_tb) / (precision_tb + recall_tb) if (precision_tb + recall_tb) else 0.0

eval_df.at['precision', 'text_blob'] = precision_tb
eval_df.at['recall', 'text_blob'] = recall_tb
eval_df.at['accuracy', 'text_blob'] = accuracy_tb
eval_df.at['f1', 'text_blob'] = f1_tb

elapsed: 0.4325084686279297


In [106]:
from bokeh.models import ColumnDataSource, NumeralTickFormatter, LabelSet, FactorRange
from bokeh.io import export_png
from bokeh.plotting import figure, show, output_notebook
import math

# Grouped bar chart of the evaluation metrics, one pair of bars per metric.
#
# eval_df layout:
# score_type | self | text_blob
# precision  | ...  | ...
# recall     | ...  | ...
# accuracy   | ...  | ...
# f1         | ...  | ...

scores = ['precision', 'recall', 'accuracy', 'f1']
evals = ['SentA', 'TextBlob'] # display names for the 'self' and 'text_blob' columns

# Nested categorical factors: (metric, analyzer), metric-major.
x = [(score, result) for score in scores for result in evals]

# Select rows and columns explicitly so the values line up with `x` even if
# eval_df's row or column order ever changes; flatten row by row.
metric_table = eval_df.loc[scores, ['self', 'text_blob']]
counts = tuple(float(value) for value in metric_table.to_numpy().ravel())
counts_percentage = tuple("%.2f" % value for value in counts)
# One colour per analyzer, repeated for every metric.
color = ('#f78911', '#18b0ea') * len(scores)

source = ColumnDataSource(data=dict(x=x, counts=counts, percentage=counts_percentage, color=color))

plot_eval = figure(width=1500, height=800, x_range=FactorRange(*x))

plot_eval.vbar(x='x', top='counts', source=source, width=.8, color="color")

plot_eval.yaxis.major_label_text_font_size="20pt"
plot_eval.xaxis.major_label_text_font_size="20pt"

# render_mode is not passed: 'canvas' was the default and the property no
# longer exists in Bokeh 3.x, where passing it raises an error.
labels = LabelSet(x='x', y='counts', text='percentage', level='glyph',
                  x_offset=0, y_offset=0, source=source,
                  text_font_size="20pt", text_align="center")

plot_eval.add_layout(labels)
plot_eval.xgrid.grid_line_color = None
plot_eval.ygrid.grid_line_color = None

# NOTE(review): output_notebook() is imported but never called in the visible
# cells — confirm whether the plot is meant to render inline.
show(plot_eval)
#export_png(plot_eval, 'presentation/src/eval_results.png')

In [107]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import DatetimeTickFormatter
import datetime

# Bar chart comparing the elapsed time of the two analyzers, using the
# durations (in seconds) measured by the earlier cells.
tool_names = ['Senta', 'TextBlob']
elapsed_seconds = [time_senta, time_tb]

source_time = ColumnDataSource(data={
    'time': elapsed_seconds,
    't': tool_names,
    # Each duration rendered via timedelta's string form; used as the bar label.
    'formatted': [str(datetime.timedelta(seconds=secs)) for secs in elapsed_seconds],
    'color': ['#f78911', '#18b0ea'],
})

plot_time = figure(width=1500, height=800, x_range=list(tool_names))
plot_time.vbar(x='t', top='time', width=0.8, color="color", source=source_time)

# Axis styling: labelled y axis, large tick labels, no grid lines.
plot_time.yaxis.axis_label = "Seconds"
plot_time.xaxis.major_label_text_font_size = "20pt"
plot_time.yaxis.major_label_text_font_size = "20pt"
plot_time.ygrid.grid_line_color = None
plot_time.xgrid.grid_line_color = None

# Formatted duration drawn at the top of each bar.
labels_time = LabelSet(
    x='t',
    y='time',
    text='formatted',
    x_offset=0,
    y_offset=0,
    source=source_time,
    text_font_size="20pt",
    text_align="center",
)
plot_time.add_layout(labels_time)

show(plot_time)