In [91]:
import nltk
import numpy as np
import pandas as pd
import sklearn
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import precision_score, recall_score, f1_score,accuracy_score
from sklearn.svm import SVC

nltk.download('sentiwordnet')
nltk.download('wordnet')

[nltk_data] Downloading package sentiwordnet to C:\Users\Nathan
[nltk_data]     Hua\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Nathan
[nltk_data]     Hua\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [92]:
# read data from txt
def read_data(set_name, data_dir="data"):
    """
    Load one data split from a pair of plain-text files.

    Reads ``<data_dir>/<set_name>_text.txt`` and
    ``<data_dir>/<set_name>_labels.txt`` as UTF-8.

    Parameters
    ----------
    set_name : str
        Prefix of the split to load, e.g. "train", "val" or "test".
    data_dir : str, optional
        Directory holding the files. Defaults to "data".

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        A frame with a single "text" column holding every line of the text
        file (trailing newline kept), and a frame with a single "label"
        column holding the first character of every line of the label file.

    Raises
    ------
    OSError
        If either file cannot be opened.
    """
    text_path = data_dir + "/" + set_name + "_text.txt"
    label_path = data_dir + "/" + set_name + "_labels.txt"

    # Context managers guarantee both handles are closed, even on error.
    with open(text_path, "r", encoding="utf8") as text_file:
        x = text_file.readlines()
    with open(label_path, "r", encoding="utf8") as label_file:
        # Only the first character of each label line is kept.
        y = [line[0] for line in label_file.readlines()]

    return pd.DataFrame(x, columns=["text"]), pd.DataFrame(y, columns=["label"])
# Load the three splits in one pass: train, then val, then test.
(train_set_x, train_set_y), (val_set_x, val_set_y), (test_set_x, test_set_y) = (
    read_data(split_name) for split_name in ("train", "val", "test")
)

In [109]:
# stopwords.words() needs the NLTK "stopwords" corpus; fetch it here so the
# cell also runs on a machine where it has not been installed yet.
nltk.download('stopwords')

# English stop words plus punctuation / tokenizer artefacts that should not
# contribute to the sentiment score.
stopwords = set(nltk.corpus.stopwords.words("english"))
additional_stopwords = [".", ",", "'s", "``", "''", "'",
                        "n't", "%", "-", "$", "(", ")", ":",
                        ";", "@", "&", "'m", "user", "#", "!",
                        "?", "...", "a"]
stopwords.update(additional_stopwords)
lemmatizer = WordNetLemmatizer()
def penn_to_wn(tag):
    """
    Map a Penn Treebank POS tag onto the matching WordNet POS constant.

    The decision is made on the tag's prefix only:
    'J...' -> wn.ADJ, 'N...' -> wn.NOUN, 'R...' -> wn.ADV, 'V...' -> wn.VERB.
    Every other tag yields None.
    """
    if tag.startswith('J'):
        return wn.ADJ
    if tag.startswith('N'):
        return wn.NOUN
    if tag.startswith('R'):
        return wn.ADV
    if tag.startswith('V'):
        return wn.VERB
    # Tag has no WordNet counterpart.
    return None

# level 0-3: consider only consider_list[level] (wn.VERB, wn.NOUN, wn.ADJ, wn.ADV)
# level len(consider_list): consider all four parts of speech
consider_list = [wn.VERB, wn.NOUN, wn.ADJ, wn.ADV]
def get_senti_score(sentence, level=len(consider_list)):
    """
    Score one sentence with SentiWordNet.

    The sentence is tokenized, stop words are dropped, the remaining tokens
    are POS-tagged, and every token whose part of speech is selected by
    `level` is lemmatized and looked up in SentiWordNet. Only the first
    synset returned for a lemma is used.

    Parameters
    ----------
    sentence : str
        Text to score.
    level : int, optional
        Index into `consider_list` selecting the single part of speech to
        use (0 = verb, 1 = noun, 2 = adjective, 3 = adverb), or
        len(consider_list) (the default) to use all four.

    Returns
    -------
    list
        [mean of (pos_score - neg_score), mean of obj_score] over the tokens
        that were scored, or [0, 0] when no token could be scored.

    Raises
    ------
    ValueError
        If `level` is outside 0..len(consider_list).
    """
    # Reject out-of-range levels up front: a negative level would otherwise
    # silently index consider_list from the end.
    if not 0 <= level <= len(consider_list):
        raise ValueError(
            "level must be between 0 and " + str(len(consider_list))
            + ", got " + str(level)
        )

    # remove stop words
    # NOTE(review): membership is an exact, case-sensitive match, so a
    # capitalised stop word is presumably kept -- confirm this is intended.
    tokens = [tok for tok in nltk.word_tokenize(sentence) if tok not in stopwords]

    if level == len(consider_list):
        wanted_tags = consider_list
    else:
        wanted_tags = [consider_list[level]]

    sentiment = 0.0
    objective = 0.0
    tokens_count = 0
    for word, tag in nltk.pos_tag(tokens):
        wn_tag = penn_to_wn(tag)
        # Tags without a WordNet counterpart map to None and are skipped here too.
        if wn_tag not in wanted_tags: continue

        lemma = lemmatizer.lemmatize(word, pos=wn_tag)
        if not lemma: continue

        synsets = list(swn.senti_synsets(lemma, pos=wn_tag))
        if not synsets: continue

        # Use the first synset returned for this lemma / POS.
        swn_synset = synsets[0]

        sentiment += swn_synset.pos_score() - swn_synset.neg_score()
        objective += swn_synset.obj_score()
        tokens_count += 1
    if tokens_count == 0: return [0,0]
    return [sentiment / tokens_count, objective / tokens_count]
get_senti_score("It was a really good day")

[0.4583333333333333, 0.5416666666666666]

In [110]:
def build_sentence_dataset(set_x, set_y, level):
    """
    Turn a split into sentence-level features and labels.

    Every row's text is split into sentences; each sentence is scored with
    get_senti_score(sentence, level) and paired with the label of the row it
    came from.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Feature rows (one per sentence) and the matching labels.
    """
    features, labels = [], []
    for i in set_x.index:
        for sentence in nltk.tokenize.sent_tokenize(set_x.loc[i, "text"]):
            features.append(get_senti_score(sentence, level))
            labels.append(set_y.loc[i, "label"])
    return np.asarray(features), np.asarray(labels)

# Init SVC model (imported explicitly; `import sklearn` alone does not
# guarantee that the sklearn.svm submodule is loaded)
svm_clf_category = SVC(kernel="linear")

# loop to see every level output
for level in range(len(consider_list) + 1):
    # prepare data
    x, y = build_sentence_dataset(train_set_x, train_set_y, level)

    # Train model (fit() discards whatever was learned on the previous level)
    svm_clf_category.fit(x, y)

    # test with val set
    vx, vy = build_sentence_dataset(val_set_x, val_set_y, level)
    predictions = svm_clf_category.predict(vx)

    # zero_division=0 keeps the score sklearn already reports for a class that
    # is never predicted (0) but without emitting a warning on every level.
    print("Current level: " + str(level))
    print("precision_score: " + str(precision_score(vy, predictions, average='macro', zero_division=0)))
    print("recall_score: " + str(recall_score(vy, predictions, average='macro')))
    print("f1_score: " + str(f1_score(vy, predictions, average='macro', zero_division=0)))
    print("accuracy_score: " + str(accuracy_score(vy, predictions)))

Current level: 0
precision_score: 0.31031458762862457
recall_score: 0.364972303952805
f1_score: 0.3331214785197157
accuracy_score: 0.4613616776788279


  _warn_prf(average, modifier, msg_start, len(result))


Current level: 1
precision_score: 0.2985922637005483
recall_score: 0.3438745775714042
f1_score: 0.27188439819437876
accuracy_score: 0.42229244469979893


  _warn_prf(average, modifier, msg_start, len(result))


Current level: 2
precision_score: 0.3268760540814391
recall_score: 0.35327775038033193
f1_score: 0.2709234849496997
accuracy_score: 0.43263430048836543


  _warn_prf(average, modifier, msg_start, len(result))


Current level: 3
precision_score: 0.14823326630278655
recall_score: 0.3333333333333333
f1_score: 0.20520978325710879
accuracy_score: 0.44469979890835964


  _warn_prf(average, modifier, msg_start, len(result))


Current level: 4
precision_score: 0.3326684444558497
recall_score: 0.3667685297545497
f1_score: 0.30379089161185097
accuracy_score: 0.4527434645216892


  _warn_prf(average, modifier, msg_start, len(result))
