# Sentiment Analysis

In [2]:
# format code
# pip install nb_black
%load_ext nb_black

<IPython.core.display.Javascript object>

In [59]:
import csv
import random
import pandas as pd
import pickle
import warnings
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import cross_validate

<IPython.core.display.Javascript object>

## SentiWS
SentimentWortschatz, or SentiWS for short, is a publicly available German-language resource for sentiment analysis, opinion mining etc. It lists positive and negative polarity bearing words weighted within t    he interval of [-1; 1] plus their part of speech tag, and if applicable, their inflections. The current version of SentiWS (v2.0) contains around 1,650 positive and 1,800 negative words, which sum up to around 16,000 positive and around 18,000 negative word forms incl. their inflections, respectively. It not only contains adjectives and adverbs explicitly expressing a sentiment, but also nouns and verbs implici    tly containing one.

R. Remus, U. Quasthoff & G. Heyer: SentiWS - a Publicly Available German-language Resource for Sentiment Analysis.
In: Proceedings of the 7th International Language Ressources and Evaluation (LREC'10), 2010

In [4]:
# <Word>|<POS tag> \t <Polarity weight> \t <Infl_1>,...,<Infl_k> \n

<IPython.core.display.Javascript object>

In [5]:
def write_back(triples):
    with open("data/sentiWS/train.csv", "w", newline="") as f:
        the_writer = csv.writer(f)
        for triple in triples:
            the_writer.writerow(list(triple))


def clean(line):
    line = line.replace("\n", "").replace(" ", "").strip().lower()

    if line == "":
        return []

    line = line.split("\t")
    line[0] = line[0].split("|")
    words = []
    try:
        words = line[2].split(",")
    except:
        if len(line) == 3:
            if line[2] != "":
                words.append(line[2])

    words.append(line[0][0])
    triples = []  # (<Word>,<POS tag>,<Polarity weight>)
    for word in words:
        triples.append((word, line[0][1], line[1]))
    return triples


triples = []
with open("data/sentiWS/sentiWS_negative.txt", "r") as negative:
    for line in negative.readlines():
        clean_words = clean(line)
        triples = triples + clean_words
with open("data/sentiWS/sentiWS_positive.txt", "r") as positive:
    for line in positive.readlines():
        clean_words = clean(line)
        triples = triples + clean_words

write_back(triples)

<IPython.core.display.Javascript object>

In [37]:
class Sentiment:
    POSITIVE = "POSITIVE"
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"


class WordCase:
    def __init__(self, word, score):
        self.word = word
        self.score = score
        self.sentiment = self.get_sentiment_by_score()

    def __str__(self):
        return f"{self.word},{self.sentiment}"

    def get_sentiment_by_score(self):
        if self.score <= 0.0048 and self.score >= -0.0048:
            return Sentiment.NEUTRAL
        elif self.score < -0.0048:
            return Sentiment.NEGATIVE
        elif self.score > 0.0048:
            return Sentiment.POSITIVE


class WordCaseContainer:
    def __init__(self, word_cases):
        self.word_cases = word_cases

    def get_word(self):
        return [x.word for x in self.word_cases]

    def get_sentiment(self):
        return [x.sentiment for x in self.word_cases]

    def evenly_distribute(self):
        negative = list(
            filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.word_cases)
        )
        positive = list(
            filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.word_cases)
        )
        positive_shrunk = positive[: len(negative)]
        negative_shrunk = negative[: len(positive)]
        self.word_cases = negative_shrunk + positive_shrunk
        random.shuffle(self.word_cases)

<IPython.core.display.Javascript object>

In [26]:
with open("data/sentiWS/train.csv", "r") as f:
    f_reader = csv.reader(f)
    word_cases = []
    for line in f:
        line = line.replace("\n", "")
        line = line.split(",")
        try:
            word_cases.append(WordCase(line[0], float(line[2])))
        except:
            pass

<IPython.core.display.Javascript object>

In [27]:
print(word_cases[200])

aufschreien,NEUTRAL


<IPython.core.display.Javascript object>

In [47]:
training, test = train_test_split(word_cases, test_size=0.2, random_state=100)
train_container = WordCaseContainer(training)
test_container = WordCaseContainer(test)

<IPython.core.display.Javascript object>

In [48]:
train_container.evenly_distribute()
train_x = train_container.get_word()
train_y = train_container.get_sentiment()

test_container.evenly_distribute()
test_x = test_container.get_word()
test_y = test_container.get_sentiment()

print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))

4934
4934


<IPython.core.display.Javascript object>

In [49]:
vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

print(train_x[0])
print(train_x_vectors[0].toarray())

sonnigsten
[[0. 0. 0. ... 0. 0. 0.]]


<IPython.core.display.Javascript object>

In [50]:
warnings.filterwarnings("ignore", category=FutureWarning)

clfs_with_names = [
    ("Random Forest", RandomForestClassifier()),
    ("Logistic Regression", LogisticRegression()),
    ("SVM", SVC()),
    ("Naive Bayes", BernoulliNB()),
    ("Decision Tree", DecisionTreeClassifier()),
    ("k-nearest neighbor", KNeighborsClassifier()),
]

for name, clf in clfs_with_names:
    clf.fit(train_x_vectors, train_y)
    print(name, ":", clf.score(test_x_vectors, test_y))
    print(clf.predict(test_x_vectors[0]))

Random Forest : 0.5032546786004882
['POSITIVE']
Logistic Regression : 0.5061025223759154
['NEGATIVE']
SVM : 0.5061025223759154
['NEGATIVE']
Naive Bayes : 0.5061025223759154
['NEGATIVE']
Decision Tree : 0.5061025223759154
['NEGATIVE']
k-nearest neighbor : 0.5016273393002441
['NEGATIVE']


<IPython.core.display.Javascript object>

In [55]:
parameters = {"kernel": ("linear", "rbf"), "C": (1, 4, 8, 16, 32)}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})

<IPython.core.display.Javascript object>

In [56]:
print(clf.score(test_x_vectors, test_y))

0.5061025223759154


<IPython.core.display.Javascript object>

In [60]:
pickle.dump(clf, open("data/models/sentiment_classifier.pkl", "wb"))

<IPython.core.display.Javascript object>

### load

In [61]:
loaded_clf = pickle.load(open("data/models/sentiment_classifier.pkl", "rb"))
loaded_clf.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

<IPython.core.display.Javascript object>