In [10]:
class Sentiment:
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"

class Review:
    def __init__ (self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POSITIVE

### Load Data

In [15]:
import json

reviews = []
with open("books_small.json") as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review["reviewText"], review["overall"]))


### Prepare Data

In [19]:
from sklearn.model_selection import train_test_split

training, test = train_test_split(reviews, test_size=0.33, random_state=42)

In [26]:
# X er ord
# Y er sentiment
# Så hva er sannsynlighetet for Y gitt X ord i en setning

In [34]:
train_x = [x.text for x in training]
train_y = [x.sentiment for x in training]

test_x = [x.text for x in training]
test_y = [x.sentiment for x in training]


### Bag of words

In [54]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x) # Fit and Transform

test_x_vectors = vectorizer.transform(test_x)

### Classifiers

#### Linear SVM

In [68]:
from sklearn import svm

clf_svm = svm.SVC(kernel="linear")
clf_svm.fit(train_x_vectors, train_y)

clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

#### Decision Tree

In [63]:
from sklearn.tree import DecisionTreeClassifier as DTC

clf_dec = DTC()
clf_dec.fit(train_x_vectors, train_y)

array(['POSITIVE'], dtype='<U8')

#### MultinomialNB Naive Bayes

In [75]:
from sklearn.naive_bayes import MultinomialNB
clf_mnb = MultinomialNB()
clf_mnb.fit(train_x_vectors, train_y)



clf_mnb.predict(train_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

#### Logistic Regression

In [72]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)

clf_log.predict(test_x_vectors[0])




array(['POSITIVE'], dtype='<U8')

### Evaluation

#### Average

In [81]:
print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
print(clf_mnb.score(test_x_vectors, test_y))
print(clf_log.score(test_x_vectors, train_y))

1.0
0.8865671641791045
0.8865671641791045
0.9955223880597015


#### F1 Score

In [91]:
from sklearn.metrics import f1_score

print(f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, 
               labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))

print(f1_score(test_y, clf_dec.predict(test_x_vectors), average=None, 
               labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))

print(f1_score(test_y, clf_mnb.predict(test_x_vectors), average=None, 
               labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))

print(f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, 
               labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))

[1. 1. 1.]
[0.93559322 0.59405941 0.40677966]
[0.93559322 0.59405941 0.40677966]
