In [1]:
class Sentiment:
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"
    
class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POSITIVE

### Load Data

In [2]:
import json

file_name = "./data/Books_small.json"

reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

reviews[5].sentiment

'POSITIVE'

### Prep Data

In [3]:
from sklearn.model_selection import train_test_split

# 33% of the review will be used for testing and give rnadom state to get smae thing everytime
training, test = train_test_split(reviews, test_size=0.33, random_state = 42)

In [4]:
print(training[0].text)

Vivid characters and descriptions. The author has created a tale that grabs your attention and I couldn't put it down.


In [5]:
train_x = [x.text for x in training]
train_y = [x.sentiment for x in training]

test_x = [x.text for x in test]
test_y = [x.sentiment for x in test]

### Bag Of Words Vectorization

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)

test_x_vectors = vectorizer.transform(test_x)

# Alternative way for fit/transform
#vectorizer.fir(train_x)
#train_x_vectors = vectorizer.trnasform(train_x)

print(train_x[0])
print(train_x_vectors[0].toarray())

Vivid characters and descriptions. The author has created a tale that grabs your attention and I couldn't put it down.
[[0 0 0 ... 0 0 0]]


### Classification

#### Linear SVM

In [22]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors, train_y)

print(test_x[0])
#test_x_vectors[0]

clf_svm.predict(test_x_vectors[0])

Every new Myke Cole book is better than the last, and this is no exception. If you haven't read the Shadow Ops series before start with Control Point, but go ahead and order Fortress Frontier and Breach Zone as well - you're going to want them.


array(['POSITIVE'], dtype='<U8')

### Decision Tree

In [27]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

clf_dec.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Naive Bayess

In [28]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()
clf_gnb.fit(train_x_vectors, train_y)

clf_gnb.predict(test_x_vectors[0])

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

### Logistic Regression