In [19]:
!python --version

Python 3.8.5


### Data Class

In [None]:
# from google.colab import drive
# drive.mount('/content/gdrive', force_remount=True)

In [None]:
import random

class Sentiment:
    """String labels for the three sentiment classes used in this notebook."""

    NEGATIVE, NEUTRAL, POSITIVE = "NEGATIVE", "NEUTRAL", "POSITIVE"

class Review:
    """One review: its raw text, its numeric score, and the sentiment derived from it."""

    def __init__(self, text, score):
        self.text = text
        self.score = score
        # Derived once, at construction time, from the score stored above.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map ``self.score`` to a ``Sentiment`` label.

        A score of 2 or less is negative, a score of exactly 3 is neutral,
        and every other score is positive.
        """
        rating = self.score
        if rating <= 2:
            return Sentiment.NEGATIVE
        if rating == 3:
            return Sentiment.NEUTRAL
        return Sentiment.POSITIVE

class ReviewContainer:
    """Holds a list of reviews and exposes their texts and sentiment labels."""

    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        """Return the text of every review, in container order."""
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        """Return the sentiment label of every review, in container order."""
        return [x.sentiment for x in self.reviews]

    def evenly_distribute(self):
        """Rebalance the container in place to equal negative and positive counts.

        Reviews that are neither negative nor positive (i.e. neutral) are
        dropped. Both remaining classes are truncated to the size of the
        smaller one, so the result is balanced regardless of which class was
        in the majority. The kept reviews are then shuffled using the
        module-level ``random`` generator.
        """
        negative = [x for x in self.reviews if x.sentiment == Sentiment.NEGATIVE]
        positive = [x for x in self.reviews if x.sentiment == Sentiment.POSITIVE]
        # Truncate BOTH classes: shrinking only the positives leaves the data
        # unbalanced whenever negatives outnumber positives.
        keep = min(len(negative), len(positive))
        self.reviews = negative[:keep] + positive[:keep]
        random.shuffle(self.reviews)
        
        

### Load Data

In [None]:
import json

file_name = './Books_small_10000.json'

# The file is read as one JSON object per line; each object supplies the
# review text ('reviewText') and its score ('overall').
reviews = []
# Pin the encoding: JSON is UTF-8, but open() otherwise falls back to the
# platform default, which can mis-decode non-ASCII review text.
with open(file_name, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines instead of crashing in json.loads
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

# Quick look at the first parsed review.
print(reviews[0].sentiment)
print(reviews[0].text)

### Prep Data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing. A fixed random_state makes the
# split itself reproducible across kernel restarts.
training, test = train_test_split(reviews, test_size=0.20, random_state=42)

train_container = ReviewContainer(training)

test_container = ReviewContainer(test)

In [None]:
# Rebalance each split in place (neutral reviews are dropped by
# evenly_distribute), then pull out the parallel text / label lists.
train_container.evenly_distribute()
train_x, train_y = train_container.get_text(), train_container.get_sentiment()

test_container.evenly_distribute()
test_x, test_y = test_container.get_text(), test_container.get_sentiment()

# Class counts in the training labels, to let the reader check the balance.
print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))

#### Bag-of-words vectorization (TF-IDF weighted)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Fit the TF-IDF vectorizer (max_features=10000) on the training texts only,
# then reuse the fitted vectorizer to transform the test texts.
vectorizer = TfidfVectorizer(max_features=10000)
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

# Debug helpers, left disabled:
# print(train_x[0])
# print(train_x_vectors[0].toarray())
# Shape of a single vectorized training review (row 11).
print(train_x_vectors[11].shape)

In [None]:
import pickle
import os

# Persist the fitted vectorizer so it can be reloaded later without refitting.
os.makedirs("./models", exist_ok=True)  # the output directory may not exist yet
# Use a context manager so the file handle is flushed and closed instead of
# being left open by a bare open() call.
with open("./models/tfidf.pickle", "wb") as f:
    pickle.dump(vectorizer, f)

In [None]:
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that this notebook itself wrote.
with open('./models/tfidf.pickle', 'rb') as f:
    vectorizer = pickle.load(f)

# Re-vectorize with the reloaded vectorizer. This runs after the `with`
# block, so the pickle file is already closed while the transforms run.
train_x_vectors = vectorizer.transform(train_x)
test_x_vectors = vectorizer.transform(test_x)
print(train_x_vectors[11].shape)

## Classification

#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Train a default-configured logistic regression on the training vectors.
# fit() returns the estimator itself, so construction and fitting are chained.
clf_log = LogisticRegression().fit(train_x_vectors, train_y)

# Spot-check: predicted label for a single test review (row 5).
clf_log.predict(test_x_vectors[5])


## Evaluation

In [None]:
# Mean accuracy of the logistic-regression classifier on the test vectors/labels.
print(clf_log.score(test_x_vectors, test_y))

In [None]:
# F1 Scores
from sklearn.metrics import f1_score
# Per-class F1: average=None returns one score per entry in `labels`, in order.
# NEUTRAL is intentionally not listed: evenly_distribute() drops neutral
# reviews from both splits, so that label never appears in test_y and scoring
# it would only yield an ill-defined 0.0 plus a warning.
f1_score(test_y, clf_log.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE])




In [None]:
# Try the classifier on a hand-written example, using the same vectorizer
# that produced the training features.
test_set = ["good book"]
test_set_vector = vectorizer.transform(test_set)
clf_log.predict(test_set_vector)

In [None]:
# The classifier in this notebook is bound to `clf_log`; the bare name `clf`
# is never defined and raised NameError on a fresh kernel.
print(clf_log.score(test_x_vectors, test_y))

## Saving Model

#### Save model

In [None]:
import pickle

# Serialize the trained classifier to disk.
with open('./models/sentiment_classifier.pkl', 'wb') as model_file:
    pickle.dump(clf_log, model_file)

#### Load model

In [None]:
# Read the pickled classifier back from disk.
# (pickle.load should only ever be used on files you trust.)
with open('./models/sentiment_classifier.pkl', 'rb') as model_file:
    loaded_clf = pickle.load(model_file)

In [None]:
# Check the reloaded classifier: show the first test review's text, then the
# label the reloaded model predicts for its vector.
print(test_x[0])
loaded_clf.predict(test_x_vectors[0])