# Real-World Python Machine Learning Tutorial w/ Scikit Learn (sklearn basics, NLP, classifiers, etc)

## Installation

pipenv install sklearn 

## Import

In [1]:
import json
import random

## Classes

In [2]:
class Sentiment:
    NEGATIVE = 'NEGATIVE'
    NEUTRAL = 'NEUTRAL'
    POSITIVE = 'POSITIVE'

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POSITIVE
        
class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews
        
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
        
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        
        print(len(negative))
        print(len(positive))
        
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)
    

## Load Data

In [3]:
# file_name = './data/sentiment/Books_small.json'
file_name = './data/sentiment/Books_small_10000.json'

reviews = []



with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))
        

## Prep Data

#### Split to training and test

In [4]:
from sklearn.model_selection import train_test_split

training, test = train_test_split(reviews, test_size=0.33, random_state=42)

train_container = ReviewContainer(training)
train_container.evenly_distribute()
test_container = ReviewContainer(test)
test_container.evenly_distribute()

436
5611
208
2767


In [5]:
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_x = test_container.get_text()
test_y = test_container.get_sentiment()

#### Bag of Words 

In [24]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# vectorizer = CountVectorizer()
vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)

test_x_vectors = vectorizer.transform(test_x)

## Classification

### Linear SVM

In [25]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)

clf_svm.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

### Decision Tree

In [26]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

clf_dec.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Naive Bayes

In [27]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()
clf_gnb.fit(train_x_vectors.todense(), train_y)

clf_gnb.predict(test_x_vectors[0].todense())



array(['NEGATIVE'], dtype='<U8')

### Logistic Regression

In [28]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_x_vectors.todense(), train_y)

clf_log.predict(test_x_vectors[0].todense())



array(['NEGATIVE'], dtype='<U8')

## Evaluation

### These are MEAN accuracy

In [29]:
print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
print(clf_gnb.score(test_x_vectors.todense(), test_y))
print(clf_log.score(test_x_vectors.todense(), test_y))

0.8076923076923077
0.6418269230769231
0.6610576923076923
0.8052884615384616




### F1 Scores

#### (Which is what we mainly care about)

In [30]:
from sklearn.metrics import f1_score

print(f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
print(f1_score(test_y, clf_dec.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
print(f1_score(test_y, clf_gnb.predict(test_x_vectors.todense()), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
print(f1_score(test_y, clf_log.predict(test_x_vectors.todense()), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))

[0.80582524 0.         0.80952381]
[0.63390663 0.         0.64941176]
[0.65693431 0.         0.66508314]
[0.80291971 0.         0.80760095]


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


##### The models so far are very good for positive but aweful for neutral and negative



In [31]:
print(train_y[0:5])
train_y.count(Sentiment.NEGATIVE)

['POSITIVE', 'POSITIVE', 'NEGATIVE', 'POSITIVE', 'POSITIVE']


436

The train data is biased through postiive data  
We'll use a bigger dataset (CHANGED in the Load Data section above)  
And now work to evenly distribute negative and positive reviews in out train and test sets

The mean score has gotten worse  
But the F1 scores has gotten better

#### Test with our data

In [34]:
test_set = ['worst Ive ever seen', 'wowww', 'it is ok I guess']
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['NEGATIVE', 'POSITIVE', 'NEGATIVE'], dtype='<U8')

#### We now try to improve it with tfidf vectorizer
The thing with CountVector is that it weighs all words equally regardless of meaning

### Tuning our model (with Grid Search)
It enables to test the models with multiple parameters
CV stands for cross validating

In [42]:
from sklearn.model_selection import GridSearchCV

parameters = {
    'kernel': ('linear', 'rbf'),
    'C': (1,4,6,16,32)
}

svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)
print(f"Best Params: {clf.best_params_}")

Best Params: {'C': 4, 'kernel': 'rbf'}


In [43]:
print(f"Mean Score: {clf.score(test_x_vectors, test_y)}")
print(f"F1 Score: {f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE])}")

Mean Score: 0.8197115384615384
F1 Score: [0.80582524 0.80952381]


## Saving the Model

In [46]:
import pickle

with open('./models/sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)

### Load the Model

In [48]:
with open('./models/sentiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)
loaded_clf.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')