In [0]:
#Sentiment analyzer with nltk for dealing with tokenization of words and 
#Logistic Regression for classification; Other techniques/libraries include
#NLTK and Beautiful Soup.
#Uses Yelp reviews for input data.
#
#Based on LazyProgrammer's Sentiment Analysis @ 
#https://github.com/lazyprogrammer/machine_learning_examples/blob/master/nlp_class/sentiment.py
#Stopwords file from:
#http://www.lextek.com/manuals/onix/stopwords1.html

In [0]:
#import all the libraries
import nltk
#nltk.download("popular") #for Google Colab

import numpy as np

from future.utils import iteritems
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup #to build the word tree as a nested data structure
from google.colab import files

In [0]:
#this line is only necessary if you are using colaboratory
#data = files.upload()
#delete files uploaded with colab:
#!rm negative.review
#!rm positive.review


In [0]:
#Lemmatizer to root words to basic forms
wordnet_lemmatizer = WordNetLemmatizer()

In [0]:
#stop words are articles and other components of speech that don't really add context
#want to remove the stopwords
stopwords = set(w.rstrip() for w in open('stopwords.txt'))

In [0]:
#Use BeautifulSoup to work through the word trees
positive_reviews = BeautifulSoup(open('positive.review').read())
positive_reviews = positive_reviews.findAll('review_text')

In [0]:
#Use BeautifulSoup to work through the word trees
negative_reviews = BeautifulSoup(open('negative.review').read())
negative_reviews = negative_reviews.findAll('review_text')

In [0]:
#see how big these soups are


In [0]:
#fix the positive_review soup to be the same length as the negative, 
#but randomize entries first
np.random.shuffle(positive_reviews)
positive_reviews = positive_reviews[:len(negative_reviews)]

In [0]:
#Tokenizer
def my_tokenizer(s):
    s = s.lower() #consider lowercase versions of words
    tokens = nltk.tokenize.word_tokenize(s) #use NLTKs tokenizer
    tokens = [t for t in tokens if len(t) > 2] #get rid of short words
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] #lemmatize the current word
    tokens = [t for t in tokens if t not in stopwords] # cut the stopwords
    return tokens

In [0]:
#count up the new words, when observed
word_index_map = {} #make an empty array
current_index = 0 #start at position 0

In [0]:
#backup our tokenized arrays
positive_tokenized = []
negative_tokenized = []
orig_reviews = []

In [0]:
for review in positive_reviews:
    tokens = my_tokenizer(review.text)
    positive_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

for review in negative_reviews:
    tokens = my_tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

In [0]:
#take each token and build numerical array
def tokens_to_vector(tokens, label):
    x = np.zeros(len(word_index_map) + 1) # last element is for the label
    for t in tokens:
        i = word_index_map[t]
        x[i] += 1
    x = x / x.sum() # normalize it before setting label
    x[-1] = label
    return x

In [0]:
N = len(positive_tokenized) + len(negative_tokenized)

#assign zero array
data = np.zeros((N, len(word_index_map) + 1))

#counter to track example #
i = 0

#now loop through the positives
for tokens in positive_tokenized:
    xy = tokens_to_vector(tokens, 1)
    data[i,:] = xy
    i += 1

#now loop through the negatives
for tokens in negative_tokenized:
    xy = tokens_to_vector(tokens, 0)
    data[i,:] = xy
    i += 1

In [0]:
#shuffle everything again
np.random.shuffle(data)

In [0]:
X = data[:, :-1] #all the rows except last column
Y = data[:, -1] #just the last column

In [0]:
#split sets at position -100, the last 100 columns
Xtrain = X[:-100,]
Ytrain = Y[:-100,]
Xtest = X[-100:,]
Ytest = Y[-100:,]

In [120]:
#build the model
model = LogisticRegression()

#fit the model to the training data
model.fit(Xtrain, Ytrain)

#print the results!
print("Classification rate:", model.score(Xtest, Ytest))



Classification rate: 0.73


In [0]:
#classification accuracy of 73% match rate
#maybe logistic regression is not the best way to go with this classification task
#could try linear regression or a binary classification method

In [128]:
#so now going the threshold this to see what words are outside the threshold range
threshold = 0.5
for word, index in iteritems(word_index_map):
  weight = model.coef_[0][index]
  if weight > threshold or weight < -threshold:
    print (word, weight)

great 2.5522660430882635
for 1.9855946653860324
the -0.8599716927582726
you 0.8217295415341865
get -0.7471046075893204
back -1.0665135987144818
and 1.3308624250814878
price 1.6422856336829266
wa -0.9589979474814412
with 1.083209277692315
will -0.6813811815821088
memory 0.5864286073937015
they -0.6900979322823695
but -0.5922336796571961
perfect 0.5710921849022103
good 1.441334510170496
not -3.164017889934406
that -0.5724662221870235
sound 0.7803251216644143
excellent 0.8604058721658995
well 0.6605882127174212
used 0.6533468340810541
doe -0.7033137039239062
thing -0.5985298054189825
highly 0.5969834638643131
use 1.067658764804554
time -0.5108544663048894
out -0.8101020971103214
did -0.6049648478617873
n't -1.4331653139847305
are 1.0683896979642196
best 0.6725147513938603
very 1.023435954020525
buy -0.6402108370315929
then -0.6540882844032528
quality 0.983810343395668
after -1.078835384042973
cable 0.596328946973414
speaker 0.544350808841778
fast 0.5076075129105997
easy 0.9444163555076407