# Sentiment Analyzer

In [16]:
#import libraries and modules
import nltk 
import numpy as np

from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup

#Lemmatizer - transform words into base form - jumping = jump
wordnet_lemmatizer = WordNetLemmatizer()
stopwords = set(w.rstrip() for w in open('stopwords.txt'))

#load reviews 
positive_reviews = BeautifulSoup(open('positive.review').read())
positive_reviews = positive_reviews.findAll('review_text')

negative_reviews = BeautifulSoup(open('negative.review').read())
negative_reviews = negative_reviews.findAll('review_text')

In [17]:
#there are more positive reviews than negative reviews => class imbalance => shuffle positive and delete to have same range as negative 
np.random.shuffle(positive_reviews)
positive_reviews = positive_reviews[:len(negative_reviews)]

In [21]:
def my_tokenizer(s):
    s = s.lower()
    tokens = nltk.tokenize.word_tokenize(s)
    tokens = [t for t in tokens if len(t) > 2]
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens]
    tokens = [t for t in tokens if t not in stopwords]
    return tokens

positive_tokenized = []
negative_tokenized = []


#create word map index: 
word_map_index = {}
current_index = 0 
for review in positive_reviews:
    tokens = my_tokenizer(review.text)
    positive_tokenized.append(tokens)
    for token in tokens: 
        if token not in word_map_index: 
            word_map_index[token] = current_index
            current_index +=1
            
for review in negative_reviews:
    tokens = my_tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens: 
        if token not in word_map_index: 
            word_map_index[token] = current_index
            current_index +=1

In [25]:
# take each token and create an array of numbers 
def tokens_to_vector(tokens,label):
    x = np.zeros(len(word_map_index)+1)
    for t in tokens: 
        i = word_map_index[t]
        x[i] +=1
    x = x/x.sum()
    x[-1] = label
    return x

N = len(positive_tokenized) + len(negative_tokenized)

data = np.zeros((N, len(word_map_index) + 1))
i = 0 
for tokens in positive_tokenized: 
    xy = tokens_to_vector(tokens, 1)
    data[i,:] = xy
    i +=1
for tokens in negative_tokenized: 
    xy = tokens_to_vector(tokens, 0)
    data[i,:] = xy
    i +=1

np.random.shuffle(data)
X = data[:, :-1]
Y = data[:, -1]

X_train = X[:-100,]
Y_train = Y[:-100,]
X_test = X[-100:,]
Y_test = Y[-100:,]

model = LogisticRegression()
model.fit(X_train,Y_train)
print("Classification rate: ", model.score(X_test, Y_test))
 
    



Classification rate:  0.69


In [28]:
threshold = 0.5
for word, index in word_map_index.items():
    weight = model.coef_[0][index]
    if weight > threshold or weight < -threshold: 
        print(word, weight)

then -1.0530318332939599
try -0.6655543298093445
month -0.7304452979295445
customer -0.7028073495823965
wa -1.7141353233052259
bit 0.6148295650763265
price 2.666124444699022
little 0.9457783276909697
n't -1.9983609003381035
happy 0.6456968393719918
cable 0.6746659302892987
quality 1.413367532659135
perfect 1.0018262461800205
video 0.5539990353420746
've 0.7955454799005883
support -0.906694175945994
week -0.6594152536382365
sound 1.2572562124841644
comfortable 0.6624848324036833
doe -1.3139065428054573
card -0.632126403949423
lot 0.551475428183394
love 1.2264589545125453
ha 0.6869590230181629
speaker 0.9234256506862881
poor -0.7701178775649423
bad -0.7748307656174696
you 0.897126650038981
buy -0.9265717057038769
time -0.6047039664102574
hour -0.5966367395050187
using 0.7513983634825856
home 0.5121466390874151
space 0.6080725836080594
unit -0.9127356792965857
easy 1.8029240888552984
excellent 1.3827570753875917
recommend 0.6545909388674771
pretty 0.7231977722314218
memory 0.8049618563259