In [1]:
import nltk
import numpy as np
from sklearn.utils import shuffle

from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup

In [2]:
!wget https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/nlp_class/stopwords.txt

--2024-09-05 14:51:51--  https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/nlp_class/stopwords.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2488 (2.4K) [text/plain]
Saving to: ‘stopwords.txt’


2024-09-05 14:51:51 (29.2 MB/s) - ‘stopwords.txt’ saved [2488/2488]



In [3]:
!wget https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/nlp_class/electronics/positive.review

--2024-09-05 14:54:38--  https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/nlp_class/electronics/positive.review
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1105010 (1.1M) [text/plain]
Saving to: ‘positive.review’


2024-09-05 14:54:38 (16.6 MB/s) - ‘positive.review’ saved [1105010/1105010]



In [5]:
!wget https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/nlp_class/electronics/negative.review

--2024-09-05 14:59:47--  https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/nlp_class/electronics/negative.review
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1113512 (1.1M) [text/plain]
Saving to: ‘negative.review’


2024-09-05 14:59:47 (18.9 MB/s) - ‘negative.review’ saved [1113512/1113512]



In [7]:
# Tokenizer models used by nltk.tokenize.word_tokenize.
# NLTK releases from 3.8.2 on load the 'punkt_tab' resource instead of the pickled
# 'punkt' models, so fetch both to keep the notebook working on old and new versions.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [9]:
# WordNet corpus data for the WordNetLemmatizer created in the next cell
nltk.download(info_or_id='wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [10]:
wordnet_lemmatizer = WordNetLemmatizer()

# Stopword list, one word per line. The context manager closes the file handle
# promptly, and the explicit encoding keeps decoding independent of the platform default.
with open("stopwords.txt", encoding="utf-8") as stopwords_file:
    stopwords = set(w.rstrip() for w in stopwords_file)

# note: an alternative source of stopwords
# from nltk.corpus import stopwords
# stopwords.words('english')


def _load_review_texts(path):
    """Return every <review_text> element found in the file at `path`.

    The parser is named explicitly so the result does not depend on which
    optional parsers happen to be installed (BeautifulSoup otherwise guesses
    and emits a warning).
    """
    with open(path, encoding="utf-8") as review_file:
        soup = BeautifulSoup(review_file.read(), "html.parser")
    # find_all is the current name of the deprecated findAll alias
    return soup.find_all('review_text')


positive_reviews = _load_review_texts('positive.review')
negative_reviews = _load_review_texts('negative.review')

# Shuffle with a seeded generator so the selection and order of positive reviews
# (and therefore the vocabulary indices built from them) is the same on every run,
# then keep at most as many positive reviews as there are negative ones.
np.random.default_rng(42).shuffle(positive_reviews)
positive_reviews = positive_reviews[:len(negative_reviews)]

def my_tokenizer(s):
    """Convert raw text into a list of normalized word tokens.

    The text is lowercased and split with NLTK's word tokenizer. Tokens of two
    characters or fewer are discarded, the remaining ones are lemmatized, and
    any lemma found in the module-level `stopwords` set is dropped.

    NOTE(review): the stopword check is applied to the lemma, not to the raw
    token, so a stopword whose lemma differs from its surface form is kept.
    That is probably why "wa", "ha" and "doe" show up among the learned
    weights - confirm against the stopword list before changing.

    Args:
        s: the text to tokenize.

    Returns:
        The surviving lemmas, in their original order.
    """
    kept = []
    for word in nltk.tokenize.word_tokenize(s.lower()):
        # short tokens are probably not useful
        if len(word) <= 2:
            continue
        lemma = wordnet_lemmatizer.lemmatize(word)  # base form of the word
        if lemma not in stopwords:
            kept.append(lemma)
    return kept

# Vocabulary: every distinct token gets the next free integer index,
# in order of first appearance (positive reviews first, then negative).
word_index_map = {}

postive_tokenized = []
negative_tokenized = []
orig_reviews = []

for review_set, token_lists in (
    (positive_reviews, postive_tokenized),
    (negative_reviews, negative_tokenized),
):
    for review in review_set:
        tokens = my_tokenizer(review.text)
        token_lists.append(tokens)
        for token in tokens:
            # len() is evaluated before any insertion, so an unseen token
            # receives the next unused index and a known token is left alone
            word_index_map.setdefault(token, len(word_index_map))

# Next index that would be handed out; always equals the vocabulary size
current_index = len(word_index_map)

print("len(word_index_map):", len(word_index_map))

len(word_index_map): 10947


In [11]:
def tokens_to_vector(tokens, label, index_map=None):
    """Build one labelled feature row from a list of tokens.

    The row has one slot per vocabulary word holding that word's share of the
    token count (the word slots sum to 1), followed by a final slot holding
    the label.

    Args:
        tokens: iterable of tokens; each must be a key of the index map.
        label: value stored in the last slot of the row.
        index_map: mapping from token to column index. Defaults to the
            module-level `word_index_map`.

    Returns:
        A 1-D float array of length ``len(index_map) + 1``.

    Raises:
        KeyError: if a token is not present in the index map.
    """
    if index_map is None:
        index_map = word_index_map

    x = np.zeros(len(index_map) + 1)
    for t in tokens:
        x[index_map[t]] += 1

    # An empty token list would otherwise divide 0 by 0 and fill the row with
    # NaN, which later breaks model fitting; leave such a row at all zeros.
    total = x.sum()
    if total:
        x = x / total

    x[-1] = label
    return x


In [13]:
# Hold-out size and shuffle seed, named so they can be tuned in one place
TEST_SIZE = 100
SPLIT_SEED = 42

N = len(postive_tokenized) + len(negative_tokenized)
if N <= TEST_SIZE:
    raise ValueError(
        "Need more than %d reviews to hold out a test set, got %d" % (TEST_SIZE, N)
    )

# One row per review: word proportions followed by the label in the last column
# (1 = positive, 0 = negative).
data = np.zeros((N, len(word_index_map) + 1))
i = 0
for tokens in postive_tokenized:
    data[i, :] = tokens_to_vector(tokens, 1)
    i += 1

for tokens in negative_tokenized:
    data[i, :] = tokens_to_vector(tokens, 0)
    i += 1

# Shuffle rows with a seeded generator so the train/test split - and the
# classification rate printed below - is the same on every run.
np.random.default_rng(SPLIT_SEED).shuffle(data)

X = data[:, :-1]
Y = data[:, -1]

# The last TEST_SIZE rows are held out for testing; everything before them is training data
Xtrain = X[:-TEST_SIZE]
Ytrain = Y[:-TEST_SIZE]
Xtest = X[-TEST_SIZE:]
Ytest = Y[-TEST_SIZE:]

model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print("Classification rate:", model.score(Xtest, Ytest))

Classification rate: 0.73


In [14]:
# Print every vocabulary word whose learned coefficient is larger than the
# threshold in magnitude, in either direction.
threshold = 0.5
coefficients = model.coef_[0]
for word, index in word_index_map.items():
    weight = coefficients[index]
    if abs(weight) > threshold:
        print(word, weight)

little 0.9933092891806742
lot 0.7648187167509293
wa -1.7030613902230487
've 0.6703994772798042
time -0.7573678199921955
ha 0.7592361947215114
cable 0.6897554657093986
using 0.6449513832189717
laptop 0.5260366904912673
look 0.5568065852229601
doe -1.1889149614939687
n't -1.9344246409772492
software -0.5214239818048632
you 0.949236399172181
then -1.1634636211914406
try -0.6750439281351115
fit 0.5236231419956094
quality 1.4851175273127308
sound 0.9900293822824214
customer -0.6785056484288957
support -0.9169060076426888
happy 0.6316232621124119
picture 0.5147716793189806
buy -0.9308313734150936
company -0.5316606225157817
easy 1.859644892762086
hour -0.5293873644459413
unit -0.5954020548731874
memory 0.9395817183172597
space 0.6364135416491656
expected 0.5064996390912625
fast 0.9831978214573801
bad -0.8167980553989467
comfortable 0.6698036830405356
bit 0.6110531597575041
price 2.682610014541576
warranty -0.6664035932043131
value 0.5219647192039802
perfect 0.9746374916459574
home 0.59694593