In [1]:
import numpy as np
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

### Read the information from the files

In [2]:
def _read_review_nodes(path):
    """Parse the file at `path` with the lxml parser and return all of its <review_text> tags."""
    with open(path) as handle:
        markup = handle.read()
    return BeautifulSoup(markup, "lxml").find_all("review_text")


# One file per class: the source file is what determines each review's label later on.
positive_reviews = _read_review_nodes("data/positive.review")
negative_reviews = _read_review_nodes("data/negative.review")

In [3]:
# Fetch NLTK's English stopword list once and reuse it for both sets below.
english_stopwords = stopwords.words("english")

# Negation words ("no", "not", contractions ending in 't, ...) are excluded from the
# removal list -- presumably because they matter for sentiment.
negation_contractions = {word for word in english_stopwords if word.endswith("'t")}
negative_stopwords = negation_contractions | {"no", "nor", "not", "off"}

# Stopwords that the tokenizer will actually strip out.
new_stopwords = set(english_stopwords) - negative_stopwords

### Function that tokenizes and cleans the reviews

In [4]:
stemmer = PorterStemmer()


def custom_tokenizer(review):
    """Turn a raw review string into a list of stemmed tokens.

    The text is lower-cased and split with NLTK's `word_tokenize`. Tokens that
    appear in `new_stopwords` or are two characters or shorter are dropped, and
    the remaining tokens are reduced to their Porter stems.
    """
    words = word_tokenize(review.lower())
    return [
        stemmer.stem(word)
        for word in words
        if word not in new_stopwords and len(word) > 2
    ]

### Create a `word_index_map` to store an index for all the tokens, and create `positive_tokens` and `negative_tokens`

In [5]:
positive_tokens = []
negative_tokens = []
word_index_map = {}

# Positive reviews are processed first, so their vocabulary receives the lowest indices.
for reviews, token_lists in (
    (positive_reviews, positive_tokens),
    (negative_reviews, negative_tokens),
):
    for review in reviews:
        tokens = custom_tokenizer(review.text)
        token_lists.append(tokens)
        for token in tokens:
            # An unseen token takes the next free index, which equals the current map size;
            # tokens already present keep their existing index.
            word_index_map.setdefault(token, len(word_index_map))

### Populate the `data` numpy array to store features and labels

In [6]:
# One row per review; one column per vocabulary entry plus a trailing column for the label.
n = sum(len(docs) for docs in (positive_tokens, negative_tokens))
d = len(word_index_map)
data = np.zeros(shape=(n, d + 1))

In [7]:
# Reset the matrix so that re-running this cell never double-counts words.
# On a fresh run `data` is already all zeros, so this changes nothing.
data.fill(0)

# Positive reviews occupy the first rows (label 1); negative reviews follow (label 0).
labelled_reviews = [(tokens, 1) for tokens in positive_tokens]
labelled_reviews += [(tokens, 0) for tokens in negative_tokens]

for row, (tokens, label) in enumerate(labelled_reviews):
    for word in tokens:
        # Bag-of-words feature: count how often each vocabulary entry occurs in the review.
        data[row, word_index_map[word]] += 1
    # The last column stores the class label.
    data[row, -1] = label

### Shuffle the data and separate the features and labels

In [8]:
# Seed NumPy's global RNG so the in-place row shuffle is reproducible.
np.random.seed(0)
np.random.shuffle(data)

# Every column except the last is a feature; the last column is the label.
X, y = data[:, :-1], data[:, -1]

In [9]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [10]:
# `fit` returns the estimator itself, so its result must not be assigned to
# X_train: doing so replaced the training matrix with the classifier and made
# this cell fail when re-run.
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [11]:
# Confusion matrix: rows are the true labels, columns are the predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Fraction of test reviews whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)

[[158  34]
 [ 36 172]]
0.825
