In [1]:
# Download the corpora/stopwords
import nltk

# Uncomment the line below to open the NLTK download window,
# then use that window to download the "stopwords" corpus
# nltk.download()

In [2]:
import pickle
from typing import Generator, List

from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from spacy.lang.en import English

In [3]:
import spacy
from spacy.cli import download

# Load the spaCy pipeline used for lemmatization. The model package is only
# downloaded when it is not installed yet, so re-running the notebook does not
# reinstall it on every run.
# Only one en_core_web_* model is needed; "en_core_web_trf" is an alternative.
SPACY_MODEL = "en_core_web_md"

try:
    nlp = spacy.load(SPACY_MODEL)
except OSError:
    # spacy.load raises OSError when the model package cannot be found
    download(SPACY_MODEL)
    nlp = spacy.load(SPACY_MODEL)

✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_md')


In [4]:
# Load the pickled review dictionary; the cells below read its "POS" and "NEG" entries.
# presumably written by an earlier pipeline step (path is under data/interim) — TODO confirm
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open(r"data/interim/model_3/review_classes.pkl", "rb") as input_file:
    review_classes = pickle.load(input_file)

In [5]:
# See the first negative review to check if the dictionary was loaded correctly
# (the "NEG" entry is indexed by position and holds the raw review text)
review_classes["NEG"][0]

'We went to the hotel directly to book a room. The receptionist at first gave us a really high price then lowered it. When we asked how much it costs in qetales she gave us an 8.2 comission rate to dollars and I knew she was randomly goving us a non existing rate so then she "lowered" it to the real rate. After we agreed that the price includes breakfast the following morning we get to breakfast and received two plain pieces of toast and fruit while theother tables received also eggs and plantines. So I asked if we also get eggs abd plantines and she replied "no". How come we didn\'t get the same breakfast when we paid for it?? Not a good experience at all at the hotel. There are much better places in Antigua to stay.'

In [6]:
# Split the loaded dictionary into its two sentiment classes:
# raw positive ("POS") and raw negative ("NEG") reviews
positive_reviews = review_classes["POS"]
negative_reviews = review_classes["NEG"]

In [7]:
def sentences_to_words(sentences: List[str]) -> List[List[str]]:
    """Tokenize every text into a list of word tokens.

    Each item is converted with ``str`` and passed to gensim's ``simple_preprocess``
    (https://radimrehurek.com/gensim/utils.html#gensim.utils.simple_preprocess).

    :param sentences: texts to tokenize.
    :return: one token list per input text, in the same order.
    """
    # deacc=True strips accent marks from the tokens
    return [simple_preprocess(str(text), deacc=True) for text in sentences]

In [8]:
# Tokenize the raw reviews of each class (stopwords are still present at this point)
negative_words = sentences_to_words(negative_reviews)
positive_words = sentences_to_words(positive_reviews)

# Check the first 10 words of the first negative review
# (nothing is printed when that review has 10 tokens or fewer)
index = 0
if len(negative_words[index]) > 10:
    print(negative_words[index][0:10])

['we', 'went', 'to', 'the', 'hotel', 'directly', 'to', 'book', 'room', 'the']


In [9]:
def remove_stopwords(documents: List[List[str]]) -> List[List[str]]:
    """Drop English stopwords from every document.

    :param documents: tokenized documents (one list of tokens per document).
    :return: one list of tokens per document, without NLTK's English stopwords.
    """
    # Build the stopword lookup once instead of calling stopwords.words() for
    # every token; a set also gives constant-time membership tests.
    english_stopwords = frozenset(stopwords.words('english'))

    # NOTE(review): each document is stringified and tokenized again with
    # simple_preprocess before filtering; kept so the output stays unchanged.
    return [[word for word in simple_preprocess(str(doc)) if word not in english_stopwords]
            for doc in documents]

In [10]:
# Remove English stopwords from the tokenized reviews of both classes
positive_words_wo_stopwords = remove_stopwords(positive_words)
negative_words_wo_stopwords = remove_stopwords(negative_words)

# Statistics for the first negative review: token counts before and after stopword removal
print("Statistics for the first negative review")
print("- Number of words with stopwords: ", len(negative_words[0]))
print("- Number of words without stopwords: ", len(negative_words_wo_stopwords[0]))

Statistics for the first negative review
- Number of words with stopwords:  132
- Number of words without stopwords:  71


In [11]:
def learn_bigrams(documents: List[List[str]], min_count: int = 5, threshold: float = 10) -> Phraser:
    """Train a bigram (phrase) detection model on tokenized documents.

    https://radimrehurek.com/gensim/models/phrases.html#gensim.models.phrases.Phrases

    :param documents: tokenized documents used to learn the phrases.
    :param min_count: forwarded to ``Phrases``; words and bigrams with a lower
        total count are ignored.
    :param threshold: forwarded to ``Phrases``; score threshold for forming
        phrases (higher means fewer phrases).
    :return: the trained model reduced to a ``Phraser``. It is NOT applied here;
        index it with a token list (``model[tokens]``) to transform a document.
    """
    # Learn the bigram statistics from the corpus
    bigram = Phrases(documents, min_count=min_count, threshold=threshold)

    # Reduce the bigram model to its minimal functionality and return the model itself
    return Phraser(bigram)


def create_bigrams(bigram_model, documents: List[List[str]]):
    """Apply a trained bigram model to each tokenized document.

    :param bigram_model: object that is indexed with a token list (``bigram_model[tokens]``).
    :param documents: tokenized documents to transform.
    :return: list holding ``bigram_model[doc]`` for every document, in input order.
    """
    transformed = []
    for tokens in documents:
        transformed.append(bigram_model[tokens])
    return transformed

In [12]:
# Learn the bigram model on the stopword-free reviews of both classes together,
# then apply it to the negative reviews only
bigram_model = learn_bigrams(negative_words_wo_stopwords + positive_words_wo_stopwords)
negative_words_wo_stopwords_bigrams = create_bigrams(bigram_model, negative_words_wo_stopwords)

In [13]:
# Statistics for the first negative review: raw token count versus the count
# after stopword removal and bigram merging
print("Statistics for the first negative review")
print("- Number of words with stopwords: ", len(negative_words[0]))
print("- Number of words without stopwords and with bigrams: ", len(negative_words_wo_stopwords_bigrams[0]))

Statistics for the first negative review
- Number of words with stopwords:  132
- Number of words without stopwords and with bigrams:  68


In [14]:
def lemmatization(nlp: English, texts: List[List[str]], allowed_postags: List = None) -> List[List[str]]:
    """Lemmatize tokenized texts, keeping only tokens with selected POS tags.

    https://spacy.io/api/annotation

    :param nlp: spaCy pipeline; it is called on each text joined with single spaces.
    :param texts: tokenized texts (one list of tokens per text).
    :param allowed_postags: POS tags (compared against ``token.pos_``) to keep;
        defaults to ``['NOUN', 'ADJ', 'VERB', 'ADV']``.
    :return: for each text, the lemmas of the tokens whose POS tag is allowed.
    """
    if allowed_postags is None:
        allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']

    texts_out = []
    for sent in texts:
        # The pipeline works on plain text, so the tokens are re-joined first
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [15]:
# Lemmatize the negative reviews (already without stopwords and with bigrams),
# keeping the default POS tags of lemmatization()
negative_words_wo_stopwords_bigrams_pos = lemmatization(nlp, negative_words_wo_stopwords_bigrams)

In [16]:
# Statistics for the first negative review: token count before and after
# lemmatization with POS filtering
print("Statistics for the first negative review")
print("- Number of words without stopwords and with bigrams: ", len(negative_words_wo_stopwords_bigrams[0]))
print("- Number of words without stopwords, with bigrams and lemmnatization: ", len(negative_words_wo_stopwords_bigrams_pos[0]))

Statistics for the first negative review
- Number of words without stopwords and with bigrams:  68
- Number of words without stopwords, with bigrams and lemmnatization:  61


In [17]:
def tokenize(documents: List[str], bigram_model) -> List[List[str]]:
    """Run the full preprocessing pipeline on raw documents.

    Steps, in order: word tokenization, stopword removal, bigram merging with
    ``bigram_model`` and lemmatization with the notebook-level ``nlp`` pipeline.

    :param documents: raw texts.
    :param bigram_model: trained bigram model forwarded to ``create_bigrams``.
    :return: the output of ``lemmatization`` for the processed documents.
    """
    tokens = sentences_to_words(documents)
    tokens_wo_stopwords = remove_stopwords(tokens)
    tokens_with_bigrams = create_bigrams(bigram_model, tokens_wo_stopwords)
    return lemmatization(nlp, tokens_with_bigrams)

In [18]:
# Run the complete pipeline (tokenize, stopwords, bigrams, lemmas) on both classes.
# NOTE(review): this rebinds positive_words / negative_words, which earlier cells
# set to the plain sentences_to_words() output; re-running the statistics cells
# after this one will therefore report different counts.
positive_words = tokenize(positive_reviews, bigram_model)
negative_words = tokenize(negative_reviews, bigram_model)

In [19]:
# Compare the first negative review right after word tokenization (first line)
# with the same review after the full tokenize() pipeline (second line)
print(sentences_to_words(negative_reviews)[0])
print(negative_words[0])

['we', 'went', 'to', 'the', 'hotel', 'directly', 'to', 'book', 'room', 'the', 'receptionist', 'at', 'first', 'gave', 'us', 'really', 'high', 'price', 'then', 'lowered', 'it', 'when', 'we', 'asked', 'how', 'much', 'it', 'costs', 'in', 'qetales', 'she', 'gave', 'us', 'an', 'comission', 'rate', 'to', 'dollars', 'and', 'knew', 'she', 'was', 'randomly', 'goving', 'us', 'non', 'existing', 'rate', 'so', 'then', 'she', 'lowered', 'it', 'to', 'the', 'real', 'rate', 'after', 'we', 'agreed', 'that', 'the', 'price', 'includes', 'breakfast', 'the', 'following', 'morning', 'we', 'get', 'to', 'breakfast', 'and', 'received', 'two', 'plain', 'pieces', 'of', 'toast', 'and', 'fruit', 'while', 'theother', 'tables', 'received', 'also', 'eggs', 'and', 'plantines', 'so', 'asked', 'if', 'we', 'also', 'get', 'eggs', 'abd', 'plantines', 'and', 'she', 'replied', 'no', 'how', 'come', 'we', 'didn', 'get', 'the', 'same', 'breakfast', 'when', 'we', 'paid', 'for', 'it', 'not', 'good', 'experience', 'at', 'all', 'at',

In [20]:
# Persist the fully tokenized reviews of each class as pickle files
with open(r"data/interim/model_3/positive_words.pkl", "wb") as output_file:
    pickle.dump(positive_words, output_file)

with open(r"data/interim/model_3/negative_words.pkl", "wb") as output_file:
    pickle.dump(negative_words, output_file)

In [21]:
# Persist the trained bigram model next to the tokenized reviews
with open(r"data/interim/model_3/bigram_model.pkl", "wb") as output_file:
    pickle.dump(bigram_model, output_file)