# Pre-Processing for Models

### Import relevant libraries

In [None]:
# import relevant libraries

# for data manipulation
import numpy as np
import pandas as pd

# for NLP
import nltk
from nltk import pos_tag
from nltk import ngrams
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from collections import defaultdict
import re

# for counter
from tqdm import tqdm

### Import data before pre-processing

In [None]:
# import the data
df = pd.read_csv('df_reviews_final_binary.csv')

In [None]:
# check shape
df.shape

(660645, 14)

In [None]:
# are there reviews with no words
df['review_comments'].isna().any()

False

In [None]:
# check the ratings
df['review_rating'].value_counts()

review_rating
positive    654007
negative      6638
Name: count, dtype: int64

In [None]:
# assign review column to a new dataframe
data = df['review_comments']

### Pre-processing of the data

##### Already included pre-processing steps

The first pre-processing script already removed special characters and lowercased the text. In addition, names and locations were filtered out using named entity recognition (NER), as these are highly diverse and do not contribute much to the sentiment. This also reduces the feature space of the resulting document-term matrix and eases computation.

The further pre-processing steps for the models are lemmatization, stemming, and stopword removal, which further decrease the feature space of the document-term matrix. We take unigrams, bigrams, and trigrams into account.

##### Tokenization (list of lists)

In [None]:
# if tokenizer not installed yet
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ucloud/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# tokenize with list comprehension
tokenized_texts = [word_tokenize(text) for text in tqdm(data)]

100%|██████████| 660645/660645 [01:47<00:00, 6145.12it/s]


In [None]:
tokenized_texts[0]

['great',
 'location',
 'close',
 'to',
 'main',
 'public',
 'transport',
 'easy',
 'to',
 'get',
 'anywhere',
 'and',
 'get',
 'back',
 'super',
 'easy',
 'check',
 'in',
 'and',
 'out',
 'very',
 'responsive',
 'clean',
 'and',
 'well',
 'stocked',
 'thanks']

##### Lemmatization of words

In [None]:
# if lemmatizer not installed yet
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/ucloud/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# if pos tagger not installed yet
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ucloud/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
# initiate lemmatizer
lemmatizer = WordNetLemmatizer()

In [None]:
lemmatized_texts = []

# lemmatize every review, one token list at a time
for texts in tqdm(tokenized_texts):
    tokens = []
    # tag the whole review in a single call: the tagger can then use the
    # neighbouring words as context, and it is invoked once per review
    # instead of once per token
    for token, penn_tag in pos_tag(texts):
        # the first letter of the tag identifies the coarse word class
        tag = penn_tag[0].lower()
        # adjectives are tagged 'J…' by the tagger but the lemmatizer expects 'a'
        if tag == 'j':
            tag = 'a'
        # if word belongs to certain class, lemmatization is considered
        if tag in ('n', 'v', 'r', 'a'):
            tokens.append(lemmatizer.lemmatize(token, tag))
        # if not append word without processing
        else:
            tokens.append(token)
    lemmatized_texts.append(tokens)

100%|██████████| 660645/660645 [44:13<00:00, 248.99it/s]  


In [None]:
lemmatized_texts[0]

['great',
 'location',
 'close',
 'to',
 'main',
 'public',
 'transport',
 'easy',
 'to',
 'get',
 'anywhere',
 'and',
 'get',
 'back',
 'super',
 'easy',
 'check',
 'in',
 'and',
 'out',
 'very',
 'responsive',
 'clean',
 'and',
 'well',
 'stock',
 'thanks']

##### Stemming of words

In [None]:
# initiate stemmer
stemmer = PorterStemmer()

In [None]:
# apply stemming on the words
stemmed_texts = [[stemmer.stem(token) for token in token_list] for token_list in tqdm(lemmatized_texts)]

100%|██████████| 660645/660645 [05:38<00:00, 1951.73it/s]


##### Remove stopwords

In [None]:
# if stopwords not installed yet
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/ucloud/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# save stopwords
# (looked up through nltk.corpus instead of the imported name `stopwords`,
# because this assignment rebinds that name to a list; re-running the cell
# would otherwise fail with "'list' object has no attribute 'words'")
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
# look at some stopwords
print(stopwords[:5])

['i', 'me', 'my', 'myself', 'we']


In [None]:
# some stopwords that can be interpreted negatively should be kept (negators)
negative_words = ['not','no','can','don','won','shouldn','couldn','wouldn','doesn','isn','aren','wasn','weren','but','nor']

In [None]:
# remove stopwords but keep the negative words
# the set of removable words is built once so that every membership test is a
# constant-time lookup instead of a scan over two lists per token
# NOTE(review): filtering runs on stemmed tokens, so stopwords whose stem differs
# from the word itself (e.g. 'very' -> 'veri') are not removed — confirm intended
removable_words = set(stopwords) - set(negative_words)
final_tokens = [[token for token in token_list if token not in removable_words] for token_list in tqdm(stemmed_texts)]

100%|██████████| 660645/660645 [00:47<00:00, 13874.46it/s]


In [None]:
final_tokens[0]

['great',
 'locat',
 'close',
 'main',
 'public',
 'transport',
 'easi',
 'get',
 'anywher',
 'get',
 'back',
 'super',
 'easi',
 'check',
 'veri',
 'respons',
 'clean',
 'well',
 'stock',
 'thank']

##### Remove rare words (to reduce dimension of document-term matrix later)

In [None]:
# tally how often each token occurs across all stopword-filtered documents

word_counts = defaultdict(int)

# walk the nested token lists as one flat stream
for term in (term for document in final_tokens for term in document):
    word_counts[term] += 1

In [None]:
# check amount of vocabulary
len(word_counts.keys())

53575

In [None]:
# function that removes very rare words
def remove_infrequent_words(tokenized_documents, min_frequency=10):
    """Drop every token whose corpus-wide count is below ``min_frequency``.

    Args:
        tokenized_documents: iterable of token lists; it is traversed twice
            (once for counting, once for filtering).
        min_frequency: minimum number of occurrences, summed over all
            documents, that a token needs in order to be kept.

    Returns:
        A new list with one token list per input document, in the same
        order, containing only the sufficiently frequent tokens.
    """
    # first pass: total occurrences of each token over the whole corpus
    frequencies = defaultdict(int)
    for document in tokenized_documents:
        for term in document:
            frequencies[term] += 1

    # vocabulary that reaches the threshold
    kept_vocabulary = {
        term for term, total in frequencies.items() if total >= min_frequency
    }

    # second pass: keep only tokens from that vocabulary, preserving order
    return [
        [term for term in document if term in kept_vocabulary]
        for document in tokenized_documents
    ]

In [None]:
# filter with the function
filtered = remove_infrequent_words(final_tokens, 10)

In [None]:
filtered[0]

['great',
 'locat',
 'close',
 'main',
 'public',
 'transport',
 'easi',
 'get',
 'anywher',
 'get',
 'back',
 'super',
 'easi',
 'check',
 'veri',
 'respons',
 'clean',
 'well',
 'stock',
 'thank']

In [None]:
# recount the tokens that survived the rare-word filter
word_counts = defaultdict(int)

for document in filtered:
    for term in document:
        word_counts[term] = word_counts[term] + 1

In [None]:
# check amount of vocabulary
len(word_counts.keys())

10839

##### Extract Unigrams, Bigrams and Trigrams 

In [None]:
# function to extract ngrams from a list of tokens
def extract_ngrams(tokens, n=1):
    """Return the n-grams of ``tokens`` as underscore-joined strings.

    Args:
        tokens: sequence of string tokens.
        n: size of the n-grams to build (1 = unigrams).

    Returns:
        List of strings, one per n-gram, in order of appearance.
    """
    joined_grams = []
    for gram in ngrams(tokens, n):
        joined_grams.append('_'.join(gram))
    return joined_grams

# build, for every document, its unigrams followed by its bigrams and trigrams
ngrams_list = [
    [gram for size in (1, 2, 3) for gram in extract_ngrams(document, n=size)]
    for document in tqdm(filtered)
]

100%|██████████| 660645/660645 [00:16<00:00, 40344.88it/s]


In [None]:
ngrams_list[0]

['great',
 'locat',
 'close',
 'main',
 'public',
 'transport',
 'easi',
 'get',
 'anywher',
 'get',
 'back',
 'super',
 'easi',
 'check',
 'veri',
 'respons',
 'clean',
 'well',
 'stock',
 'thank',
 'great_locat',
 'locat_close',
 'close_main',
 'main_public',
 'public_transport',
 'transport_easi',
 'easi_get',
 'get_anywher',
 'anywher_get',
 'get_back',
 'back_super',
 'super_easi',
 'easi_check',
 'check_veri',
 'veri_respons',
 'respons_clean',
 'clean_well',
 'well_stock',
 'stock_thank',
 'great_locat_close',
 'locat_close_main',
 'close_main_public',
 'main_public_transport',
 'public_transport_easi',
 'transport_easi_get',
 'easi_get_anywher',
 'get_anywher_get',
 'anywher_get_back',
 'get_back_super',
 'back_super_easi',
 'super_easi_check',
 'easi_check_veri',
 'check_veri_respons',
 'veri_respons_clean',
 'respons_clean_well',
 'clean_well_stock',
 'well_stock_thank']

##### Remove rare words after unigram, bigram, and trigram extraction again (to reduce dimension of document-term matrix later)

In [None]:
# tally how often each unigram, bigram and trigram occurs across all documents

word_counts = defaultdict(int)

# walk the nested n-gram lists as one flat stream
for gram in (gram for document in ngrams_list for gram in document):
    word_counts[gram] += 1

In [None]:
# check amount of vocabulary
len(word_counts.keys())

8214457

In [None]:
# filter with the function
final = remove_infrequent_words(ngrams_list, 10)

In [None]:
final[0]

['great',
 'locat',
 'close',
 'main',
 'public',
 'transport',
 'easi',
 'get',
 'anywher',
 'get',
 'back',
 'super',
 'easi',
 'check',
 'veri',
 'respons',
 'clean',
 'well',
 'stock',
 'thank',
 'great_locat',
 'locat_close',
 'close_main',
 'main_public',
 'public_transport',
 'transport_easi',
 'easi_get',
 'get_anywher',
 'get_back',
 'back_super',
 'super_easi',
 'easi_check',
 'check_veri',
 'veri_respons',
 'respons_clean',
 'clean_well',
 'well_stock',
 'stock_thank',
 'great_locat_close',
 'locat_close_main',
 'main_public_transport',
 'public_transport_easi',
 'transport_easi_get',
 'easi_get_anywher',
 'super_easi_check',
 'easi_check_veri',
 'check_veri_respons',
 'veri_respons_clean',
 'clean_well_stock',
 'well_stock_thank']

In [None]:
# recount the n-grams that survived the second rare-word filter
word_counts = defaultdict(int)

for document in final:
    for gram in document:
        word_counts[gram] = word_counts[gram] + 1

In [None]:
# check amount of vocabulary
len(word_counts.keys())

302055

##### Create dataframe

In [None]:
# pair each document's filtered n-gram list (`final`) with its sentiment label;
# note that this rebinds `df`, replacing the dataframe loaded from the csv
df = pd.DataFrame(dict(tokens=final, sentiment=df['review_rating']))

In [None]:
# check dataframe
df.head()

Unnamed: 0,tokens,sentiment
0,"[great, locat, close, main, public, transport,...",positive
1,"[famili, four, thi, flat, can, accommod, perso...",positive
2,"[place, wonder, plenti, room, us, help, right,...",positive
3,"[great, locat, truli, onli, coupl, stop, away,...",positive
4,"[great, place, perfect, weekend, not, squar, a...",positive


In [None]:
# check that there are no empty tokens lists
# (the column holds Python lists, so compare their lengths; comparing against
# the string '[]' is False for every row and can never flag an empty list)
df[df['tokens'].map(len) == 0]

Unnamed: 0,tokens,sentiment


### Save / load the data after processing  

In [None]:
# save df as csv
df.to_csv('df_tokenized.csv', index=False)

In [None]:
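# load data as df
# (the file is saved with index=False, so there is no index column to read back;
# the 'tokens' column comes back as strings and has to be parsed into lists again)
# df = pd.read_csv('df_tokenized.csv')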