# 2.9 Practical

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import re
import pandas as pd

### Load data

In [None]:
# Read the hotel reviews CSV (relative path) into a DataFrame.
data = pd.read_csv("tripadvisor_hotel_reviews.csv")

In [None]:
# Print a summary of the DataFrame's columns.
data.info()

In [None]:
# Peek at the first rows of the raw data.
data.head()

In [None]:
# Show the full text of the review stored under index label 0.
data['Review'][0]

### Lowercase

In [None]:
# Store a lowercased copy of each review in a new column; the original
# 'Review' column is left untouched.
data['review_lowercase'] = data['Review'].str.lower()

In [None]:
# Check the new review_lowercase column.
data.head()

### Stop word removal

In [None]:
# Start from NLTK's English stop word list.
en_stopwords = stopwords.words('english')
# Take "not" out of the list so the filtering step keeps it in the reviews
# (presumably because negation changes what a review means).
en_stopwords.remove("not")

In [None]:
# Build the lookup set once: testing membership against the stop word list is
# a linear scan for every word, whereas a set lookup is constant time.
stopword_set = set(en_stopwords)

# Split each lowercased review on whitespace, drop the stop words and join the
# remaining words back together with single spaces.
data['review_no_stopwords'] = data['review_lowercase'].apply(
    lambda review: ' '.join(
        [word for word in review.split() if word not in stopword_set]
    )
)

In [None]:
# Check the new review_no_stopwords column.
data.head()

In [None]:
# Show the review under index label 0 after stop word removal.
data['review_no_stopwords'][0]

### Punctuation

We want to remove punctuation, but we notice that a few reviews use the symbol * in place of the word "star". We want to keep this information because it may add meaning to the review, so we first replace * with the word "star" using what we learned about regular expressions, and only then strip the remaining punctuation.

In [None]:
# Spell out every literal "*" as the word "star" before punctuation is
# stripped, so ratings written with the symbol survive the next step. The word
# is inserted without surrounding spaces, so "4*" becomes "4star".
# A plain (non-regex) vectorised replace on the column does this directly,
# without a row-wise apply over the whole frame.
data['review_no_stopwords_no_punct'] = data['review_no_stopwords'].str.replace(
    "*", "star", regex=False
)

In [None]:
# Check the new review_no_stopwords_no_punct column.
data.head()

In [None]:
# Delete every character that is neither a word character (letter, digit or
# underscore) nor whitespace. The column is transformed directly instead of
# applying row-wise over the whole frame, and the capture group is dropped
# because nothing referred to it.
data['review_no_stopwords_no_punct'] = data['review_no_stopwords_no_punct'].apply(
    lambda text: re.sub(r"[^\w\s]", "", text)
)

In [None]:
# Check review_no_stopwords_no_punct after punctuation removal.
data.head()

### Tokenizing

In [None]:
# Split each cleaned review into a list of tokens. word_tokenize is handed
# straight to Series.apply, so it receives one review string at a time and no
# row-wise lambda over the whole frame is needed.
data['tokenized'] = data['review_no_stopwords_no_punct'].apply(word_tokenize)

In [None]:
# Check the new tokenized column.
data.head()

In [None]:
# Show the token list of the review under index label 0.
data['tokenized'][0]

### Stemming

In [None]:
# One stemmer instance, reused for every token in the stemming cell.
ps = PorterStemmer()

In [None]:
# Stem every token of every review; each row of the new column holds the list
# of stems in the same order as that row's tokens.
data["stemmed"] = data["tokenized"].apply(
    lambda review_tokens: list(map(ps.stem, review_tokens))
)

In [None]:
# Check the new stemmed column.
data.head()

In [None]:
# Show the stems of the review under index label 0.
data['stemmed'][0]

### Lemmatization

In [None]:
# One lemmatizer instance, reused for every token in the lemmatization cell.
lemmatizer = WordNetLemmatizer()

In [None]:
# Lemmatize every token of every review, starting again from the tokenized
# column (not from the stems). Only the token is passed to lemmatize(), so the
# lemmatizer's default part of speech applies - TODO confirm that is intended.
data["lemmatized"] = data["tokenized"].apply(
    lambda review_tokens: list(map(lemmatizer.lemmatize, review_tokens))
)

In [None]:
# Show the lemmas of the review under index label 0.
data['lemmatized'][0]

In [None]:
# Check the new lemmatized column.
data.head()

## N-grams

In [None]:
# Flatten the per-review lemma lists into one corpus-wide list, keeping the
# original order. The nested comprehension is a single linear pass, whereas
# sum(lists, []) copies the growing accumulator on every addition and is
# therefore quadratic in the total number of tokens.
tokens_clean = [
    token for review_tokens in data['lemmatized'] for token in review_tokens
]

In [None]:
# unigrams: n=1
# Count how often each single token (wrapped in a 1-tuple) occurs in the
# flattened corpus.
unigrams = pd.Series(nltk.ngrams(tokens_clean, 1)).value_counts()
print(unigrams)

In [None]:
# bigrams: n=2
# Count how often each pair of adjacent tokens occurs. The corpus is one flat
# list, so a pair can span the end of one review and the start of the next.
bigrams = pd.Series(nltk.ngrams(tokens_clean, 2)).value_counts()
print(bigrams)

In [None]:
# 4-grams: n=4
# Count how often each run of four adjacent tokens occurs in the flattened
# corpus.
ngrams_4 = pd.Series(nltk.ngrams(tokens_clean, 4)).value_counts()
print(ngrams_4)