In [26]:
import numpy as np
import pandas as pd
import random
import nltk
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer

# Reading in the data

In [27]:
df = pd.read_parquet('data/edmonton_cleaned.parquet', engine='auto')
df = df.head(5)

In [28]:
df.shape

(5, 22)

# Tokenization & Data Splits

In [29]:
# splitting the reviews
positive = df[df['stars_y'] > 3]
negative = df[df['stars_y'] <= 3]
positive

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars_x,review_count,...,categories,hours,review_id,user_id,stars_y,useful,funny,cool,text,date
1,WKMJwqnfZKsAae75RMP6jA,Roast Coffeehouse and Wine Bar,10359 104 Street NW,Edmonton,AB,T5J 1B9,53.546045,-113.499169,4.0,40,...,"Coffee & Tea, Food, Cafes, Bars, Wine Bars, Re...","{'Monday': '8:0-18:0', 'Tuesday': '8:0-18:0', ...",bAy8ROEYO_3aTBhW5LoR4g,7qFH1RkPivVRcwxLwhyixg,4,2,0,1,I'm not a coffee connoisseur so I'm not review...,2013-05-28 23:16:30
2,WKMJwqnfZKsAae75RMP6jA,Roast Coffeehouse and Wine Bar,10359 104 Street NW,Edmonton,AB,T5J 1B9,53.546045,-113.499169,4.0,40,...,"Coffee & Tea, Food, Cafes, Bars, Wine Bars, Re...","{'Monday': '8:0-18:0', 'Tuesday': '8:0-18:0', ...",zPDHE7TrXs7EJT06qD8yTA,FLeyjgc05C2V6QI9nVQ48Q,4,0,0,0,"I really loved it here, makes me wish I lived ...",2013-09-04 19:49:33
4,WKMJwqnfZKsAae75RMP6jA,Roast Coffeehouse and Wine Bar,10359 104 Street NW,Edmonton,AB,T5J 1B9,53.546045,-113.499169,4.0,40,...,"Coffee & Tea, Food, Cafes, Bars, Wine Bars, Re...","{'Monday': '8:0-18:0', 'Tuesday': '8:0-18:0', ...",pLnTjS90gUlsq2tAjI9prA,G3h8pIclwUbuu3itJqF7ug,4,7,0,4,"With a Toast to Roast, I say welcome to the E-...",2012-09-11 23:54:24


In [30]:
# tokenizes positive and negative reviews
pos_tokens = positive.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)

neg_tokens = negative.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)

In [31]:
pos_tokens[0:5]

1    [I, 'm, not, a, coffee, connoisseur, so, I, 'm...
2    [I, really, loved, it, here, ,, makes, me, wis...
4    [With, a, Toast, to, Roast, ,, I, say, welcome...
dtype: object

# Normalization

In [32]:
pos_tags = pos_tokens.apply(pos_tag)
neg_tags = neg_tokens.apply(pos_tag)

In [33]:
pos_tags[0:5]

1    [(I, PRP), ('m, VBP), (not, RB), (a, DT), (cof...
2    [(I, PRP), (really, RB), (loved, VBD), (it, PR...
4    [(With, IN), (a, DT), (Toast, NNP), (to, TO), ...
dtype: object

# Lemmatization

In [34]:
def lemmatize_sentence(tags):
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = []
    for word, tag in tags:
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
    return lemmatized_sentence

pos_lemma = pos_tags.apply(lemmatize_sentence)
neg_lemma = neg_tags.apply(lemmatize_sentence)

In [35]:
pos_lemma

1    [I, 'm, not, a, coffee, connoisseur, so, I, 'm...
2    [I, really, love, it, here, ,, make, me, wish,...
4    [With, a, Toast, to, Roast, ,, I, say, welcome...
dtype: object

# Noise Filtering

In [36]:
#tba -> needs text investigation

# todo
- word density analysis (maybe in EDA)

# Creating Datasets

- for Naive Bayes
- Other Models require a bit more research

In [37]:
def get_reviews_for_model(tokens_list):
    for review_token in tokens_list:
        yield dict([token, True] for token in review_token)

pos_token_for_model = get_reviews_for_model(pos_lemma)
neg_token_for_model = get_reviews_for_model(neg_lemma)

In [38]:
pos_df = [(review_dict, "Positive") for review_dict in pos_token_for_model]
neg_df = [(review_dict, "Positive") for review_dict in neg_token_for_model]

model_df = pos_df + neg_df

In [47]:
#train test split - move to modeling

train_data = model_df[:3]
test_data = model_df[3:]

In [48]:
#todo: find out how to save the data
pd.DataFrame(model_df)

Unnamed: 0,0,1
0,"{'I': True, ''m': True, 'not': True, 'a': True...",Positive
1,"{'I': True, 'really': True, 'love': True, 'it'...",Positive
2,"{'With': True, 'a': True, 'Toast': True, 'to':...",Positive
3,"{'So': True, 'much': True, 'to': True, 'like':...",Positive
4,"{'The': True, 'kid': True, 'who': True, 'work'...",Positive


# Simple Naive Bayes Model
- Move to modeling notebook

In [49]:
from nltk import classify
from nltk import NaiveBayesClassifier

In [50]:
classifier = NaiveBayesClassifier.train(train_data)
print('Accuracy is: ', classify.accuracy(classifier, test_data))

Accuracy is:  1.0


In [51]:
# sample prediction
classifier.classify(test_data[0][0])

'Positive'