In [1]:
import pandas as pd

reviews = pd.read_json('data/ys-reviews-with-categories.json')

reviews.head()

Unnamed: 0,text,rating,gmap_id,category
0,Helped me alot. If it was for her suggesting m...,5.0,0x809ad993cd15cc69:0x38291ec9a11f59ed,beauty
1,Always leave feeling refreshed...polite and re...,5.0,0x809ad993cd15cc69:0x38291ec9a11f59ed,beauty
2,Professional massage place. The ladies know w...,5.0,0x809ad993cd15cc69:0x38291ec9a11f59ed,beauty
3,This place was pleasing to me. Very nice staff...,5.0,0x809ad993cd15cc69:0x38291ec9a11f59ed,beauty
4,Tough lady loves great pressure massage. Woohoo.,5.0,0x809ad993cd15cc69:0x38291ec9a11f59ed,beauty


In [4]:
# sample bigram classification from medium website
import nltk
from nltk.classify import NaiveBayesClassifier
from nltk.util import ngrams

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation

stop_words = stopwords.words('english')

def filtered_tokens(text: str) -> list[str]:
    # make lower case and remove all punctuations
    text = text.lower().translate(str.maketrans('', '', punctuation))
    tokens = word_tokenize(text)
    # filter stopwords    
    return [word for word in tokens if word not in stop_words]

# Function to extract bigrams from a text
def extract_ngrams(text: str, n: int = 2) -> list[tuple[str, str]]:
    tokens = filtered_tokens(text)
    return list(ngrams(tokens, n))

def create_ngram_features(text:str, n: int = 2):
    ngram = extract_ngrams(text, n)
    return dict([(ng, True) for ng in ngram])


n_of_grams = 2

some = reviews.loc[(reviews.category == 'restaurant')].sample(10000)

review = some.text.tolist()
label = ["positive" if r > 4 else "negative" for r in some.rating.tolist()]

# Combine the reviews into a single dataset
review_w_label = list(zip(review, label))

# Split the data into training and testing sets
training_data = review_w_label[:int(0.8 * len(review_w_label))]
testing_data = review_w_label[int(0.8 * len(review_w_label)):]

# Extract bigrams from the reviews and create a feature set
features = [(create_ngram_features(review, n_of_grams), label) for (review, label) in training_data]

# Train a Naive Bayes classifier on the feature set
classifier = NaiveBayesClassifier.train(features)

# Evaluate the classifier on the testing data
accuracy = nltk.classify.accuracy(classifier, [(create_ngram_features(review, n_of_grams), label) for (review, label) in testing_data])
print("Accuracy:", accuracy)

# Predict the sentiment of a new review
review = "please go somewhere else this place sucks"
features = create_ngram_features(review, n_of_grams)
sentiment = classifier.classify(features)
print("Test:", review)
print("Sentiment:", sentiment)

classifier.show_most_informative_features()

Accuracy: 0.695
Test: please go somewhere else this place sucks
Sentiment: negative
Most Informative Features
       ('30', 'minutes') = True           negati : positi =     20.5 : 1.0
          ('food', 'ok') = True           negati : positi =     20.5 : 1.0
         ('never', 'go') = True           negati : positi =     17.8 : 1.0
      ('tasted', 'like') = True           negati : positi =     17.8 : 1.0
       ('didnt', 'even') = True           negati : positi =     14.1 : 1.0
     ('amazing', 'food') = True           positi : negati =     13.3 : 1.0
        ('came', 'back') = True           negati : positi =     13.2 : 1.0
         ('got', 'home') = True           negati : positi =     13.2 : 1.0
     ('poor', 'service') = True           negati : positi =     13.2 : 1.0
     ('service', 'ever') = True           negati : positi =     12.3 : 1.0
