In [1]:
import pandas as pd

# Location of the category-annotated reviews, relative to the working directory.
REVIEWS_PATH = 'data/ys-reviews-with-categories.json'

# Read the whole JSON file into a DataFrame.
reviews = pd.read_json(REVIEWS_PATH)

# Last expression of the cell: preview the first rows.
reviews.head()

Unnamed: 0,text,rating,gmap_id,category
0,Helped me alot. If it was for her suggesting m...,5.0,0x809ad993cd15cc69:0x38291ec9a11f59ed,beauty
1,Always leave feeling refreshed...polite and re...,5.0,0x809ad993cd15cc69:0x38291ec9a11f59ed,beauty
2,Professional massage place. The ladies know w...,5.0,0x809ad993cd15cc69:0x38291ec9a11f59ed,beauty
3,This place was pleasing to me. Very nice staff...,5.0,0x809ad993cd15cc69:0x38291ec9a11f59ed,beauty
4,Tough lady loves great pressure massage. Woohoo.,5.0,0x809ad993cd15cc69:0x38291ec9a11f59ed,beauty


In [7]:
# Sample n-gram sentiment classification (Naive Bayes), adapted from a Medium article
import nltk
from nltk.classify import NaiveBayesClassifier
from nltk.util import ngrams

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation

stop_words = stopwords.words('english')

# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call to filtered_tokens().
_PUNCTUATION_TABLE = str.maketrans('', '', punctuation)


def filtered_tokens(text: str) -> list[str]:
    """Lower-case ``text``, strip punctuation, tokenise it and drop stop words.

    Args:
        text: Raw review text.

    Returns:
        The remaining tokens, in their original order. Empty if ``text`` is
        empty or consists only of punctuation and stop words.
    """
    # Make lower case and remove all punctuation characters.
    normalised = text.lower().translate(_PUNCTUATION_TABLE)

    # Build the lookup set per call so later edits to the module-level
    # ``stop_words`` list are still honoured, while each membership test is
    # O(1) instead of a scan of the whole list for every token.
    # NOTE(review): punctuation is stripped before filtering, so stop-word
    # entries containing an apostrophe (e.g. "you're") presumably never match
    # and words such as "youre"/"dont" stay in the output - confirm whether
    # that is intended.
    stop_word_set = set(stop_words)

    # Filter stop words.
    return [token for token in word_tokenize(normalised) if token not in stop_word_set]

# Function to extract n-grams (bigrams by default) from a text
def extract_ngrams(text: str, n: int = 2) -> list[tuple[str, ...]]:
    """Return the n-grams of the filtered tokens of ``text``.

    The text is first passed through ``filtered_tokens``; the n-grams are then
    built from the tokens that remain.

    Args:
        text: Raw review text.
        n: Number of tokens per n-gram (2 yields bigrams).

    Returns:
        A list of ``n``-tuples of tokens, in order of appearance. Each tuple
        has ``n`` elements, hence the variadic tuple annotation rather than a
        fixed pair.
    """
    tokens = filtered_tokens(text)
    return list(ngrams(tokens, n))

def create_ngram_features(text:str, n: int = 2):
    """Build a feature dict for ``text``, keyed by its n-grams.

    Every n-gram returned by ``extract_ngrams(text, n)`` becomes a key mapped
    to ``True``; an n-gram that occurs several times appears once.
    """
    return {gram: True for gram in extract_ngrams(text, n)}


# --- Configuration ----------------------------------------------------------
n_of_grams = 4         # number of tokens per n-gram feature
SAMPLE_SIZE = 10000    # how many restaurant reviews to draw
TRAIN_FRACTION = 0.8   # share of the sample used for training
SEED = 42              # fixed seed so a re-run draws the same sample and split

# --- Sample and label -------------------------------------------------------
restaurant_reviews = reviews.loc[reviews.category == 'restaurant']
# Cap the sample size at the number of available rows: DataFrame.sample raises
# when asked for more rows than exist (without replacement).
some = restaurant_reviews.sample(
    min(SAMPLE_SIZE, len(restaurant_reviews)),
    random_state=SEED,
)

review = some.text.tolist()
# Only ratings strictly above 4 count as positive; everything else (including
# a rating of exactly 4) is labelled negative.
# NOTE(review): confirm this threshold is intended.
label = ["positive" if r > 4 else "negative" for r in some.rating.tolist()]

# Combine the reviews and their labels into a single dataset
review_w_label = list(zip(review, label))

# Split the data into training and testing sets. The sample is already in
# random order, so a positional split is a random split.
split_at = int(TRAIN_FRACTION * len(review_w_label))
training_data = review_w_label[:split_at]
testing_data = review_w_label[split_at:]

# --- Train and evaluate -----------------------------------------------------
# Extract n-grams from the reviews and create the feature sets. The loop
# variables are named differently from the globals ``review``/``label`` above.
training_features = [
    (create_ngram_features(text, n_of_grams), tag) for (text, tag) in training_data
]
testing_features = [
    (create_ngram_features(text, n_of_grams), tag) for (text, tag) in testing_data
]

# Train a Naive Bayes classifier on the training feature set
classifier = NaiveBayesClassifier.train(training_features)

# Evaluate the classifier on the testing data
accuracy = nltk.classify.accuracy(classifier, testing_features)
print("Accuracy:", accuracy)

# --- Predict the sentiment of a new review ----------------------------------
review = "please go anywhere else this place sucks"
features = create_ngram_features(review, n_of_grams)
sentiment = classifier.classify(features)
print("Test:", review)
print("Sentiment:", sentiment)

classifier.show_most_informative_features(20)

Accuracy: 0.604
Test: please go anywhere else this place sucks
Sentiment: positive
Most Informative Features
('great', 'food', 'great', 'service') = True           positi : negati =      7.1 : 1.0
('great', 'service', 'great', 'food') = True           positi : negati =      4.6 : 1.0
('good', 'food', 'good', 'prices') = True           negati : positi =      3.2 : 1.0
('food', 'great', 'customer', 'service') = True           positi : negati =      3.1 : 1.0
('great', 'food', 'great', 'staff') = True           positi : negati =      3.1 : 1.0
('translated', 'google', 'good', 'food') = True           negati : positi =      2.5 : 1.0
('could', 'give', 'stars', 'would') = True           negati : positi =      2.3 : 1.0
('every', 'single', 'time', 'go') = True           negati : positi =      2.3 : 1.0
('friendly', 'service', 'food', 'good') = True           negati : positi =      2.3 : 1.0
('good', 'food', 'decent', 'prices') = True           negati : positi =      2.3 : 1.0
('good', 'food'

In [8]:
# Print the NLTK English stop-word list that filtered_tokens() compares tokens against.
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '