# Yelp Review Classifier

In [1]:
# https://medium.com/@ageitgey/text-classification-is-your-new-secret-weapon-7ca4fad15788
# data: https://www.yelp.com/dataset/download
import json
from pathlib import Path
import re
import random
import fastText

In [2]:
#PRE PROCESS, Only run to make/clean new data sets
reviews_data = Path("dataset") / "review.json"
training_data = Path("fasttext_dataset_training.txt")
test_data = Path("fasttext_dataset_test.txt")

# What percent of data to save separately as test data
percent_test_data = 0.10

def strip_formatting(string):
    string = string.lower()
    string = re.sub(r"([.!?,'/()])", r" \1 ", string)
    return string

with reviews_data.open() as input, \
     training_data.open("w") as train_output, \
     test_data.open("w") as test_output:

    for line in input:
        review_data = json.loads(line)

        rating = review_data['stars']
        text = review_data['text'].replace("\n", " ")
        text = strip_formatting(text)

        fasttext_line = "__label__{} {}".format(rating, text)

        if random.random() <= percent_test_data:
            test_output.write(fasttext_line + "\n")
        else:
            train_output.write(fasttext_line + "\n")

In [3]:
def strip_formatting(string):
    string = string.lower()
    string = re.sub(r"([.!?,'/()])", r" \1 ", string)
    return string

In [4]:
# Reviews to check
reviews = [
    "This restaurant literally changed my life. This is the best food I've ever eaten!",
    "I hate this place so much. They were mean to me.",
    "I don't know. It was ok, I guess. Not really sure what to say."
]

In [5]:
# Pre-process the text of each review so it matches the training format
preprocessed_reviews = list(map(strip_formatting, reviews))

In [7]:
# Load the model
classifier = fastText.load_model('../../../fasttext/reviews_model_ngrams.bin')

In [8]:
# Get fastText to classify each review with the model
labels, probabilities = classifier.predict(preprocessed_reviews, 1)

In [9]:
# Print the results
for review, label, probability in zip(reviews, labels, probabilities):
    stars = int(label[0][-1])

    print("{} ({}% confidence)".format("☆" * stars, int(probability[0] * 100)))
    print(review)
    print()

 (100% confidence)
This restaurant literally changed my life. This is the best food I've ever eaten!

 (97% confidence)
I hate this place so much. They were mean to me.

 (83% confidence)
I don't know. It was ok, I guess. Not really sure what to say.

