In [1]:
import pandas as pd
# Load the first 100,000 records of the line-delimited Yelp review dump.
# Remove the `nrows` argument to read the complete file instead.
yelp = pd.read_json(
    './yelp_dataset/yelp_academic_dataset_review.json',
    lines=True,
    nrows=100000,
)
# Coerce the review text column to plain strings.
yelp = yelp.assign(text=yelp['text'].astype('str'))


In [2]:
from faker import Faker
import numpy as np

# Size and seed of the synthetic data set. Both are fixed so that re-running
# this cell reproduces exactly the same frame.
N_FAKE_REVIEWS = 10000
FAKE_DATA_SEED = 42

fake = Faker()
fake.seed_instance(FAKE_DATA_SEED)

# A private generator keeps the draws reproducible without reseeding NumPy's
# global random state, which other cells may rely on.
fake_rng = np.random.default_rng(FAKE_DATA_SEED)

# one random paragraph of filler text per synthetic review
reviews = [fake.paragraph() for _ in range(N_FAKE_REVIEWS)]

# one random star rating (1..5) per review
stars = fake_rng.choice([1, 2, 3, 4, 5], size=N_FAKE_REVIEWS)

# random useful / funny / cool vote counts per review; the upper bound is
# exclusive, so every count is in 1..4
useful = fake_rng.integers(1, 5, size=N_FAKE_REVIEWS)
funny = fake_rng.integers(1, 5, size=N_FAKE_REVIEWS)
cool = fake_rng.integers(1, 5, size=N_FAKE_REVIEWS)

# Same column names as the fields of the real review data used for modelling.
fake_yelp = pd.DataFrame({'text': reviews,
                          'stars': stars,
                          'useful': useful,
                          'funny': funny,
                          'cool': cool})

# Uncomment to run the rest of the notebook on the synthetic data instead.
# yelp = fake_yelp

In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from tabulate import tabulate
import time

# Wall-clock timestamp taken before the model code runs; it is subtracted from
# a second time.time() reading at the end of the cell to report elapsed seconds.
start_time = time.time()

class NBClassifier:
    """Multinomial Naive Bayes model scored on a random 80/20 hold-out split.

    Parameters
    ----------
    alpha : float, default 0.1
        Additive smoothing parameter passed to ``MultinomialNB``.
    random_state : int or None, default None
        Passed to ``train_test_split``. ``None`` keeps the original behaviour
        (a different split on every call); pass an integer for a repeatable
        split.

    Attributes
    ----------
    classifier_ : MultinomialNB or None
        The model fitted by the most recent ``get_trained_data`` call, or
        ``None`` if it has not been called yet.
    """

    def __init__(self, alpha=0.1, random_state=None):
        self.alpha = alpha
        self.random_state = random_state
        self.classifier_ = None

    def get_trained_data(self, x, target_data):
        """Fit on 80% of the rows, score on the remaining 20%.

        Parameters
        ----------
        x : feature matrix accepted by ``MultinomialNB.fit``
        target_data : labels aligned row-for-row with ``x``

        Returns
        -------
        The accuracy of the fitted model on the held-out rows. The same value
        is also printed.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            x, target_data, train_size=0.8, random_state=self.random_state
        )

        # Train the Naive Bayes classifier and keep it so it can be inspected
        # or reused after evaluation.
        nb_classifier = MultinomialNB(alpha=self.alpha)
        nb_classifier.fit(X_train, y_train)
        self.classifier_ = nb_classifier

        # Predict the labels for the held-out rows
        y_pred = nb_classifier.predict(X_test)

        # Score once; the same value is printed and returned.
        accuracy = metrics.accuracy_score(y_test, y_pred)
        print(accuracy)
        return accuracy

# Column roles in the review frame: model input, prediction targets, and
# identifier/date columns.
input_columns = ['text']
output_columns = ['stars', 'useful', 'funny', 'cool']
irrelevant_columns = ['review_id', 'user_id', 'business_id', 'date']

# ############## EXPERIMENT 2
# half_size = len(yelp.index) // 2
# rand_indices = np.random.choice(yelp.index, half_size, replace=False)

# # update the selected rows to have a value of 1
# yelp.loc[rand_indices, "stars"] = 1
# ############## EXPERIMENT 2

# Turn the review text into unigram counts.
review_text = yelp.drop(columns=output_columns)['text']
v = CountVectorizer(ngram_range=(1, 1))
unigram_counts = v.fit_transform(review_text)

# Re-weight the counts with TF-IDF.
# NOTE(review): the vectorizer and the TF-IDF weights are fitted on every row
# here, before NBClassifier makes its train/test split, so the held-out rows
# contribute to the vocabulary and IDF statistics.
unigram_tf_idf_transformer = TfidfTransformer()
x = unigram_tf_idf_transformer.fit(unigram_counts).transform(unigram_counts)

# Everything except the text column; the targets are picked out of it by name.
y = yelp.drop(columns=input_columns)

# One independent classifier per target column.
nb_stars, nb_useful, nb_funny, nb_cool = (NBClassifier() for _ in range(4))

# Each call prints its accuracy and returns it.
nb_stars_score = nb_stars.get_trained_data(x, y["stars"])
nb_useful_score = nb_useful.get_trained_data(x, y["useful"])
nb_funny_score = nb_funny.get_trained_data(x, y["funny"])
nb_cool_score = nb_cool.get_trained_data(x, y["cool"])

# Header row followed by a single row of scores, in the same target order.
score_headers = ['Score (Stars)', 'Score (Useful)', 'Score (Funny)', 'Score (Cool)']
score_row = [nb_stars_score, nb_useful_score, nb_funny_score, nb_cool_score]
table_MultinomialNB = (score_headers, score_row)

print('Table For Naive Bayes Probabilistic Classifier')
print(tabulate(table_MultinomialNB, headers='firstrow'))

# Seconds since start_time was recorded earlier in this cell.
elapsed_time = time.time() - start_time
print("Elapsed time: ", elapsed_time, " seconds")

0.77405
0.58635
0.85195
0.7986
Table For Naive Bayes Probabilistic Classifier
  Score (Stars)    Score (Useful)    Score (Funny)    Score (Cool)
---------------  ----------------  ---------------  --------------
        0.77405           0.58635          0.85195          0.7986
Elapsed time:  6.532774925231934  seconds
