Loaded the data.
Extracted the review texts and their labels.
Split the data into train and test sets (80:20).
Defined a helper that assigns binary (one-vs-all) labels.

In [4]:
import numpy as np
from sklearn.model_selection import train_test_split


from datasets import load_dataset

# Load the 'train' split of the "jniimi/tripadvisor-review-rating" dataset.
ta_dataset = load_dataset("jniimi/tripadvisor-review-rating")['train']

# Two parallel lists: the review text of every record, and its 'overall'
# rating cast to int.
train_data = list(map(lambda item: item['text'], ta_dataset))
train_data_labels = list(map(lambda item: int(item['overall']), ta_dataset))

# Hold out 20% of the examples for testing; the fixed random_state keeps the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    train_data,
    train_data_labels,
    test_size=0.2,
    random_state=42,
)

def one_vs_all_labels(labels, target_class):
    """Convert multi-class labels into binary one-vs-all labels.

    Args:
        labels: Iterable of class labels.
        target_class: The label to treat as the positive class.

    Returns:
        A new list with one entry per input label: 1 where the label equals
        ``target_class`` and 0 everywhere else.
    """
    binary_labels = []
    for value in labels:
        if value == target_class:
            binary_labels.append(1)
        else:
            binary_labels.append(0)
    return binary_labels

Trained, evaluated, and saved one-vs-all Logistic Regression classifiers.

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
import pickle

# Train one binary (one-vs-all) classifier per star rating, 1 through 5.
lr_classifiers = {}
for star in range(1, 6):
    # 1 for training reviews rated exactly `star`, 0 for every other review.
    binary_y_train = one_vs_all_labels(y_train, star)

    # Each pipeline fits its own word-level TF-IDF vectorizer (vocabulary
    # limited to 5000 terms) and feeds it into logistic regression.
    clf = make_pipeline(
        TfidfVectorizer(analyzer='word', max_features=5000, lowercase=True),
        LogisticRegression(max_iter=1000)
    )

    clf.fit(X_train, binary_y_train)
    lr_classifiers[star] = clf

# Persist the fitted pipelines, keyed by star rating.
with open('lr_classifiers.pkl', 'wb') as f:
    pickle.dump(lr_classifiers, f)

# Evaluate every binary classifier on the held-out split and report its
# accuracy, so the per-class predictions are actually used instead of being
# computed and discarded.
for star in range(1, 6):
    binary_y_test = one_vs_all_labels(y_test, star)
    y_pred = lr_classifiers[star].predict(X_test)
    print(f"{star}-star vs. rest accuracy: {accuracy_score(binary_y_test, y_pred):.2f}")

def predict_star_rating(lr_classifiers, review_text):
    """Return the star rating whose binary classifier scores the review highest.

    Args:
        lr_classifiers: Mapping from star rating to a fitted classifier that
            exposes ``predict_proba``.
        review_text: A single review; it is passed to each classifier as a
            one-element list.

    Returns:
        The key of ``lr_classifiers`` whose classifier yields the largest
        value in column 1 of the first ``predict_proba`` row. On ties the
        earliest key in the mapping's iteration order wins.

    Raises:
        ValueError: If ``lr_classifiers`` is empty.
    """
    scores = {}
    for star, model in lr_classifiers.items():
        first_row = model.predict_proba([review_text])[0]
        # Column 1 is read as the score for this star; with the 0/1 labels
        # used for training in this notebook that is the positive class.
        scores[star] = first_row[1]
    return max(scores, key=lambda star: scores[star])

# Predict a star rating for every held-out review, one review at a time.
predicted_ratings = list(
    map(lambda text: predict_star_rating(lr_classifiers, text), X_test)
)

# Share of test reviews whose predicted star equals the true rating.
overall_accuracy = accuracy_score(y_test, predicted_ratings)
print(f"Overall Accuracy: {overall_accuracy:.2f}")





Overall Accuracy: 0.65


Trained, evaluated, and saved one-vs-all Multinomial Naive Bayes classifiers.

In [7]:
from sklearn.naive_bayes import MultinomialNB

# Train one binary (one-vs-all) classifier per star rating, 1 through 5.
nb_classifiers = {}
for star in range(1, 6):
    # 1 for training reviews rated exactly `star`, 0 for every other review.
    binary_y_train = one_vs_all_labels(y_train, star)

    # Each pipeline fits its own word-level TF-IDF vectorizer (vocabulary
    # limited to 5000 terms) and feeds it into multinomial naive Bayes.
    clf = make_pipeline(
        TfidfVectorizer(analyzer='word', max_features=5000, lowercase=True),
        MultinomialNB()
    )

    clf.fit(X_train, binary_y_train)
    nb_classifiers[star] = clf

# Persist the fitted pipelines, keyed by star rating.
with open('nb_classifiers.pkl', 'wb') as f:
    pickle.dump(nb_classifiers, f)

# Evaluate every binary classifier on the held-out split and report its
# accuracy, so the per-class predictions are actually used instead of being
# computed and discarded.
for star in range(1, 6):
    binary_y_test = one_vs_all_labels(y_test, star)
    y_pred = nb_classifiers[star].predict(X_test)
    print(f"{star}-star vs. rest accuracy: {accuracy_score(binary_y_test, y_pred):.2f}")

def predict_star_rating(nb_classifiers, review_text):
    """Return the star rating whose binary classifier scores the review highest.

    Note: this re-binds the name ``predict_star_rating`` that an earlier cell
    of this notebook also defines; after this cell runs, this version is the
    one in effect.

    Args:
        nb_classifiers: Mapping from star rating to a fitted classifier that
            exposes ``predict_proba``.
        review_text: A single review; it is passed to each classifier as a
            one-element list.

    Returns:
        The key of ``nb_classifiers`` whose classifier yields the largest
        value in column 1 of the first ``predict_proba`` row. On ties the
        earliest key in the mapping's iteration order wins.

    Raises:
        ValueError: If ``nb_classifiers`` is empty.
    """
    stars = []
    column_one = []
    for star, model in nb_classifiers.items():
        stars.append(star)
        column_one.append(model.predict_proba([review_text])[0][1])
    per_star = dict(zip(stars, column_one))
    return max(per_star, key=per_star.__getitem__)

# Predict a star rating for every held-out review, one review at a time.
predicted_ratings = list(
    map(lambda text: predict_star_rating(nb_classifiers, text), X_test)
)

# Share of test reviews whose predicted star equals the true rating.
overall_accuracy = accuracy_score(y_test, predicted_ratings)
print(f"Overall Accuracy: {overall_accuracy:.2f}")

Overall Accuracy: 0.60


Trained, evaluated, and saved one-vs-all Random Forest classifiers.

In [9]:
from sklearn.ensemble import RandomForestClassifier

# Train one binary (one-vs-all) classifier per star rating, 1 through 5.
rf_classifiers = {}
for star in range(1, 6):
    # 1 for training reviews rated exactly `star`, 0 for every other review.
    binary_y_train = one_vs_all_labels(y_train, star)

    # Each pipeline fits its own word-level TF-IDF vectorizer (vocabulary
    # limited to 5000 terms) and feeds it into a 10-tree random forest whose
    # random_state is fixed so repeated runs build the same trees.
    clf = make_pipeline(
        TfidfVectorizer(analyzer='word', max_features=5000, lowercase=True),
        RandomForestClassifier(n_estimators=10, random_state=42)
    )

    clf.fit(X_train, binary_y_train)
    rf_classifiers[star] = clf

# Persist the fitted pipelines, keyed by star rating.
with open('rf_classifiers.pkl', 'wb') as f:
    pickle.dump(rf_classifiers, f)

# Evaluate every binary classifier on the held-out split and report its
# accuracy, so the per-class predictions are actually used instead of being
# computed and discarded.
for star in range(1, 6):
    binary_y_test = one_vs_all_labels(y_test, star)
    y_pred = rf_classifiers[star].predict(X_test)
    print(f"{star}-star vs. rest accuracy: {accuracy_score(binary_y_test, y_pred):.2f}")

def predict_star_rating(rf_classifiers, review_text):
    """Return the star rating whose binary classifier scores the review highest.

    Note: this re-binds the name ``predict_star_rating`` that earlier cells
    of this notebook also define; after this cell runs, this version is the
    one in effect.

    Args:
        rf_classifiers: Mapping from star rating to a fitted classifier that
            exposes ``predict_proba``.
        review_text: A single review; it is passed to each classifier as a
            one-element list.

    Returns:
        The key of ``rf_classifiers`` whose classifier yields the largest
        value in column 1 of the first ``predict_proba`` row. On ties the
        earliest key in the mapping's iteration order wins.

    Raises:
        ValueError: If ``rf_classifiers`` is empty.
    """
    batch = [review_text]
    ranked = dict(
        (star, model.predict_proba(batch)[0][1])
        for star, model in rf_classifiers.items()
    )
    return max(ranked, key=lambda star: ranked[star])

# Predict a star rating for every held-out review, one review at a time.
predicted_ratings = list(
    map(lambda text: predict_star_rating(rf_classifiers, text), X_test)
)

# Share of test reviews whose predicted star equals the true rating.
overall_accuracy = accuracy_score(y_test, predicted_ratings)
print(f"Overall Accuracy: {overall_accuracy:.2f}")

Overall Accuracy: 0.55
