In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.naive_bayes import MultinomialNB  # Import the Multinomial Naive Bayes classifier

In [2]:
from sklearn.naive_bayes import MultinomialNB  # Import the Multinomial Naive Bayes classifier

def getAndTrainModel(X_train, y_train):
    model = MultinomialNB() 
    model.fit(X_train, y_train)
    return model 

In [3]:
# Loading data from a CSV file
df = pd.read_csv('balanced_rt_reviews_20240207-193333.csv')  # Adjust the filename to your actual file path

# Count the number of NaN or empty strings in 'review_detail' for the entire dataset
missing_or_empty_count = df['review_detail'].isna().sum() + (df['review_detail'] == '').sum()

# Print the count
print("Total missing or empty 'review_detail' values in the dataset:", missing_or_empty_count)

df = df.dropna(subset=['review_detail'])

Total missing or empty 'review_detail' values in the dataset: 7


In [4]:
texts = df['review_detail'].values
# Assuming 'top_critic' is boolean, convert directly to integer (1 for True, 0 for False)
top_critics = df['top_critic'].astype(int).values

# Vectorizing text data
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

# Splitting dataset
X_train, X_test, y_train, y_test = train_test_split(X, top_critics, test_size=0.2, random_state=42)

# Check if the test set is too small
if X_test.shape[0] < 1:
    raise ValueError("Test set is too small. Consider reducing the test_size parameter or adding more data.")

# Training a model using Multinomial Naive Bayes
model = getAndTrainModel(X_train=X_train, y_train=y_train)

In [6]:
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report
import numpy as np

# Predicting
predictions = model.predict(X_test)

# Evaluating with accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

# Calculate and print MAE
ra = roc_auc_score(y_test, predictions)
print("ROC AUC score:", ra)


# Generating and printing the classification report
report = classification_report(y_test, predictions, target_names=["False","True"])
print("Classification Report:")
print(report)

Accuracy: 0.5814834457155845
ROC AUC score: 0.5811398644400633
Classification Report:
              precision    recall  f1-score   support

       False       0.58      0.62      0.60      6660
        True       0.58      0.54      0.56      6539

    accuracy                           0.58     13199
   macro avg       0.58      0.58      0.58     13199
weighted avg       0.58      0.58      0.58     13199

