In [1]:
# Random Forest

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
from sklearn.ensemble import RandomForestClassifier 

def getAndTrainModel(X_train, y_train):
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    return model 

In [3]:
# Loading data from a CSV file
df = pd.read_csv('balanced_rt_reviews_20240207-193333.csv')  # Adjust the filename to your actual file path

# Count the number of NaN or empty strings in 'review_detail' for the entire dataset
missing_or_empty_count = df['review_detail'].isna().sum() + (df['review_detail'] == '').sum()

# Print the count
print("Total missing or empty 'review_detail' values in the dataset:", missing_or_empty_count)

df = df.dropna(subset=['review_detail'])

Total missing or empty 'review_detail' values in the dataset: 7


In [4]:
# Preparing the input features and target variable
texts = df['review_detail'].values
y = df['top_critic'].astype(int).values  # Assuming 'top_critic' is boolean and directly using it as target

# Vectorizing text data
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

# Splitting dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training a model with Random Forest
model = getAndTrainModel(X_train=X_train, y_train=y_train)

In [5]:
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report
import numpy as np

# Predicting
predictions = model.predict(X_test)

# Evaluating with accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

# Calculate and print MAE
ra = roc_auc_score(y_test, predictions)
print("ROC AUC score:", ra)


# Generating and printing the classification report
report = classification_report(y_test, predictions, target_names=["False","True"])
print("Classification Report:")
print(report)

Accuracy: 0.5754981437987726
ROC AUC score: 0.5759827268773591
Classification Report:
              precision    recall  f1-score   support

       False       0.59      0.52      0.55      6660
        True       0.56      0.63      0.59      6539

    accuracy                           0.58     13199
   macro avg       0.58      0.58      0.57     13199
weighted avg       0.58      0.58      0.57     13199

