In [None]:
# Random Forest

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import classification_report

In [None]:
# Loading data from a CSV file
df = pd.read_csv('balanced_imdb_reviews_20240205-173555.csv')  # Replace with your actual file path

# Handling missing or empty strings in 'review_detail'
print("Missing or empty 'review_detail' values per sentiment category:")
missing_or_empty_count = df[df['review_detail'].isna() | (df['review_detail'] == '')].groupby('sentiment').size()
print(missing_or_empty_count)
df['review_detail'] = df['review_detail'].fillna('')  # Fill NaN values

# Encoding and vectorizing
texts = df['review_detail'].values
sentiments = df['sentiment'].values
encoder = OrdinalEncoder(categories=[["NEGATIVE", "MEDIUM", "POSITIVE"]])
y = encoder.fit_transform(sentiments.reshape(-1, 1)).flatten()  # Reshape needed for a single feature

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

# Splitting dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training a model with Random Forest
model = RandomForestClassifier(n_estimators=100, random_state=42)  # Initialize Random Forest
model.fit(X_train, y_train)

In [5]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, classification_report
import numpy as np

# Predicting
predictions = model.predict(X_test)

# Evaluating with accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

# Calculate and print MAE
mae = mean_absolute_error(y_test, predictions)
print("Mean Absolute Error (MAE):", mae)

# Calculate and print RMSE
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print("Root Mean Squared Error (RMSE):", rmse)

# Generating and printing the classification report
report = classification_report(y_test, predictions, target_names=["NEGATIVE", "MEDIUM", "POSITIVE"])
print("Classification Report:")
print(report)

Accuracy: 0.6495959595959596
Mean Absolute Error (MAE): 0.40954545454545455
Root Mean Squared Error (RMSE): 0.7265179163849181
Classification Report:
              precision    recall  f1-score   support

    NEGATIVE       0.69      0.75      0.72      6714
      MEDIUM       0.57      0.54      0.55      6590
    POSITIVE       0.69      0.66      0.68      6496

    accuracy                           0.65     19800
   macro avg       0.65      0.65      0.65     19800
weighted avg       0.65      0.65      0.65     19800

