In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import AdaBoostClassifier

In [4]:
# Loading data from a CSV file
df = pd.read_csv('balanced_rt_reviews_20240207-193333.csv')  # Adjust the filename to your actual file path

# Count the number of NaN or empty strings in 'review_detail' for the entire dataset
missing_or_empty_count = df['review_detail'].isna().sum() + (df['review_detail'] == '').sum()

# Print the count
print("Total missing or empty 'review_detail' values in the dataset:", missing_or_empty_count)

df = df.dropna(subset=['review_detail'])

Total missing or empty 'review_detail' values in the dataset: 7


In [5]:
texts = df['review_detail'].values
top_critics = df['top_critic'].astype(int).values  # Convert boolean to integers (True/False to 1/0)

# Vectorizing text data
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

# Splitting dataset
X_train, X_test, y_train, y_test = train_test_split(X, top_critics, test_size=0.2, random_state=42)

# Training a model with AdaBoost
model = AdaBoostClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)



In [9]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, classification_report
import numpy as np

# Predicting
predictions = model.predict(X_test)

# Evaluating with accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

# Calculate and print MAE
mae = mean_absolute_error(y_test, predictions)
print("Mean Absolute Error (MAE):", mae)

# Calculate and print RMSE
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print("Root Mean Squared Error (RMSE):", rmse)

# Generating and printing the classification report
report = classification_report(y_test, predictions, target_names=["False","True"])
print("Classification Report:")
print(report)

Accuracy: 0.5540571255398137
Mean Absolute Error (MAE): 0.4459428744601864
Root Mean Squared Error (RMSE): 0.6677895435391201
Classification Report:
              precision    recall  f1-score   support

       False       0.59      0.39      0.47      6660
        True       0.54      0.72      0.62      6539

    accuracy                           0.55     13199
   macro avg       0.56      0.56      0.54     13199
weighted avg       0.56      0.55      0.54     13199



In [10]:
# Custom string for prediction
custom_text = "not bad. i really liked the movie, altough the ass of the actress was too small"

# Preprocess the custom string
custom_text_vectorized = vectorizer.transform([custom_text])

# Predict using the trained model
custom_prediction = model.predict(custom_text_vectorized)

# Decode the predicted sentiment
predicted_sentiment = encoder.inverse_transform([custom_prediction])[0]

# Optionally, if you have the actual sentiment for the custom string:
actual_sentiment = "POSITIVE"  # Replace with the actual sentiment for your custom string

# Calculate accuracy for the custom prediction (if you have the actual sentiment)
if actual_sentiment:
    accuracy_custom = 1 if custom_prediction == encoder.transform([[actual_sentiment]])[0] else 0
    print(f"Actual Sentiment: {actual_sentiment}")
    print(f"Predicted Sentiment: {predicted_sentiment}")
    print(f"Accuracy for Custom String: {accuracy_custom}")
else:
    print(f"Predicted Sentiment: {predicted_sentiment}")
    print("Actual sentiment not provided, cannot calculate accuracy.")

NameError: name 'encoder' is not defined