In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import AdaBoostClassifier

In [2]:
# Loading data from a CSV file
df = pd.read_csv('balanced_imdb_reviews_20240205-173555.csv')  # Replace with your actual file path

# Handling missing or empty strings in 'review_detail'
print("Missing or empty 'review_detail' values per sentiment category:")
missing_or_empty_count = df[df['review_detail'].isna() | (df['review_detail'] == '')].groupby('sentiment').size()
print(missing_or_empty_count)
df['review_detail'] = df['review_detail'].fillna('')  # Fill NaN values

Missing or empty 'review_detail' values per sentiment category:
sentiment
NEGATIVE    1
dtype: int64


In [3]:
# Encoding ordinal categories
encoder = OrdinalEncoder(categories=[["NEGATIVE", "MEDIUM", "POSITIVE"]])
y = encoder.fit_transform(df['sentiment'].values.reshape(-1, 1)).flatten()

# Vectorizing text data
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['review_detail'].values)

# Splitting dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training a model with AdaBoost
model = AdaBoostClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)



In [7]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, classification_report
import numpy as np

# Predicting
predictions = model.predict(X_test)

# Evaluating with accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

# Calculate and print MAE
mae = mean_absolute_error(y_test, predictions)
print("Mean Absolute Error (MAE):", mae)

# Calculate and print RMSE
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print("Root Mean Squared Error (RMSE):", rmse)

# Generating and printing the classification report
report = classification_report(y_test, predictions, target_names=["NEGATIVE", "MEDIUM", "POSITIVE"])
print("Classification Report:")
print(report)

Accuracy: 0.6876767676767677
Mean Absolute Error (MAE): 0.366010101010101
Root Mean Squared Error (RMSE): 0.6880289517046781
Classification Report:
              precision    recall  f1-score   support

    NEGATIVE       0.75      0.74      0.74      6714
      MEDIUM       0.62      0.59      0.60      6590
    POSITIVE       0.70      0.73      0.71      6496

    accuracy                           0.69     19800
   macro avg       0.69      0.69      0.69     19800
weighted avg       0.69      0.69      0.69     19800



In [8]:
# Custom string for prediction
custom_text = "not bad. i really liked the movie, altough the ass of the actress was too small"

# Preprocess the custom string
custom_text_vectorized = vectorizer.transform([custom_text])

# Predict using the trained model
custom_prediction = model.predict(custom_text_vectorized)

# Decode the predicted sentiment
predicted_sentiment = encoder.inverse_transform([custom_prediction])[0]

# Optionally, if you have the actual sentiment for the custom string:
actual_sentiment = "POSITIVE"  # Replace with the actual sentiment for your custom string

# Calculate accuracy for the custom prediction (if you have the actual sentiment)
if actual_sentiment:
    accuracy_custom = 1 if custom_prediction == encoder.transform([[actual_sentiment]])[0] else 0
    print(f"Actual Sentiment: {actual_sentiment}")
    print(f"Predicted Sentiment: {predicted_sentiment}")
    print(f"Accuracy for Custom String: {accuracy_custom}")
else:
    print(f"Predicted Sentiment: {predicted_sentiment}")
    print("Actual sentiment not provided, cannot calculate accuracy.")

Actual Sentiment: POSITIVE
Predicted Sentiment: ['NEGATIVE']
Accuracy for Custom String: 0
