# Create test data sample with 33k rows per sentiment

In [4]:
import pandas as pd
import time

# Load data from CSV
df = pd.read_csv('rt_data.csv')  # Adjust path accordingly

# Desired sample size of each category for top critics and non-top critics
sample_size = 33000

# Sample from top critics
df_top_critics = df[df['top_critic'] == True].sample(
    n=min(len(df[df['top_critic'] == True]), sample_size), random_state=42)

# Sample from non-top critics
df_not_top_critics = df[df['top_critic'] == False].sample(
    n=min(len(df[df['top_critic'] == False]), sample_size), random_state=42)

# Combine the samples
df_balanced = pd.concat([df_top_critics, df_not_top_critics])

# Shuffle the dataset
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the balanced dataset to a new CSV file
timestr = time.strftime("%Y%m%d-%H%M%S")
df_balanced.to_csv(f'balanced_rt_reviews_{timestr}.csv', index=False)


  df = pd.read_csv('rt_data.csv')  # Adjust path accordingly


# logistic regression

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import GridSearchCV

In [11]:
# Loading data from a CSV file
df = pd.read_csv('balanced_rt_reviews_20240207-193333.csv')  # Adjust the filename to your actual file path

# Count the number of NaN or empty strings in 'review_detail' for the entire dataset
missing_or_empty_count = df['review_detail'].isna().sum() + (df['review_detail'] == '').sum()

# Print the count
print("Total missing or empty 'review_detail' values in the dataset:", missing_or_empty_count)

df = df.dropna(subset=['review_detail'])

Total missing or empty 'review_detail' values in the dataset: 7


In [26]:
texts = df['review_detail'].values
# If 'top_critic' is boolean, you can directly use it as an integer target variable
top_critics = df['top_critic'].astype(int).values  # Converts boolean to 0 (False) and 1 (True)

# Vectorizing text data
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

# Splitting dataset
X_train, X_test, y_train, y_test = train_test_split(X, top_critics, test_size=0.2, random_state=42)

# Ensure the test set is not too small
if X_test.shape[0] < 1:
    raise ValueError("Test set is too small. Consider reducing the test_size parameter or adding more data.")

# Training a model with adjusted parameters
model = LogisticRegression(solver='saga', multi_class='auto', max_iter=2000, class_weight='balanced')
model.fit(X_train, y_train)

In [27]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, classification_report
import numpy as np

# Predicting
predictions = model.predict(X_test)

# Evaluating with accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

# Calculate and print MAE
mae = mean_absolute_error(y_test, predictions)
print("Mean Absolute Error (MAE):", mae)

# Calculate and print RMSE
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print("Root Mean Squared Error (RMSE):", rmse)

# Generating and printing the classification report
report = classification_report(y_test, predictions, target_names=["False","True"])
print("Classification Report:")
print(report)

Accuracy: 0.5849685582241079
Mean Absolute Error (MAE): 0.41503144177589213
Root Mean Squared Error (RMSE): 0.6442293394249381
Classification Report:
              precision    recall  f1-score   support

       False       0.59      0.56      0.58      6660
        True       0.58      0.61      0.59      6539

    accuracy                           0.58     13199
   macro avg       0.59      0.59      0.58     13199
weighted avg       0.59      0.58      0.58     13199



In [28]:
# Custom string for prediction
custom_text = "not bad. i really liked the movie, altough the ass of the actress was in my opinion not big enough. "

# Preprocess the custom string
custom_text_vectorized = vectorizer.transform([custom_text])

# Predict using the trained model
custom_prediction = model.predict(custom_text_vectorized)

# Decode the predicted sentiment
predicted_sentiment = encoder.inverse_transform([custom_prediction])

print(f"Predicted Sentiment: {predicted_sentiment}")

Predicted Sentiment: [[False]]
