In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis  # Import LDA classifier

In [5]:
# Loading data from a CSV file
df = pd.read_csv('balanced_imdb_reviews_20240205-173555.csv')  # Replace with your actual file path

# Handling missing or empty strings in 'review_detail'
print("Missing or empty 'review_detail' values per sentiment category:")
missing_or_empty_count = df[df['review_detail'].isna() | (df['review_detail'] == '')].groupby('sentiment').size()
print(missing_or_empty_count)
df['review_detail'] = df['review_detail'].fillna('')  # Fill NaN values

# Downsampling to 5,000 samples per sentiment category
sampled_df = pd.DataFrame()  # Initialize an empty dataframe
for sentiment in df['sentiment'].unique():
    sampled_df = pd.concat([sampled_df, df[df['sentiment'] == sentiment].sample(n=5000, random_state=42)])

# Resetting the index of the sampled dataframe
sampled_df = sampled_df.reset_index(drop=True)

print(f"New dataset size: {sampled_df.shape}")
print(sampled_df['sentiment'].value_counts())

Missing or empty 'review_detail' values per sentiment category:
sentiment
NEGATIVE    1
dtype: int64
New dataset size: (15000, 5)
sentiment
POSITIVE    5000
MEDIUM      5000
NEGATIVE    5000
Name: count, dtype: int64


In [6]:
texts = sampled_df['review_detail'].values
sentiments = sampled_df['sentiment'].values

# Encoding ordinal categories
encoder = OrdinalEncoder(categories=[["NEGATIVE", "MEDIUM", "POSITIVE"]])
y = encoder.fit_transform(sentiments.reshape(-1, 1)).flatten()  # Reshape is needed for a single feature

# Vectorizing text data
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

# Splitting dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check if the test set is too small
if X_test.shape[0] < 1:
    raise ValueError("Test set is too small. Consider reducing the test_size parameter or adding more data.")

# Training a model using Linear Discriminant Analysis
model = LinearDiscriminantAnalysis()  # Initialize the LDA model
model.fit(X_train.toarray(), y_train)  # Convert sparse matrix to array for LDA

MemoryError: Unable to allocate 2.85 GiB for an array with shape (4007, 95391) and data type float64

In [5]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, classification_report
import numpy as np

# Predicting
predictions = model.predict(X_test)

# Evaluating with accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

# Generating and printing the classification report
report = classification_report(y_test, predictions, target_names=["NEGATIVE", "MEDIUM", "POSITIVE"])
print("Classification Report:")
print(report)

In [13]:
# Custom string for prediction
custom_text = "not bad. i really liked the movie, altough the ass of the actress was too small"

# Preprocess the custom string
custom_text_vectorized = vectorizer.transform([custom_text])

# Predict using the trained model
custom_prediction = model.predict(custom_text_vectorized)

# Decode the predicted sentiment
predicted_sentiment = encoder.inverse_transform([custom_prediction])[0]

# Optionally, if you have the actual sentiment for the custom string:
actual_sentiment = "POSITIVE"  # Replace with the actual sentiment for your custom string

# Calculate accuracy for the custom prediction (if you have the actual sentiment)
if actual_sentiment:
    accuracy_custom = 1 if custom_prediction == encoder.transform([[actual_sentiment]])[0] else 0
    print(f"Actual Sentiment: {actual_sentiment}")
    print(f"Predicted Sentiment: {predicted_sentiment}")
    print(f"Accuracy for Custom String: {accuracy_custom}")
else:
    print(f"Predicted Sentiment: {predicted_sentiment}")
    print("Actual sentiment not provided, cannot calculate accuracy.")

Actual Sentiment: POSITIVE
Predicted Sentiment: ['NEGATIVE']
Accuracy for Custom String: 0
