In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.svm import SVC  # Import the Support Vector Classifier

In [2]:
# Loading data from a CSV file
df = pd.read_csv('balanced_imdb_reviews_20240205-173555.csv')  # Replace with your actual file path

# Handling missing or empty strings in 'review_detail'
print("Missing or empty 'review_detail' values per sentiment category:")
missing_or_empty_count = df[df['review_detail'].isna() | (df['review_detail'] == '')].groupby('sentiment').size()
print(missing_or_empty_count)
df['review_detail'] = df['review_detail'].fillna('')  # Fill NaN values

# Downsampling to 5,000 samples per sentiment category
sampled_df = pd.DataFrame()  # Initialize an empty dataframe
for sentiment in df['sentiment'].unique():
    sampled_df = pd.concat([sampled_df, df[df['sentiment'] == sentiment].sample(n=15000, random_state=42)])

# Resetting the index of the sampled dataframe
sampled_df = sampled_df.reset_index(drop=True)

print(f"New dataset size: {sampled_df.shape}")
print(sampled_df['sentiment'].value_counts())



Missing or empty 'review_detail' values per sentiment category:
sentiment
NEGATIVE    1
dtype: int64
New dataset size: (45000, 5)
sentiment
POSITIVE    15000
MEDIUM      15000
NEGATIVE    15000
Name: count, dtype: int64


In [3]:
texts = sampled_df['review_detail'].values
sentiments = sampled_df['sentiment'].values

# Encoding ordinal categories
encoder = OrdinalEncoder(categories=[["NEGATIVE", "MEDIUM", "POSITIVE"]])
y = encoder.fit_transform(sentiments.reshape(-1, 1)).flatten()  # Reshape is needed for a single feature

# Vectorizing text data
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

# Splitting dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check if the test set is too small
if X_test.shape[0] < 1:
    raise ValueError("Test set is too small. Consider reducing the test_size parameter or adding more data.")

# Training a model using SVM
model = SVC(kernel='sigmoid', C=1.0, probability=True, random_state=42)  # Initialize the SVM model
model.fit(X_train, y_train)

In [4]:
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report
import numpy as np

# Predicting hard class labels for accuracy and classification report
predictions = model.predict(X_test)

# Predicting probabilities for ROC AUC score calculation
probabilities = model.predict_proba(X_test)

# Evaluating with accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

# Calculating ROC AUC score for multi-class using the 'ovr' strategy with probabilities
ra = roc_auc_score(y_test, probabilities, multi_class='ovr', average='macro')
print("ROC AUC score:", ra)

# Generating and printing the classification report with correct target names
# Ensure the target names match your classes correctly
report = classification_report(y_test, predictions, target_names=["NEGATIVE", "MEDIUM", "POSITIVE"])
print("Classification Report:")
print(report)


Accuracy: 0.6984444444444444
ROC AUC score: 0.8645154309136474
Classification Report:
              precision    recall  f1-score   support

    NEGATIVE       0.75      0.77      0.76      3006
      MEDIUM       0.61      0.59      0.60      2988
    POSITIVE       0.73      0.73      0.73      3006

    accuracy                           0.70      9000
   macro avg       0.70      0.70      0.70      9000
weighted avg       0.70      0.70      0.70      9000

