# Create test data sample with 33k rows per sentiment

In [11]:
import pandas as pd
import time

# Load data from CSV
df = pd.read_csv('../../data/gold/imdb_negotions_merged/merged_imdb_data.csv')  # Adjust path accordingly

df.head()

# Balance the dataset to contain 33k samples of each sentiment
sample_size = 33000  # Desired sample size of each category

# Sample 33k instances of each category
df_balanced = pd.concat([
    df[df['sentiment'] == 'POSITIVE'].sample(n=sample_size, random_state=42),
    df[df['sentiment'] == 'MEDIUM'].sample(n=sample_size, random_state=42),
    df[df['sentiment'] == 'NEGATIVE'].sample(n=sample_size, random_state=42)
])

# Shuffle the dataset
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the balanced dataset to a new CSV file
import time
timestr = time.strftime("%Y%m%d-%H%M%S")
df_balanced.to_csv(f'balanced_imdb_reviews_{timestr}.csv', index=False)


KeyboardInterrupt: 

# logistic regression

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OrdinalEncoder

In [3]:
# Loading data from a CSV file
df = pd.read_csv('balanced_imdb_reviews_20240205-173555.csv')  # Replace 'path_to_your_file.csv' with your actual file path

# Count the number of NaN or empty strings in 'review_detail' per 'sentiment' category
missing_or_empty_count = df[df['review_detail'].isna() | (df['review_detail'] == '')].groupby('sentiment').size()

# Print the counts
print("Missing or empty 'review_detail' values per sentiment category:")
print(missing_or_empty_count)
df['review_detail'] = df['review_detail'].fillna('')

Missing or empty 'review_detail' values per sentiment category:
sentiment
NEGATIVE    1
dtype: int64


In [4]:
texts = df['review_detail'].values
sentiments = df['sentiment'].values

# Encoding ordinal categories
encoder = OrdinalEncoder(categories=[["NEGATIVE", "MEDIUM", "POSITIVE"]])
y = encoder.fit_transform(sentiments.reshape(-1, 1)).flatten()  # Reshape is needed for a single feature

# Vectorizing text data
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

# Splitting dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check if the test set is too small
if X_test.shape[0] < 1:
    raise ValueError("Test set is too small. Consider reducing the test_size parameter or adding more data.")

# Training a model
model = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=2000)  # Added max_iter for convergence
model.fit(X_train, y_train)

In [6]:
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report
import numpy as np

# Predicting hard class labels for accuracy and classification report
predictions = model.predict(X_test)

# Predicting probabilities for ROC AUC score calculation
probabilities = model.predict_proba(X_test)

# Evaluating with accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

# Calculating ROC AUC score for multi-class using the 'ovr' strategy with probabilities
ra = roc_auc_score(y_test, probabilities, multi_class='ovr', average='macro')
print("ROC AUC score:", ra)

# Generating and printing the classification report with correct target names
# Ensure the target names match your classes correctly
report = classification_report(y_test, predictions, target_names=["NEGATIVE", "MEDIUM", "POSITIVE"])
print("Classification Report:")
print(report)


Accuracy: 0.7188888888888889
ROC AUC score: 0.8791859663798297
Classification Report:
              precision    recall  f1-score   support

    NEGATIVE       0.78      0.78      0.78      6714
      MEDIUM       0.63      0.62      0.63      6590
    POSITIVE       0.74      0.75      0.75      6496

    accuracy                           0.72     19800
   macro avg       0.72      0.72      0.72     19800
weighted avg       0.72      0.72      0.72     19800

