<a href="https://colab.research.google.com/github/josh130588/MLAIMAR2024/blob/main/Sentiment_Analysis_on_Movie_Reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import ConvergenceWarning
import warnings

# Suppress convergence warnings
warnings.simplefilter('ignore', ConvergenceWarning)

# Load the dataset (assuming it's TSV)
try:
    df = pd.read_csv('labeledTrainData.tsv', sep='\t', error_bad_lines=False, warning_bad_lines=False)
except Exception as e:
    print(f"Error loading data: {e}")

# Inspect the dataset
print(df.head())

# Ensure the dataset has 'review' and 'sentiment' columns
if 'review' not in df.columns or 'sentiment' not in df.columns:
    raise ValueError("Dataset must contain 'review' and 'sentiment' columns")

# Prepare data
X = df['review']
y = df['sentiment']

# Feature Extraction with reduced max_features
vectorizer_tfidf = TfidfVectorizer(max_features=1000)
X_tfidf = vectorizer_tfidf.fit_transform(X)

# Dimensionality Reduction with fewer components
nmf = NMF(n_components=50, random_state=1, max_iter=200)
X_tfidf_nmf = nmf.fit_transform(X_tfidf)

# Train/Test Split with Stratification
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf_nmf, y, test_size=0.2, random_state=1, stratify=y
)

# Standardize Data for Logistic Regression and SVM
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model Evaluation Function
def evaluate_model(model, X_train, X_test, y_train, y_test):
    try:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        conf_matrix = confusion_matrix(y_test, y_pred)
        class_report = classification_report(y_test, y_pred, zero_division=0)

        print(f"Accuracy: {accuracy:.4f}")
        print("Confusion Matrix:")
        print(conf_matrix)
        print("Classification Report:")
        print(class_report)
    except ValueError as e:
        print(f"Error during model evaluation: {e}")

# Naive Bayes with TF-IDF with NMF Features
print("Naive Bayes with TF-IDF with NMF Features")
evaluate_model(MultinomialNB(), X_train, X_test, y_train, y_test)

# Logistic Regression with TF-IDF with NMF Features
print("\nLogistic Regression with TF-IDF with NMF Features")
evaluate_model(LogisticRegression(max_iter=1000, random_state=1), X_train_scaled, X_test_scaled, y_train, y_test)

# Support Vector Machine with TF-IDF with NMF Features
print("\nSupport Vector Machine with TF-IDF with NMF Features")
evaluate_model(SVC(), X_train_scaled, X_test_scaled, y_train, y_test)

Error loading data: read_csv() got an unexpected keyword argument 'error_bad_lines'
       id  sentiment                                             review
0  5814_8          1  With all this stuff going down at the moment w...
1  2381_9          1  \The Classic War of the Worlds\" by Timothy Hi...
2  7759_3          0  The film starts with a manager (Nicholas Bell)...
3  3630_4          0  It must be assumed that those who praised this...
4  9495_8          1  Superbly trashy and wondrously unpretentious 8...
Naive Bayes with TF-IDF with NMF Features
Accuracy: 0.7282
Confusion Matrix:
[[1798  702]
 [ 657 1843]]
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.72      0.73      2500
           1       0.72      0.74      0.73      2500

    accuracy                           0.73      5000
   macro avg       0.73      0.73      0.73      5000
weighted avg       0.73      0.73      0.73      5000


Logistic Regression with TF-I