## Installing requirements

In [1]:
!pip install pandas numpy nltk matplotlib seaborn plotly scikit-learn
!pip install kaleido




# Model Comparison

In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
import re
from time import time
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

class ImprovedModelComparison:
    def __init__(self, sample_size=10000):
        self.sample_size = sample_size
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

        self.vectorizer = TfidfVectorizer(
            max_features=15000,
            min_df=2,
            max_df=0.95,
            ngram_range=(1, 3),
            stop_words='english',
            sublinear_tf=True
        )

        self.scaler = StandardScaler()

        self.models = {
            'Logistic Regression': LogisticRegression(
                max_iter=2000,
                random_state=42,
                C=2.0,
                solver='liblinear',
                class_weight='balanced'
            ),
            'Linear SVM': LinearSVC(
                max_iter=2000,
                random_state=42,
                C=1.0,
                class_weight='balanced'
            ),
            'Naive Bayes': MultinomialNB(alpha=0.1),
            'Random Forest': RandomForestClassifier(
                n_estimators=100,
                random_state=42,
                max_depth=20,
                min_samples_split=5,
                class_weight='balanced'
            ),
            'Neural Network': MLPClassifier(
                hidden_layer_sizes=(100, 50),
                max_iter=1000,
                random_state=42,
                alpha=0.001
            )
        }

        self.results = {}

    def preprocess_text(self, text):
        if pd.isna(text) or text == '':
            return ''

        text = str(text).lower()

        contractions = {
            "won't": "will not", "can't": "cannot", "n't": " not",
            "'re": " are", "'ve": " have", "'ll": " will",
            "'d": " would", "'m": " am", "it's": "it is",
            "that's": "that is", "what's": "what is", "there's": "there is"
        }

        for contraction, expansion in contractions.items():
            text = text.replace(contraction, expansion)

        text = re.sub(r'<[^>]+>', '', text)
        text = re.sub(r'[^a-zA-Z\s]', '', text)

        try:
            tokens = word_tokenize(text)
        except:
            tokens = text.split()

        negation_words = {"not", "no", "never", "none", "nobody", "nothing",
                         "neither", "nowhere", "hardly", "scarcely", "barely"}

        processed_tokens = []
        negate = False

        for i, token in enumerate(tokens):
            if token in negation_words:
                negate = True
                processed_tokens.append(token)
            elif negate and i < len(tokens):
                processed_tokens.append(f"NOT_{token}")
                negate = False
            elif token not in self.stop_words or len(token) <= 2:
                try:
                    lemmatized = self.lemmatizer.lemmatize(token)
                    if len(lemmatized) > 2:
                        processed_tokens.append(lemmatized)
                except:
                    if len(token) > 2:
                        processed_tokens.append(token)

        return ' '.join(processed_tokens)

    def prepare_data(self, data_path):
        print("Loading dataset...")
        df = pd.read_csv(data_path)

        print(f"Original dataset size: {len(df):,}")
        df = df.drop_duplicates(subset=['review'])
        df = df.dropna(subset=['review', 'sentiment'])
        df = df[df['review'].str.len() > 0]
        print(f"After cleaning: {len(df):,}")

        if len(df) > self.sample_size:
            df = df.sample(n=self.sample_size, random_state=42)

        print(f"Using {len(df):,} reviews for analysis")

        sentiment_counts = df['sentiment'].value_counts()
        print(f"Sentiment distribution:")
        for sentiment, count in sentiment_counts.items():
            print(f"  {sentiment}: {count:,} ({count/len(df)*100:.1f}%)")

        processed_reviews = []
        batch_size = 1000

        print("Processing reviews...")
        for i in range(0, len(df), batch_size):
            batch = df['review'].iloc[i:i+batch_size]
            processed_batch = [self.preprocess_text(review) for review in batch]
            processed_reviews.extend(processed_batch)
            print(f"Processed {min(i+batch_size, len(df)):,} reviews")

        df['processed_review'] = processed_reviews
        df = df[df['processed_review'].str.len() > 0]
        print(f"Final dataset size: {len(df):,}")

        X_train, X_test, y_train, y_test = train_test_split(
            df['processed_review'],
            df['sentiment'],
            test_size=0.2,
            random_state=42,
            stratify=df['sentiment']
        )

        print("Vectorizing text...")
        X_train_vec = self.vectorizer.fit_transform(X_train)
        X_test_vec = self.vectorizer.transform(X_test)

        self.X_train_raw = X_train_vec.toarray()
        self.X_test_raw = X_test_vec.toarray()

        print("Applying feature scaling...")
        X_train_scaled = self.scaler.fit_transform(self.X_train_raw)
        X_test_scaled = self.scaler.transform(self.X_test_raw)

        print("Data preparation completed successfully")
        return X_train_scaled, X_test_scaled, y_train, y_test

    def train_and_evaluate(self, X_train, X_test, y_train, y_test):
        print("\n" + "="*60)
        print("TRAINING AND EVALUATING MODELS")
        print("="*60)

        for name, model in self.models.items():
            print(f"\nTraining {name}...")
            start_time = time()

            try:

                if name == 'Naive Bayes':
                    # Naive Bayes works with raw TF-IDF (non-negative)
                    X_train_model = self.X_train_raw
                    X_test_model = self.X_test_raw
                else:
                    # Other models can use scaled data
                    X_train_model = X_train
                    X_test_model = X_test

                model.fit(X_train_model, y_train)
                y_pred = model.predict(X_test_model)
                training_time = time() - start_time

                class_report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

                self.results[name] = {
                    'accuracy': accuracy_score(y_test, y_pred),
                    'training_time': training_time,
                    'confusion_matrix': confusion_matrix(y_test, y_pred),
                    'classification_report': class_report,
                    'precision': class_report['weighted avg']['precision'],
                    'recall': class_report['weighted avg']['recall'],
                    'f1_score': class_report['weighted avg']['f1-score']
                }

                print(f"✓ {name} completed in {training_time:.2f}s - Accuracy: {self.results[name]['accuracy']:.4f}")

            except Exception as e:
                print(f"✗ {name} failed: {str(e)}")
                continue

    def generate_detailed_report(self):
        if not self.results:
            print("No results to report.")
            return None

        print("\n" + "="*80)
        print("DETAILED MODEL COMPARISON REPORT")
        print("="*80)

        comparison_data = []
        for name, results in self.results.items():
            comparison_data.append({
                'Model': name,
                'Accuracy': results['accuracy'],
                'Precision': results['precision'],
                'Recall': results['recall'],
                'F1-Score': results['f1_score'],
                'Training Time (s)': results['training_time']
            })

        df_comparison = pd.DataFrame(comparison_data)
        df_comparison = df_comparison.sort_values('Accuracy', ascending=False)

        print("\nOverall Performance Ranking:")
        print("-" * 80)
        print(df_comparison.to_string(index=False, float_format='%.4f'))

        if len(df_comparison) > 0:
            best_model = df_comparison.iloc[0]
            print(f"\n🏆 BEST MODEL: {best_model['Model']}")
            print(f"   Accuracy: {best_model['Accuracy']:.4f}")
            print(f"   Precision: {best_model['Precision']:.4f}")
            print(f"   Recall: {best_model['Recall']:.4f}")
            print(f"   F1-Score: {best_model['F1-Score']:.4f}")
            print(f"   Training Time: {best_model['Training Time (s)']:.2f}s")

        return df_comparison

    def find_best_model(self):
        if not self.results:
            return None, None

        best_accuracy = 0
        best_model = None

        for name, results in self.results.items():
            if results['accuracy'] > best_accuracy:
                best_accuracy = results['accuracy']
                best_model = name

        return best_model, best_accuracy

if __name__ == "__main__":
    print("Starting Model Comparison...")
    print("="*60)

    try:
        comparison = ImprovedModelComparison(sample_size=20000)

        X_train, X_test, y_train, y_test = comparison.prepare_data('IMDB_Dataset.csv')
        comparison.train_and_evaluate(X_train, X_test, y_train, y_test)
        df_comparison = comparison.generate_detailed_report()
        best_model, best_accuracy = comparison.find_best_model()

        if best_model:
            print(f"\n🎉 FINAL RECOMMENDATION: Use {best_model} with {best_accuracy:.1%} accuracy")

        # Results for Streamlit
        if comparison.results:
            print("\n" + "="*60)
            print("RESULTS FOR STREAMLIT APP:")
            print("="*60)
            print("MODEL_COMPARISON_RESULTS = {")
            for name, results in comparison.results.items():
                print(f"    '{name}': {{'accuracy': {results['accuracy']:.4f}, 'training_time': {results['training_time']:.2f}, 'precision': {results['precision']:.4f}, 'recall': {results['recall']:.4f}, 'f1_score': {results['f1_score']:.4f}}},")
            print("}")

    except Exception as e:
        print(f"\n Error: {str(e)}")

Starting Model Comparison...
Loading dataset...
Original dataset size: 50,000
After cleaning: 49,582
Using 20,000 reviews for analysis
Sentiment distribution:
  positive: 10,012 (50.1%)
  negative: 9,988 (49.9%)
Processing reviews...
Processed 1,000 reviews
Processed 2,000 reviews
Processed 3,000 reviews
Processed 4,000 reviews
Processed 5,000 reviews
Processed 6,000 reviews
Processed 7,000 reviews
Processed 8,000 reviews
Processed 9,000 reviews
Processed 10,000 reviews
Processed 11,000 reviews
Processed 12,000 reviews
Processed 13,000 reviews
Processed 14,000 reviews
Processed 15,000 reviews
Processed 16,000 reviews
Processed 17,000 reviews
Processed 18,000 reviews
Processed 19,000 reviews
Processed 20,000 reviews
Final dataset size: 20,000
Vectorizing text...
Applying feature scaling...
Data preparation completed successfully

TRAINING AND EVALUATING MODELS

Training Logistic Regression...
✓ Logistic Regression completed in 82.68s - Accuracy: 0.8393

Training Linear SVM...
✓ Linear S