In [8]:
from google.colab import drive
drive.mount('/content/drive')
import re
from sklearn.datasets import load_files
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd

# Download required NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Load the dataset
movie_data = load_files(r"/content/drive/MyDrive/Colab Notebooks/movie_reviews")
X, y = movie_data.data, movie_data.target

# Display dataset summary
print(f"Number of documents: {len(X)}")
print(f"Number of labels: {len(y)}")
print(f"Target names (classes): {movie_data.target_names}")
print(f"First document preview: {X[0].decode('utf-8')[:200]}...")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Number of documents: 2000
Number of labels: 2000
Target names (classes): ['neg', 'pos']
First document preview: arnold schwarzenegger has been an icon for action enthusiasts , since the late 80's , but lately his films have been very sloppy and the one-liners are getting worse . 
it's hard seeing arnold as mr ....


### Data Preprocessing

In [9]:
documents = []
for i in range(len(X)):
    # 1. Decode from bytes to string
    document = X[i].decode('utf-8')

    # 2. Apply regex substitutions
    document = re.sub(r'\W', ' ', document)  # Remove special characters
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)  # Single chars at beginning
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)  # Single chars in middle
    document = re.sub(r'\d+', '', document)  # Remove numbers
    document = re.sub(r'\s+', ' ', document, flags=re.I)  # Multiple spaces to one

    # 3. Convert to lowercase
    document = document.lower()

    # 4. Tokenize
    document = document.split()

    # 5. Lemmatize (reduce words to root form)
    document = [lemmatizer.lemmatize(word) for word in document]

    # 6. Rejoin tokens
    document = ' '.join(document)

    # 7. Append to processed documents
    documents.append(document)

print(f"Preprocessing complete. Sample processed document: {documents[0][:200]}...")


Preprocessing complete. Sample processed document: arnold schwarzenegger ha been an icon for action enthusiast since the late but lately his film have been very sloppy and the one liner are getting worse it hard seeing arnold a mr freeze in batman and...


### Text Vectorization

In [10]:
vectorizer = CountVectorizer(
    max_features=1500,  # Keep only top 1500 most frequent words
    min_df=7,          # Word must appear in at least 7 documents
    max_df=0.8,        # Word must appear in less than 80% of documents
    stop_words=stopwords.words('english')  # Remove common English stop words
)

X_vectors = vectorizer.fit_transform(documents).toarray()
print(f"Vectorized data shape: {X_vectors.shape}")
print(f"Sample features: {list(vectorizer.get_feature_names_out())[:20]}")


Vectorized data shape: (2000, 1500)
Sample features: ['ability', 'able', 'absolutely', 'academy', 'accent', 'accident', 'across', 'act', 'acting', 'action', 'actor', 'actress', 'actual', 'actually', 'ad', 'adam', 'adaptation', 'add', 'added', 'addition']


### Model Training and Evaluation

In [11]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vectors, y, test_size=0.2, random_state=0)

# Initialize models
models = {
    'Logistic Regression': LogisticRegression(random_state=0),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=0),
    'Support Vector Machine': SVC(kernel='linear', random_state=0),
    'Naive Bayes': MultinomialNB()
}

# Train and evaluate each model
results = {}
for name, model in models.items():
    print(f"\n{'='*50}")
    print(f"Training {name}...")

    # Train model
    model.fit(X_train, y_train)

    # Make predictions
    predictions = model.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, predictions)

    # Store results
    results[name] = {
        'accuracy': accuracy,
        'predictions': predictions,
        'model': model
    }

    # Print detailed results
    print(f"\n{name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("\nClassification Report:")
    print(classification_report(y_test, predictions, target_names=movie_data.target_names))



Training Logistic Regression...

Logistic Regression Results:
Accuracy: 0.8200

Confusion Matrix:
[[164  44]
 [ 28 164]]

Classification Report:
              precision    recall  f1-score   support

         neg       0.85      0.79      0.82       208
         pos       0.79      0.85      0.82       192

    accuracy                           0.82       400
   macro avg       0.82      0.82      0.82       400
weighted avg       0.82      0.82      0.82       400


Training Random Forest...

Random Forest Results:
Accuracy: 0.8275

Confusion Matrix:
[[168  40]
 [ 29 163]]

Classification Report:
              precision    recall  f1-score   support

         neg       0.85      0.81      0.83       208
         pos       0.80      0.85      0.83       192

    accuracy                           0.83       400
   macro avg       0.83      0.83      0.83       400
weighted avg       0.83      0.83      0.83       400


Training Support Vector Machine...

Support Vector Machine Result

### Performance Comparison

In [12]:
# Create comparison table
comparison_data = []
for name, result in results.items():
    predictions = result['predictions']

    # Calculate detailed metrics
    from sklearn.metrics import precision_recall_fscore_support
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, predictions, average='weighted')

    comparison_data.append({
        'Model': name,
        'Accuracy': f"{result['accuracy']:.4f}",
        'Precision': f"{precision:.4f}",
        'Recall': f"{recall:.4f}",
        'F1-Score': f"{f1:.4f}"
    })

# Display comparison
comparison_df = pd.DataFrame(comparison_data)
print("\n" + "="*60)
print("MODEL PERFORMANCE COMPARISON")
print("="*60)
print(comparison_df.to_string(index=False))

# Find best model
best_model_name = max(results.keys(), key=lambda x: results[x]['accuracy'])
print(f"\nBest Model: {best_model_name} with accuracy: {results[best_model_name]['accuracy']:.4f}")



MODEL PERFORMANCE COMPARISON
                 Model Accuracy Precision Recall F1-Score
   Logistic Regression   0.8200    0.8226 0.8200   0.8200
         Random Forest   0.8275    0.8289 0.8275   0.8276
Support Vector Machine   0.8100    0.8112 0.8100   0.8101
           Naive Bayes   0.8150    0.8162 0.8150   0.8151

Best Model: Random Forest with accuracy: 0.8275


New Data Pre Processings

In [6]:
!pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.3-py3-none-any.whl.metadata (1.6 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.3-py3-none-any.whl (345 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.1/345.1 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (113 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m113.9/113.9 kB[0m 

In [13]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download only the essential NLTK resources that work across all versions
try:
    nltk.download('stopwords')
    nltk.download('wordnet')
    nltk.download('omw-1.4')
except:
    pass

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

def robust_enhanced_preprocessing(X):
    """Robust enhanced preprocessing that works across all NLTK versions"""
    documents = []

    # Get English stop words
    try:
        stop_words = set(stopwords.words('english'))
    except:
        # Fallback stop words if NLTK fails
        stop_words = {'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
                     'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers',
                     'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
                     'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
                     'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
                     'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
                     'while', 'of', 'at', 'by', 'for', 'with', 'through', 'during', 'before', 'after',
                     'above', 'below', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
                     'further', 'then', 'once'}

    # Preserve important negation words
    negation_words = {'not', 'no', 'never', 'neither', 'nor', 'none', 'nobody', 'nothing', 'nowhere'}
    stop_words = stop_words - negation_words

    for i in range(len(X)):
        # 1. Decode from bytes to string
        document = X[i].decode('utf-8')

        # 2. Basic HTML tag removal (simple regex approach)
        document = re.sub(r'<[^>]+>', ' ', document)

        # 3. Handle contractions manually (most common ones)
        contractions_dict = {
            "don't": "do not", "won't": "will not", "can't": "cannot",
            "n't": " not", "'re": " are", "'ve": " have", "'ll": " will",
            "'d": " would", "'m": " am", "it's": "it is", "that's": "that is",
            "there's": "there is", "here's": "here is", "what's": "what is",
            "where's": "where is", "how's": "how is", "let's": "let us"
        }

        for contraction, expansion in contractions_dict.items():
            document = document.replace(contraction, expansion)

        # 4. Handle negation (add NOT_ prefix to words after negation)
        document = handle_negation_robust(document)

        # 5. Enhanced regex cleaning
        document = re.sub(r'[^\w\s!?]', ' ', document)  # Keep ! and ? for emotion
        document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)  # Single chars at beginning
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)  # Single chars in middle
        document = re.sub(r'\d+', '', document)  # Remove numbers
        document = re.sub(r'!+', '!', document)  # Multiple ! to single !
        document = re.sub(r'\?+', '?', document)  # Multiple ? to single ?
        document = re.sub(r'\s+', ' ', document, flags=re.I)  # Multiple spaces to one

        # 6. Convert to lowercase
        document = document.lower()

        # 7. Tokenize
        tokens = document.split()

        # 8. Filter word length (remove very short/long words)
        tokens = [word for word in tokens if 2 <= len(word) <= 15]

        # 9. Remove stop words (preserving negation words)
        tokens = [word for word in tokens if word not in stop_words]

        # 10. Lemmatize with error handling
        lemmatized_tokens = []
        for word in tokens:
            try:
                lemmatized_tokens.append(lemmatizer.lemmatize(word))
            except:
                lemmatized_tokens.append(word)  # Keep original if lemmatization fails

        # 11. Join tokens back
        document = ' '.join(lemmatized_tokens)

        # 12. Final cleanup and append
        document = document.strip()
        if document:  # Only add non-empty documents
            documents.append(document)
        else:
            documents.append("empty")  # Placeholder for empty documents

    return documents

def handle_negation_robust(text):
    """Robust negation handling without external dependencies"""
    negation_words = ['not', 'no', 'never', 'neither', 'nor', 'none', 'nobody', 'nothing', 'nowhere']
    tokens = text.split()

    result = []
    negate = False

    for token in tokens:
        clean_token = re.sub(r'[^\w]', '', token.lower())  # Remove punctuation for comparison

        if clean_token in negation_words:
            negate = True
            result.append(token)
        elif negate and clean_token.isalpha() and len(clean_token) > 1:
            result.append(f"NOT_{clean_token}")
            negate = False
        else:
            result.append(token)
            # Reset negation at sentence boundaries
            if token.endswith(('.', '!', '?', ';')):
                negate = False

    return ' '.join(result)

# Usage - This will work without any NLTK compatibility issues
documents = robust_enhanced_preprocessing(X)
print(f"Robust enhanced preprocessing complete.\n Sample processed document: {documents[0][:200]}...")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Robust enhanced preprocessing complete.
 Sample processed document: arnold schwarzenegger icon action enthusiast since late lately film sloppy one liner getting worse hard seeing arnold mr freeze batman robin especially say ton ice joke hey got million matter arnold s...


## After Using Different Pre-Processing Techniques

In [14]:


vectorizer = CountVectorizer(
    max_features=1500,  # Keep only top 1500 most frequent words
    min_df=7,          # Word must appear in at least 7 documents
    max_df=0.8,        # Word must appear in less than 80% of documents
    stop_words=stopwords.words('english')  # Remove common English stop words
)

X_vectors = vectorizer.fit_transform(documents).toarray()
print(f"Vectorized data shape: {X_vectors.shape}")
print(f"Sample features: {list(vectorizer.get_feature_names_out())[:20]}")

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vectors, y, test_size=0.2, random_state=0)

# Initialize models
models = {
    'Logistic Regression': LogisticRegression(random_state=0),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=0),
    'Support Vector Machine': SVC(kernel='linear', random_state=0),
    'Naive Bayes': MultinomialNB()
}

# Train and evaluate each model
results = {}
for name, model in models.items():
    print(f"\n{'='*50}")
    print(f"Training {name}...")

    # Train model
    model.fit(X_train, y_train)

    # Make predictions
    predictions = model.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, predictions)

    # Store results
    results[name] = {
        'accuracy': accuracy,
        'predictions': predictions,
        'model': model
    }

    # Print detailed results
    print(f"\n{name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("\nClassification Report:")
    print(classification_report(y_test, predictions, target_names=movie_data.target_names))

# Create comparison table
comparison_data = []
for name, result in results.items():
    predictions = result['predictions']

    # Calculate detailed metrics
    from sklearn.metrics import precision_recall_fscore_support
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, predictions, average='weighted')

    comparison_data.append({
        'Model': name,
        'Accuracy': f"{result['accuracy']:.4f}",
        'Precision': f"{precision:.4f}",
        'Recall': f"{recall:.4f}",
        'F1-Score': f"{f1:.4f}"
    })

# Display comparison
comparison_df = pd.DataFrame(comparison_data)
print("\n" + "="*60)
print("MODEL PERFORMANCE COMPARISON")
print("="*60)
print(comparison_df.to_string(index=False))

# Find best model
best_model_name = max(results.keys(), key=lambda x: results[x]['accuracy'])
print(f"\nBest Model: {best_model_name} with accuracy: {results[best_model_name]['accuracy']:.4f}")


Vectorized data shape: (2000, 1500)
Sample features: ['ability', 'able', 'absolutely', 'academy', 'accent', 'accident', 'across', 'act', 'acting', 'action', 'actor', 'actress', 'actual', 'actually', 'adam', 'adaptation', 'add', 'added', 'addition', 'admit']

Training Logistic Regression...

Logistic Regression Results:
Accuracy: 0.8100

Confusion Matrix:
[[160  48]
 [ 28 164]]

Classification Report:
              precision    recall  f1-score   support

         neg       0.85      0.77      0.81       208
         pos       0.77      0.85      0.81       192

    accuracy                           0.81       400
   macro avg       0.81      0.81      0.81       400
weighted avg       0.81      0.81      0.81       400


Training Random Forest...

Random Forest Results:
Accuracy: 0.8200

Confusion Matrix:
[[160  48]
 [ 24 168]]

Classification Report:
              precision    recall  f1-score   support

         neg       0.87      0.77      0.82       208
         pos       0.78   