In [17]:
!pip install -U scikit-learn nltk
!pip install tensorflow




In [18]:
from tensorflow.keras.datasets import imdb
import numpy as np

# Fetch the IMDB train/test splits, passing num_words=10000 to the loader
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=10000)

# Sanity check: size of each split and the distinct label values
print(
    f"Training samples: {len(X_train)}",
    f"Testing samples: {len(X_test)}",
    f"Unique labels: {np.unique(y_train)}",
    sep="\n",
)


Training samples: 25000
Testing samples: 25000
Unique labels: [0 1]


In [19]:
# Fetch the word -> integer index mapping that ships with the dataset
word_index = imdb.get_word_index()

# Invert it so an integer index can be looked up as a word
reverse_word_index = dict(zip(word_index.values(), word_index.keys()))

# Decoding a sample
def decode_review(encoded_review, index_to_word=None, index_from=3, unknown='?'):
    """Turn a sequence of integer word indices back into a space-joined string.

    Args:
        encoded_review: Iterable of integer word indices.
        index_to_word: Mapping from integer index to word. When omitted, the
            notebook-level ``reverse_word_index`` is used.
        index_from: Offset subtracted from every index before the lookup.
            The default of 3 is the "off-by-three" adjustment this notebook
            has always applied (presumably matching the reserved indices used
            by ``imdb.load_data`` - confirm if the loader arguments change).
        unknown: Placeholder emitted for indices missing from the mapping.

    Returns:
        The decoded review as a single string; an empty string for an empty
        input sequence.
    """
    if index_to_word is None:
        # Resolved at call time so the function can be defined (and tested)
        # without the global mapping being present.
        index_to_word = reverse_word_index
    return ' '.join(index_to_word.get(i - index_from, unknown) for i in encoded_review)

# Demo: decode the first training review back into text and show it
first_review_text = decode_review(X_train[0])
print(first_review_text)


? this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert ? is an amazing actor and now the same being director ? father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for ? and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also ? to the two little boy's that played the ? of norman and paul they were just brilliant children are often left out of the ? list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have done don't you thi

In [20]:
import string
from nltk.corpus import stopwords
import nltk

# Download the NLTK 'stopwords' package; preprocess_text below reads it via
# stopwords.words('english').
nltk.download('stopwords')

# Built once here instead of on every preprocess_text call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it only on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text, stop_words=None):
    """Lowercase a review, strip punctuation and drop stop words.

    Args:
        text: Raw review text.
        stop_words: Optional collection of words to remove. When omitted, the
            NLTK English stop-word list is used (loaded once and cached, since
            this function is applied to every review in the dataset).

    Returns:
        The remaining words joined by single spaces; an empty string if
        nothing survives the filtering.
    """
    if stop_words is None:
        stop_words = _english_stop_words()

    # Lowercase, then delete every character listed in string.punctuation
    cleaned = text.lower().translate(_PUNCTUATION_TABLE)

    # NOTE(review): punctuation is removed *before* stop-word filtering, so a
    # contraction such as "don't" becomes "dont" and is presumably no longer
    # matched by the stop-word list - confirm whether that is intended.
    return ' '.join(word for word in cleaned.split() if word not in stop_words)

# Decode every encoded review to text, then clean it, for both splits
X_train_processed = list(map(preprocess_text, map(decode_review, X_train)))
X_test_processed = list(map(preprocess_text, map(decode_review, X_test)))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, limited to 5000 features
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

# Fit on the training text only, then reuse the fitted vectorizer for the test text
X_train_tfidf = vectorizer.fit_transform(X_train_processed)
X_test_tfidf = vectorizer.transform(X_test_processed)

# Report the dimensions of both feature matrices
print(
    f"Train set shape: {X_train_tfidf.shape}",
    f"Test set shape: {X_test_tfidf.shape}",
    sep="\n",
)


Train set shape: (25000, 5000)
Test set shape: (25000, 5000)


In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Candidate classifiers, all with library-default settings
models = dict([
    ('Logistic Regression', LogisticRegression()),
    ('Naive Bayes', MultinomialNB()),
    ('SVM', SVC()),
])

# Fit each candidate on the TF-IDF training features and score it on the test set
for model_name, model in models.items():
    print(f"\nTraining {model_name}...")
    y_pred = model.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy for {model_name}: {accuracy}")
    # Per-class precision/recall/F1 followed by the raw confusion matrix
    print(
        classification_report(y_test, y_pred),
        confusion_matrix(y_test, y_pred),
        sep="\n",
    )



Training Logistic Regression...
Accuracy for Logistic Regression: 0.88496
              precision    recall  f1-score   support

           0       0.89      0.88      0.88     12500
           1       0.88      0.89      0.89     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000

[[11018  1482]
 [ 1394 11106]]

Training Naive Bayes...
Accuracy for Naive Bayes: 0.85276
              precision    recall  f1-score   support

           0       0.85      0.85      0.85     12500
           1       0.85      0.85      0.85     12500

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000

[[10674  1826]
 [ 1855 10645]]

Training SVM...
Accuracy for SVM: 0.88492
              precision    recall  f1-score   support

           0       0.89      0.88      0.88     1250

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Vectorizer + classifier chained so each CV fold fits TF-IDF on its own training part
pipeline = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(ngram_range=(1, 2), max_features=5000)),
    ('classifier', LogisticRegression()),  # swap in any other estimator here
])

# Search space: keys are "<step name>__<parameter>"
param_grid = dict(
    vectorizer__max_features=[5000, 10000],
    classifier__C=[0.1, 1, 10],  # regularization values for Logistic Regression
)

# 3-fold cross-validated grid search, with n_jobs=-1
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3, n_jobs=-1)
grid_search.fit(X_train_processed, y_train)

# Winning parameter combination and its score
print(
    f"Best Parameters: {grid_search.best_params_}",
    f"Best Score: {grid_search.best_score_}",
    sep="\n",
)


Best Parameters: {'classifier__C': 1, 'vectorizer__max_features': 10000}
Best Score: 0.8871599596405263


In [23]:
# Two hand-written reviews to try the tuned model on
custom_reviews = [
    "I love this movie, it's amazing!",
    "This film was a waste of time.",
]

# Clean them with the same function used on the dataset reviews
custom_reviews_processed = list(map(preprocess_text, custom_reviews))

# Best pipeline found by the grid search
model = grid_search.best_estimator_

# One predicted label per review
predictions = model.predict(custom_reviews_processed)

# Show each review next to its predicted sentiment (label 1 -> Positive)
for review, pred in zip(custom_reviews, predictions):
    if pred == 1:
        sentiment = 'Positive'
    else:
        sentiment = 'Negative'
    print(f"Review: {review}\nPrediction: {sentiment}\n")

Review: I love this movie, it's amazing!
Prediction: Positive

Review: This film was a waste of time.
Prediction: Negative

