In [None]:
# Load the IMDb movie reviews dataset and check its structure.

import pandas as pd

# Load the dataset (read over the network from the CSV at `url`)
url = "https://raw.githubusercontent.com/dDua/sample-datasets/main/imdb-reviews.csv"
data = pd.read_csv(url)

# Display the first few rows
print("Dataset Sample:")
print(data.head())

# Summarize columns, dtypes, and non-null counts (which reveals missing values).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("\nDataset Info:")
data.info()

# Columns:
# 'review': The text of the review.
# 'sentiment': The sentiment label (positive or negative).

In [None]:
# Preprocess the text data by cleaning, removing special characters, and converting to lowercase.

import re

def clean_text(text):
    """Normalize a raw review into lowercase, letters-only, single-spaced text.

    Processing steps, in order:
      1. Missing values (``None`` or float NaN) become an empty string; any
         other non-string input is converted with ``str()``.
      2. HTML tags (e.g. ``<br />``) are replaced by a space so the tag name
         is not left behind and glued onto neighbouring words.
      3. Apostrophes are dropped so contractions stay one token
         ("don't" -> "dont").
      4. Every remaining character that is not an ASCII letter or whitespace
         is replaced by a space (not deleted), so words separated only by
         punctuation or digits are not merged ("well-known" -> "well known").
      5. Runs of whitespace are collapsed, the ends are trimmed, and the
         result is lowercased.

    Args:
        text: The raw review text. May be ``None`` or NaN.

    Returns:
        str: The cleaned text; ``""`` for missing input.
    """
    # Treat missing values as empty text instead of crashing inside re.sub.
    # `text != text` is True only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)

    # Only match things that look like real tags (a letter right after "<" or
    # "</") so comparisons such as "3 < 5" are left for the next steps.
    text = re.sub(r"</?[a-zA-Z][^>]*>", " ", text)
    # Remove straight and curly apostrophes without inserting a gap.
    text = re.sub(r"['\u2019]", "", text)
    # Replace any other non-letter, non-whitespace character with a space.
    text = re.sub(r"[^a-zA-Z\s]", " ", text)

    # Collapse whitespace runs / trim, then lowercase.
    return " ".join(text.split()).lower()

# Run every raw review through clean_text and store the result in a new column
data['cleaned_review'] = data['review'].map(clean_text)

# Print the first cleaned review as a quick sanity check
print("\nCleaned Review Example:")
print(data['cleaned_review'].iat[0])

In [None]:
# Split the dataset into training and testing sets.

from sklearn.model_selection import train_test_split

# Features are the cleaned review text; labels are the sentiment column
X, y = data['cleaned_review'], data['sentiment']

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("\nTraining Data Size:", len(X_train))
print("Testing Data Size:", len(X_test))

In [None]:
# Convert the cleaned text into numerical features using TF-IDF vectorization.

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with its vocabulary capped at 5000 features
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit on the training text only, then apply the already-fitted vectorizer to
# the test text so both matrices share the same feature columns
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Report the (rows, features) shape of each matrix
for caption, matrix in (
    ("\nTF-IDF Matrix Shape (Training):", X_train_tfidf),
    ("TF-IDF Matrix Shape (Testing):", X_test_tfidf),
):
    print(caption, matrix.shape)

In [None]:
# Train a Naive Bayes model for sentiment classification.

from sklearn.naive_bayes import MultinomialNB

# Create the Multinomial Naive Bayes classifier and fit it on the TF-IDF
# training features (fit() returns the fitted estimator itself)
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Mean accuracy on the training split, then on the held-out test split
train_accuracy, test_accuracy = (
    nb_model.score(features, labels)
    for features, labels in ((X_train_tfidf, y_train), (X_test_tfidf, y_test))
)

print("\nTraining Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)

In [None]:
# Evaluate the model performance using confusion matrix and classification report.

from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Make predictions on the test set
y_pred = nb_model.predict(X_test_tfidf)

# Use the fitted model's class order for BOTH the matrix and the axis tick
# labels. Without an explicit `labels=` argument, confusion_matrix infers the
# classes from the values present in y_test / y_pred, which is not guaranteed
# to match a hard-coded ['Negative', 'Positive'] list in order or in size.
class_labels = nb_model.classes_
tick_labels = [str(label).capitalize() for label in class_labels]

# Confusion matrix: rows are actual classes, columns are predicted classes
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=tick_labels,
    yticklabels=tick_labels,
    ax=ax,
)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

# Classification report (per-class precision, recall, F1, and support)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Test the model on new reviews.

# Hand-written reviews used to spot-check the trained model
new_reviews = [
    "This movie was absolutely wonderful and heartwarming.",
    "I hated this film. It was a complete waste of time.",
    "The storyline was decent, but the acting could have been better.",
]

# Apply the same cleaning and the already-fitted TF-IDF vectorizer that the
# training data went through, so the features line up with the model
new_reviews_cleaned = list(map(clean_text, new_reviews))
new_reviews_tfidf = tfidf_vectorizer.transform(new_reviews_cleaned)

# One predicted sentiment label per review, in input order
predictions = nb_model.predict(new_reviews_tfidf)

# Show each original (uncleaned) review next to its predicted label
for review, sentiment in zip(new_reviews, predictions):
    print(f"Review: {review}")
    print(f"Predicted Sentiment: {sentiment}\n")