In [18]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load dataset
df = pd.read_csv('IMDB Dataset.csv')

In [3]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [5]:
df.info

<bound method DataFrame.info of                                                   review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]>

In [7]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [24]:
# Split the dataset into training and testing sets
train_data, test_data, train_labels, test_labels = train_test_split(df['review'], df['sentiment'], test_size=0.3, random_state=0)

In [25]:
# Convert text data to numerical data using CountVectorizer
vectorizer = CountVectorizer()
train_data = vectorizer.fit_transform(train_data)
test_data = vectorizer.transform(test_data)

In [26]:
# Initialize and train a Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(train_data, train_labels)

In [27]:
# Make predictions on the test set
predictions = classifier.predict(test_data)

In [28]:
# Evaluate the model
accuracy = accuracy_score(test_labels, predictions)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.85


In [29]:
# Display classification report
print("Classification Report:\n", classification_report(test_labels, predictions))

Classification Report:
               precision    recall  f1-score   support

    negative       0.83      0.88      0.85      7540
    positive       0.87      0.82      0.84      7460

    accuracy                           0.85     15000
   macro avg       0.85      0.85      0.85     15000
weighted avg       0.85      0.85      0.85     15000



In [30]:
# Initialize and train the final model on the entire dataset
final_classifier = MultinomialNB()
final_classifier.fit(train_data, train_labels)

In [31]:
# Function to predict sentiment for a new review
def predict_sentiment(new_review):
    # Convert the new review to numerical data using the same vectorizer
    new_review_vectorized = vectorizer.transform([new_review])
    
    # Make prediction using the final model
    prediction = final_classifier.predict(new_review_vectorized)
    
    # Return the predicted sentiment
    return prediction[0]

In [36]:
# Example 1
new_review = "This movie was amazing, loved every moment of it!"
predicted_sentiment1 = predict_sentiment(new_review)

In [37]:
predicted_sentiment

'positive'

In [38]:
# Example 2
new_review = "This movie was bad, but I loved every moment of it!"
predicted_sentiment2 = predict_sentiment(new_review)

In [50]:
predicted_sentiment2

'negative'