In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [2]:
# Fetch the NLTK data packages this notebook asks for. Of these, only the
# 'stopwords' corpus is read by the code visible in this notebook.
download_status = [nltk.download(resource) for resource in ('punkt', 'stopwords', 'wordnet')]
# Keep the status of the final download as the cell's displayed value.
download_status[-1]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
from google.colab import drive

# Absolute locations, so reading the CSV does not depend on the kernel's
# current working directory happening to be the parent of the mount point.
DRIVE_MOUNT_POINT = '/content/drive'
DATASET_PATH = f'{DRIVE_MOUNT_POINT}/My Drive/IMDB Dataset.csv'

drive.mount(DRIVE_MOUNT_POINT)
data = pd.read_csv(DATASET_PATH)

# Encode the labels explicitly as integers (positive -> 1, negative -> 0).
# An explicit map + astype(int) avoids relying on `replace` implicitly
# downcasting an object column (deprecated in newer pandas releases) and
# avoids in-place mutation.
SENTIMENT_TO_LABEL = {'positive': 1, 'negative': 0}
sentiment_labels = data['sentiment'].map(SENTIMENT_TO_LABEL)

# Fail loudly on anything that is not one of the two known labels instead of
# letting stray values flow into model training.
unmapped_rows = sentiment_labels.isna()
if unmapped_rows.any():
    unexpected = sorted(data.loc[unmapped_rows, 'sentiment'].astype(str).unique())
    raise ValueError(f'Unexpected sentiment values: {unexpected}')

data['sentiment'] = sentiment_labels.astype(int)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
_STOP_WORDS = None  # lazily-filled cache of the English stop-word set


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(review):
    """Normalize one raw review into a space-separated string of content words.

    Steps, in order:
      1. Strip HTML markup with BeautifulSoup and keep only the text.
      2. Replace every run of non-letter characters with a single space.
      3. Lower-case the text and split it on whitespace.
      4. Drop every token that is an NLTK English stop word.

    Args:
        review: Raw review text, possibly containing HTML markup.

    Returns:
        The remaining tokens, in their original order, joined by single spaces.
    """
    text = BeautifulSoup(review, 'html.parser').get_text()
    text = re.sub('[^A-Za-z]+', ' ', text).lower()
    stop_words = _english_stop_words()
    # A single pass with set membership replaces the repeated
    # `in` / `list.remove` scans; the surviving tokens and their order are the same.
    return ' '.join(token for token in text.split() if token not in stop_words)


In [7]:
# Clean every raw review once and store the result next to the original text.
data['cleaned_review'] = data['review'].apply(preprocess_text)

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['cleaned_review'],
    data['sentiment'],
    test_size=0.25,
    random_state=42,
)

  clean_text = BeautifulSoup(review, 'html.parser').get_text()


In [13]:
# Vocabulary is capped at 5000 terms. The vectorizer is fitted on the training
# reviews only; the test reviews are merely transformed with it.
tfidf = TfidfVectorizer(
    max_features=5000,
)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [14]:
# Logistic regression with library defaults, trained on the TF-IDF features.
classifier = LogisticRegression()
classifier.fit(X=X_train_tfidf, y=y_train)

In [15]:
# Score the held-out reviews and summarize overall and per-class quality.
y_pred = classifier.predict(X_test_tfidf)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

In [16]:
# Headline accuracy first, then the per-class precision/recall table.
for line in (f'Accuracy: {accuracy}', f'Classification Report:\n{report}'):
    print(line)

Accuracy: 0.89088
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      6157
           1       0.88      0.90      0.89      6343

    accuracy                           0.89     12500
   macro avg       0.89      0.89      0.89     12500
weighted avg       0.89      0.89      0.89     12500



In [19]:
def predict_sentiment(text):
    """Classify one piece of review text with the fitted vectorizer and classifier.

    The text goes through `preprocess_text`, is vectorized with the notebook's
    `tfidf` object, and is then scored by `classifier`.

    Args:
        text: Raw review text.

    Returns:
        'positive' when the predicted label equals 1, otherwise 'negative'.
    """
    features = tfidf.transform([preprocess_text(text)])
    label = classifier.predict(features)[0]
    if label == 1:
        return 'positive'
    return 'negative'

**Testing the model on my reviews**

In [23]:
# Hand-written review with favourable wording, used as a quick sanity check.
new_review = "The magical world of Harry Potter is a mesmerizing journey filled with enchanting characters and thrilling adventures that captivate the imagination."
prediction = predict_sentiment(text=new_review)
print(f'The predicted sentiment is: {prediction}')

The predicted sentiment is: positive


In [24]:
# Hand-written review with critical wording, used as a quick sanity check.
new_review = "Harry Potter's storyline feels overly stretched, with predictable plot twists and an underwhelming conclusion."
prediction = predict_sentiment(text=new_review)
print(f'The predicted sentiment is: {prediction}')

The predicted sentiment is: negative
