In [57]:
!pip install pandas scikit-learn nltk joblib



In [58]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import joblib

In [59]:
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [60]:
data = pd.read_csv('/content/drive/MyDrive/dataset.csv')

In [61]:
# Text cleaning and preprocessing
def preprocess_text(text):
    # Tokenization
    words = word_tokenize(text)
    # Remove stop words and apply stemming
    words = [PorterStemmer().stem(word.lower()) for word in words if word.lower() not in stop_words]
    return ' '.join(words)

data['review'] = data['review'].apply(preprocess_text)  # Assuming 'review' is the column with movie reviews


In [62]:
train_data, test_data, train_labels, test_labels = train_test_split(
    data['review'], data['sentiment'], test_size=0.2, random_state=42
)

In [63]:
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(train_data)
test_vectors = vectorizer.transform(test_data)

In [64]:
model = LogisticRegression()
model.fit(train_vectors, train_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [65]:
joblib.dump(model, '/content/drive/MyDrive/sentiment_model.joblib')

['/content/drive/MyDrive/sentiment_model.joblib']

In [66]:
predictions = model.predict(test_vectors)
accuracy = accuracy_score(test_labels, predictions)
print(f'Accuracy: {accuracy:.2f}')
print('\nClassification Report:\n', classification_report(test_labels, predictions))

Accuracy: 0.89

Classification Report:
               precision    recall  f1-score   support

    negative       0.90      0.87      0.89      4961
    positive       0.88      0.91      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [None]:
ser_input = input("Enter a movie review: ")
processed_input = preprocess_text(user_input)
input_vector = vectorizer.transform([processed_input])
prediction = model.predict(input_vector)[0]
print(f'The sentiment of the review is: {prediction}')