In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [3]:
import micropip
await micropip.install("seaborn")
import seaborn as sns

In [4]:
import nltk
from nltk.corpus import stopwords

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [18]:
import re
stop_words = set([
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
    'you', 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself', 'she', 'her', 'hers',
    'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
    'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those',
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
    'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as',
    'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
    'against', 'between', 'into', 'through', 'during', 'before', 'after',
    'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out',
    'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once',
    'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
    'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
    'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too',
    'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now'
])




In [38]:
data = pd.read_csv("movie_reviews.csv", sep='\t', engine='python', on_bad_lines = 'skip')
print(data.shape)
print(data.head())


(1537, 1)
                                    review,sentiment
0  This a fantastic movie of three prisoners who ...
1  This was the worst movie I saw at WorldFest an...
2  Protocol is an implausible movie whose only sa...
3  This was probably the worst movie i have ever ...
4  Oh noes one of these attack of the Japanese gh...


In [39]:
import csv

data = pd.read_csv("movie_reviews.csv", quoting=csv.QUOTE_NONE, on_bad_lines = 'skip', encoding='utf-8')

In [44]:
def clean_text(text):
    if not isinstance(text, str):
        text = ""
    text = re.sub(r'<[^>]*>', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    words = [w for w in text.split() if w not in stop_words]
    return ' '.join(words)

data['clean_review'] = data['review'].apply(clean_text)

In [41]:
vectorizer = TfidfVectorizer(max_features=5000, stop_words=None)

X = vectorizer.fit_transform(data['clean_review'])
y = data['sentiment'].apply(lambda x:1 if str(x).lower() == 'positive' else 0)

print(X.shape)

(45999, 1725)


In [63]:
data = data.dropna(subset=['sentiment', 'clean_review'])

In [64]:
X = data['clean_review']
y = data['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state=42)

In [65]:
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [66]:
model = LogisticRegression(max_iter=5000)
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.5694444444444444

Classification Report:
               precision    recall  f1-score   support

    negative       0.69      0.30      0.42        37
    positive       0.54      0.86      0.66        35

    accuracy                           0.57        72
   macro avg       0.61      0.58      0.54        72
weighted avg       0.61      0.57      0.53        72


Confusion Matrix:
 [[11 26]
 [ 5 30]]


In [67]:
results = pd.DataFrame({
    'review': X_test,
    'actual_sentiment': y_test,
    'predicted_sentiment': y_pred
})

In [68]:
misclassified = results[results['actual_sentiment'] != results['predicted_sentiment']]

In [70]:
misclassified.tail(1)

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,review,actual_sentiment,predicted_sentiment
"""In trying to keep up with the hipness of youthful audiences as the 70s approached",OaCD,YCSF was the product of odder and odder material selected for musicalization. Here it's past life regression,ESP and hypno-therapy... pretty loopy! The real problem with the concept (music or not) are the extraordinarily low dramatic stakes; just where can a movie go,and what can happen,when a man falls in love with a previous incarnation of a girl he can't stand? It can't go any place new,but strangely,it can't even go any place old! Indeed,if it could,audiences would still have no interest in the union of Yves Montand (playing a much older,arrogant,French ass) and Streisand. (a much younger girl). We never become invested in them,their situations or outcomes. Montand is miscast and his strong accent makes many of his lyrics unintelligible.<br /><br />It's all been given a shallow 60s veneer that makes it eminently disposable; despite efforts here and there from Minelli that are respectable. It's not even adapted from a non-musical story that met with any previous success... that's just too passe! Streisand occasionally has some funny business to offer,as when she's trying not to fall asleep on her roof and improvises an energetic dance. But she over-relies on her ingratiating (translation: irritating) kooky,Jewish girl shtick. She can however sing very well,"at both the """"gentle"""" and """"powerhouse"""" ends of the range. Amidst a score of musical dross",she gets 3 or 4 amazing songs* of much higher caliber than anything Fanny or Dolly had to offer. 'He isn't you' is a sweet trifle as sublime as Lorenz Hart's 'My Funny Valentine,' but the movie isn't able to realize any impact from it; because the lyrics don't seem to be referring to anything in the movie,and nothing remotely suggests a great love is blossoming between Chabot and Melinda.<br /><br />The only cut we can view is a poor hatchet job of a much bigger film. Strong research shows a longer,better-explained and more decorative,but not necessarily a better film at: http://barbra-archives.com/films/clear_day_streisand_2.html. You can be sure there's be more Babs in that version but more importantly,there'd be more thoughtful work from Minelli.<br /><br />In the end Montand sends Babs off to sing the title song,after she discovers he's a total dick who feeds her a self-esteem homily to allow himself off the hook. And she takes the bait. So,uh... hooray for that.<br /><br />(*Hurry it's lovely up here,Love with all the trimmings,He isn't you,title song,negative,positive
