<a href="https://colab.research.google.com/github/jblaszka/ML/blob/main/Movie_reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [44]:
import numpy as np
import pandas as pd
import plotly.express as px
import sklearn

# Seed NumPy's global random generator so NumPy-driven randomness repeats across runs.
np.random.seed(42)

# Array display settings; the float formatter renders every float with two decimals.
np.set_printoptions(
    precision=6,
    suppress=True,
    edgeitems=10,
    linewidth=1000,
    formatter={'float': '{:.2f}'.format},
)

# Echo the scikit-learn version the notebook was run with.
sklearn.__version__

'0.22.2.post1'

In [45]:
import nltk

# Fetch the `movie_reviews` corpus through NLTK's downloader; the call returns True on
# success and is a no-op when the package is already up to date.
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [46]:
from sklearn.datasets import load_files

from pathlib import Path

# `nltk.download` stores the corpus inside NLTK's data directory, not in the working
# directory, so a bare relative path only works when a local copy happens to exist.
# Prefer a local `movie_reviews/` folder when present (keeps the previous behaviour),
# otherwise ask NLTK where it put the downloaded corpus.
corpus_dir = Path('movie_reviews')
if not corpus_dir.is_dir():
    corpus_dir = Path(str(nltk.data.find('corpora/movie_reviews')))

# `load_files` treats each sub-folder of the corpus directory as one class.
raw_movie = load_files(str(corpus_dir))

# Work on a (shallow) copy so `raw_movie` keeps referring to the loaded bunch.
movie = raw_movie.copy()
movie.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [47]:
from sklearn.model_selection import train_test_split
# Hold out part of the corpus for evaluation; the fixed seed keeps the split reproducible.
split = train_test_split(movie['data'], movie['target'], random_state=42)
X_train, X_test, y_train, y_test = split

# Report how many documents ended up in each part.
for name, part in (('X_train', X_train), ('X_test', X_test)):
    print(f'{name} : {len(part)}')

X_train : 1500
X_test : 500


In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Cap the vocabulary at 3000 features.
tfidf = TfidfVectorizer(max_features=3000)

# The vectorizer is fitted on the training documents only; the test documents are
# merely transformed with it.
# NOTE: X_train / X_test are rebound here from raw documents to TF-IDF matrices, so
# re-run the split cell before re-running this one.
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)

print('X_train shape : {}'.format(X_train.shape))
print('X_test shape: {}'.format(X_test.shape))

X_train shape : (1500, 3000)
X_test shape: (500, 3000)


In [49]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes model on the TF-IDF training features
# (`fit` returns the estimator, so construction and training can be chained).
classifier = MultinomialNB().fit(X_train, y_train)

# Score the fitted model on the held-out test set.
classifier.score(X_test, y_test)

0.804

In [50]:
from sklearn.metrics import confusion_matrix
# Predicted labels for the held-out documents.
y_pred = classifier.predict(X_test)

# Cross-tabulate true labels against predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[204,  36],
       [ 62, 198]])

In [51]:
import plotly.figure_factory as ff

def plot_confusion_matrix(cm, labels=('negative', 'positive')):
    """Show a confusion matrix as an annotated Plotly heatmap.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix with one row and one column per class, both in the
        same class order. The axis titles assume the layout produced by
        ``sklearn.metrics.confusion_matrix(y_true, y_pred)``: rows are true
        classes, columns are predicted classes.
    labels : sequence of str, optional
        Display name of each class, in the row/column order of ``cm``.
        Defaults to ``('negative', 'positive')``.

    Raises
    ------
    ValueError
        If ``cm`` is not a square 2-D matrix with one row/column per label.
    """
    cm = np.asarray(cm)
    labels = list(labels)
    if cm.ndim != 2 or cm.shape != (len(labels), len(labels)):
        raise ValueError(
            f'cm must have shape ({len(labels)}, {len(labels)}) to match labels, '
            f'got {cm.shape}'
        )

    # Reverse the row order together with the row labels before plotting —
    # presumably because the heatmap draws its first row at the bottom.
    frame = pd.DataFrame(cm[::-1], columns=labels, index=labels[::-1])

    fig = ff.create_annotated_heatmap(z=frame.values, x=list(frame.columns), y=list(frame.index),
                                      colorscale='ice', showscale=True, reversescale=True)
    # Title the axes so it is clear which one holds predictions and which the truth.
    fig.update_layout(width=400, height=400, title='Confusion Matrix', font_size=16,
                      xaxis_title='Predicted label', yaxis_title='True label')
    fig.show()

# Draw the heatmap for the confusion matrix computed on the test set.
plot_confusion_matrix(cm)

In [52]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 for the test-set predictions.
report = classification_report(y_test, y_pred, target_names=['negative', 'positive'])
print(report)

              precision    recall  f1-score   support

    negative       0.77      0.85      0.81       240
    positive       0.85      0.76      0.80       260

    accuracy                           0.80       500
   macro avg       0.81      0.81      0.80       500
weighted avg       0.81      0.80      0.80       500



In [53]:
# A few short, hand-written reviews to try the trained model on.
new_reviews = [
    'It was so good! Amazing soundtrack',
    'Very long. Don\'t waste your time.',
    'Horrible movie',
    'Amazing storyline',
]

# Reuse the already-fitted vectorizer (transform only, no refit) so the new
# reviews are encoded with the same vocabulary as the training data.
new_reviews_tfidf = tfidf.transform(new_reviews)

In [54]:
# Predicted class index for each new review.
new_reviews_pred = classifier.predict(new_reviews_tfidf)
new_reviews_pred

array([1, 0, 0, 1])

In [55]:
# Class probabilities for each new review: one row per review, one column per class.
new_reviews_prob = classifier.predict_proba(new_reviews_tfidf)
new_reviews_prob

array([[0.45, 0.55],
       [0.71, 0.29],
       [0.64, 0.36],
       [0.47, 0.53]])

In [56]:
# Index of the most probable class per review.
new_reviews_prob.argmax(axis=1)

array([1, 0, 0, 1])

In [57]:
# Print each review with its predicted class name and the probability of that class.
label_names = movie['target_names']
for text, label, proba in zip(new_reviews, new_reviews_pred, new_reviews_prob):
    print(f"{text} -> {label_names[label]} -> {proba[label]:.4f}")

It was so good! Amazing soundtrack -> pos -> 0.5531
Very long. Don't waste your time. -> neg -> 0.7067
Horrible movie -> neg -> 0.6425
Amazing storyline -> pos -> 0.5348
