In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score

#Small hand-written dataset, kept deliberately tiny.
#Each entry pairs a review text with its sentiment label so the two stay aligned.
labelled_reviews = [
    ("I loved the movie", 'positive'),
    ("Horrible acting", 'negative'),
    ("What a great film", 'positive'),
    ("Worst movie ever", 'negative'),
    ("Really enjoyed it", 'positive'),
    ("It was terrible", 'negative'),
    ("Fantastic performance", 'positive'),
    ("Not good at all", 'negative'),
    ("It was entertaining", 'positive'),
    ("I was dissapointed", 'negative'),
    ("I was excited while watching", 'positive'),
    ("It was a dreadful experience", 'negative'),
    ("The movie was very engaging", 'positive'),
    ("The film was unorigional", 'negative'),
    ("It was really thought provoking", 'positive'),
    ("The acting felt stiff", 'negative'),
    ("The movie was thrilling", 'positive'),
    ("The CGI was really bad", 'negative'),
    ("I really enjoyed the action", 'positive'),
    ("The dialog was very boring", 'negative'),
    ("I felt engaged the whole time", 'positive'),
    ("It put me to sleep", 'negative'),
    ("The ending was really good", 'positive'),
    ("The movie aged poorly", 'negative'),
    ("The comedy was great", 'positive'),
    ("The acting was bad", 'negative'),
    ("It kept me on my toes", 'positive'),
    ("The worst performance i have ever seen", 'negative'),
    ("I had quite the laugh", 'positive'),
    ("It was really boring", 'negative'),
]

#Unzip the pairs into the two parallel columns the DataFrame is built from.
data = {
    'review': [text for text, _ in labelled_reviews],
    'label': [sentiment for _, sentiment in labelled_reviews],
}

df = pd.DataFrame(data)

#Fixed seed so the train/test split (and every score derived from it) is
#reproducible across Restart & Run All.
RANDOM_SEED = 42

#Cleaning data: lower-casing for uniformity and trimming surrounding whitespace.
df['review'] = df['review'].str.lower().str.strip()
y = df['label']

#Splitting the raw text BEFORE vectorizing, so the vocabulary is learned from the
#training reviews only and nothing from the test reviews leaks into the features.
#stratify=y keeps the positive/negative ratio the same in both splits, which
#matters with a test set this small.
train_text, test_text, y_train, y_test = train_test_split(
    df['review'], y, test_size=0.2, random_state=RANDOM_SEED, stratify=y
)

#Converting text to a numerical format (bag-of-words counts).
#fit_transform on the training text only; the test text is merely transformed,
#so words that appear only in the test reviews are ignored.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)

#Full corpus encoded with the training vocabulary; kept so the name X stays available.
X = vectorizer.transform(df['review'])

#Model training.
model = MultinomialNB()
model.fit(X_train, y_train)

In [None]:
#A handful of sample reviews used to see how the model's predictions vary.
my_reviews = [
    "Worst movie ever",
    "Really enjoyed it",
    "I loved the movie",
    "It was terrible",
    "I loved the movie",
]

#Encode the samples with the already-fitted vectorizer, then classify them.
review_vector = vectorizer.transform(my_reviews)
sample_predictions = model.predict(review_vector)
print("Prediction:", sample_predictions)

Prediction: ['negative' 'positive' 'positive' 'positive' 'positive']


In [None]:
#Ground-truth labels for the sample reviews in my_reviews, in the same order.
y_true = [
    'negative', 'positive', 'positive', 'negative', 'positive'
    ]

#Predict once and reuse the result for both metrics.
sample_pred = model.predict(review_vector)

#Precision: of the reviews the model predicted as "positive", the fraction that
#are truly positive (TP / (TP + FP)).
#scikit-learn expects the true labels first and the predictions second; passing
#them the other way round silently exchanges precision and recall.
print("Precision:", precision_score(y_true, sample_pred, pos_label='positive'))

#Recall: of the reviews that are truly "positive", the fraction the model
#actually predicted as positive (TP / (TP + FN)).
print("Recall:", recall_score(y_true, sample_pred, pos_label='positive'))

Precision: 1.0
Recall: 0.75


In [None]:
#Overall accuracy of the model, measured on the held-out test split.
#A larger dataset would be expected to improve this score.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Acuracy:", test_accuracy)

Acuracy: 0.5
