In [37]:
import nltk
import numpy as np
import pandas as pd
import re
import warnings
# text preprocessing modules
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# text preprocessing modules
from string import punctuation


In [38]:
warnings.filterwarnings("ignore")
np.random.seed(123)

In [44]:
# Download dependancy
for dependancy in ("brown", "names", "wordnet", "averaged_perceptron_tagger", "universal_tagset", "stopwords"):
    nltk.download(dependancy)

[nltk_data] Downloading package brown to /home/hari/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package names to /home/hari/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package wordnet to /home/hari/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/hari/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/hari/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package stopwords to /home/hari/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [40]:
data = pd.read_csv("../labeledTrainData.tsv", sep="\t")

In [41]:
# Data analysis
(
    data.shape,
    data.head(),
    data.isnull().sum()
)

((25000, 3),
        id  sentiment                                             review
 0  5814_8          1  With all this stuff going down at the moment w...
 1  2381_9          1  \The Classic War of the Worlds\" by Timothy Hi...
 2  7759_3          0  The film starts with a manager (Nicholas Bell)...
 3  3630_4          0  It must be assumed that those who praised this...
 4  9495_8          1  Superbly trashy and wondrously unpretentious 8...,
 id           0
 sentiment    0
 review       0
 dtype: int64)

In [42]:
# Class distribution
data.sentiment.value_counts()

1    12500
0    12500
Name: sentiment, dtype: int64

In [62]:
review_1 = data.loc[0]['review']

In [144]:
# No need
review_1 = "With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally starts is only on for 20 minutes or so excluding the Smooth Criminal sequence and Joe Pesci is convincing as a psychopathic all powerful drug lord. Why he wants MJ dead so bad is beyond me. Because MJ overheard his plans? Nah, Joe Pesci's character ranted that he wanted people to know it is he who is supplying drugs etc so i dunno, maybe he just hates MJ's music.<br /><br />Lots of cool things in this like MJ turning into a car and a robot and the whole Speed Demon sequence. Also, the director must have had the patience of a saint when it came to filming the kiddy Bad sequence as usually directors hate working with one kid let alone a whole bunch of them performing a complex dance scene.<br /><br />Bottom line, this movie is for people who like MJ on one level or another (which i think is most people). If not, then stay away. It does try and give off a wholesome message and ironically MJ's bestest buddy in this movie is a girl! Michael Jackson is truly one of the most talented people ever to grace this planet but is he guilty? Well, with all the attention i've gave this subject....hmmm well i don't know because people can be different behind closed doors, i know this for a fact. He is either an extremely nice but stupid guy or one of the most sickest liars. I hope he is not the latter. 99djs9 https://towardsdatascience.com/how-to-build-and-deploy-an-nlp-model-with-fastapi-part-1-9c1c7030d40"

In [141]:
def text_cleaning(text, remove_stop_words=True, lemmatize_words=True):
    """Clean the text, with option to remove stop_words and to lemmatize words.

    Args:
        text (string): text to b cleaned
        remove_stop_words (bool, optional): _description_. Defaults to True.
        lemmatize_words (bool, optional): _description_. Defaults to True.

    Returns:
        string: cleaned text
    """
    stop_words =  stopwords.words('english')
    # Clean the text
    # text = re.sub(r"<br /><br />", " ", text)
    text = re.sub(r"http\S+", ' link ', text)
    text = re.sub(r"\'s", " ", text) # This wont match since ' is omiited previously
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r'\b\d+(?:\.\d+)?\s+', '', text) # remove numbers
    # Remove punctuation
    text = ''.join(c for c in text if c not in punctuation) # Needed ?
    # Optionally remove stop_words
    if remove_stop_words:
        text = text.split()
        text = [w for w in text if w not in stop_words]
        text = ' '.join(text)
    # Optionally shorten words to there stems
    if lemmatize_words:
        text = text.split()
        lemmatizer = WordNetLemmatizer()
        lemmatized_words = [lemmatizer.lemmatize(word) for word in text]
        text = " ".join(lemmatized_words)
    return text

In [145]:
# No need
text_cleaning(review_1)

'With stuff going moment MJ started listening music watching odd documentary watched The Wiz watched Moonwalker Maybe want get certain insight guy thought really cool eighty maybe make mind whether guilty innocent Moonwalker part biography part feature film remember going see cinema originally released Some subtle message MJ feeling towards press also obvious message drug bad kay br br Visually impressive course Michael Jackson unless remotely like MJ anyway going hate find boring Some may call MJ egotist consenting making movie BUT MJ fan would say made fan true really nice br br The actual feature film bit finally start minute excluding Smooth Criminal sequence Joe Pesci convincing psychopathic powerful drug lord Why want MJ dead bad beyond Because MJ overheard plan Nah Joe Pesci character ranted wanted people know supplying drug etc dunno maybe hate MJ music br br Lots cool thing like MJ turning car robot whole Speed Demon sequence Also director must patience saint came filming kidd

In [152]:
# Cleaning review
data["cleaned_review"] = data["review"].apply(text_cleaning)

In [153]:
data.head()

Unnamed: 0,id,sentiment,review,cleaned_review
0,5814_8,1,With all this stuff going down at the moment w...,With stuff going moment MJ started listening m...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",The Classic War Worlds Timothy Hines entertain...
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,The film start manager Nicholas Bell giving we...
3,3630_4,0,It must be assumed that those who praised this...,It must assumed praised film greatest filmed o...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,Superbly trashy wondrously unpretentious explo...


In [161]:
# Feature-Target split
X = data["cleaned_review"]
y = data.sentiment.values

In [163]:
# Train-Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.15, random_state=42, shuffle=True, stratify=y)

In [177]:
y = np.array([0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1])
count_0, count_1 = 0, 0
for i in y_train:
    if i == 0:
        count_0 += 1
    if i == 1:
        count_1 += 1
count_0, count_1

(10625, 10625)

In [200]:
# Create a classifier in pipeline
sentiment_classifier = Pipeline(
    steps=[
        ("pre_processing", TfidfVectorizer(lowercase=False)),
        ("naive_bayes", MultinomialNB())
    ],
    verbose=True
)

In [201]:
# train the sentiment classifier 
sentiment_classifier.fit(X_train, y_train)

[Pipeline] .... (step 1 of 2) Processing pre_processing, total=   1.9s
[Pipeline] ....... (step 2 of 2) Processing naive_bayes, total=   0.0s


In [202]:
# Test model prediction
y_pred = sentiment_classifier.predict(X_test)
# Accuracy
accuracy_score(y_test, y_pred)

0.8626666666666667

In [205]:
import joblib
joblib.dump(sentiment_classifier, "../models/sentiment_model_pipeline.pkl")

['../models/sentiment_model_pipeline.pkl']

In [229]:
sentiment_classifier.predict([" is better"])

array([0])