In [1]:
# import required libraries
import pandas as pd


In [2]:
# get the sentiment analysis dataset
df_sentiment = pd.read_csv("imdb_labelled.txt", sep="\t", names=["comment", "label"])

In [3]:
# View the first 10 observations. 1 for positivesentiment and 0 for negative sentiment
df_sentiment.head(10)

Unnamed: 0,comment,label
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1
5,"The rest of the movie lacks art, charm, meanin...",0
6,Wasted two hours.,0
7,Saw the movie today and thought it was a good ...,1
8,A bit predictable.,0
9,Loved the casting of Jimmy Buffet as the scien...,1


In [4]:
# view more about the imdb dataset using the describe function
df_sentiment.describe()

Unnamed: 0,label
count,748.0
mean,0.516043
std,0.500077
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [5]:
# more info on the data
df_sentiment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  748 non-null    object
 1   label    748 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 11.8+ KB


In [6]:
# view the data using group by and describe method
df_sentiment.groupby("label").describe()

Unnamed: 0_level_0,comment,comment,comment,comment
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,362,361,Not recommended.,2
1,386,384,10/10,2


In [8]:
# verify length of the messages and also add it as a new feature/column
df_sentiment['length'] = df_sentiment['comment'].apply(len)

In [9]:
df_sentiment.head()

Unnamed: 0,comment,label,length
0,"A very, very, very slow-moving, aimless movie ...",0,87
1,Not sure who was more lost - the flat characte...,0,99
2,Attempting artiness with black & white and cle...,0,188
3,Very little music or anything to speak of.,0,44
4,The best scene in the movie was when Gerardo i...,1,108


In [10]:
# view first
df_sentiment[df_sentiment['length'] > 50]['comment'].iloc[0]

'A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  '

In [11]:
# start text processing with vectorizer
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()


In [12]:
# define a function to remive the stop words from the comments
import string
from nltk.corpus import stopwords
def message_text_process(message):
    # check characters to see if there are ounctuations
    no_punctuation = [char for char in message if char not in string.punctuation]
    
    # now form the sentence
    no_punctuation = ''.join(no_punctuation)
    
    # now eliminate any stopwords
    return [word for word in no_punctuation.split() if word.lower() not in stopwords.words('english')]

In [13]:
# bag of words by applying the function and fitting the data(comment) into it
bag_of_words = CountVectorizer(analyzer=message_text_process).fit(df_sentiment["comment"])

In [14]:
# apply transform method on the bag of words
comment_bagofwords = bag_of_words.transform(df_sentiment["comment"])

In [16]:
# apply tfidf transformer and fit the bag of words into it
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(comment_bagofwords)

In [20]:
# print the transformer shape
comment_tfidf = tfidf_transformer.transform(comment_bagofwords)
print(comment_tfidf.shape)

(748, 3257)


In [21]:
# use naive bayes to detect sentiment and fit the tfidf data onto it
from sklearn.naive_bayes import MultinomialNB
sentiment_detection_model = MultinomialNB().fit(comment_tfidf, df_sentiment['label'])

In [24]:
#check the model for the predicted and expected value for lets say comment 1 and comment 5
comment = df_sentiment['comment'][4]
bag_of_words_for_comment = bag_of_words.transform([comment])
tfidf = tfidf_transformer.transform(bag_of_words_for_comment)

print("Predicted Value: ", sentiment_detection_model.predict(tfidf)[0])
print("Expected Value: ", df_sentiment['label'][4])

Predicted Value:  1
Expected Value:  1
