In [4]:
#import necessary libraries
import pandas as pd

In [7]:
#load the tab-separated sentiment file into a two-column frame (comment text, 0/1 label)
#NOTE(review): skiprows=1 discards the first line of the file - confirm it is a header row and not a review
df_sentiments = pd.read_csv(
    'imdb_labelled.csv',
    sep='\t',
    skiprows=1,
    names=['comment', 'label'],
)

In [9]:
#view the first 10 observations
df_sentiments.head(10)

Unnamed: 0,comment,label
0,Not sure who was more lost - the flat characte...,0
1,Attempting artiness with black & white and cle...,0
2,Very little music or anything to speak of.,0
3,The best scene in the movie was when Gerardo i...,1
4,"The rest of the movie lacks art, charm, meanin...",0
5,Wasted two hours.,0
6,Saw the movie today and thought it was a good ...,1
7,A bit predictable.,0
8,Loved the casting of Jimmy Buffet as the scien...,1
9,And those baby owls were adorable.,1


In [10]:
#summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df_sentiments.describe()

Unnamed: 0,label
count,999.0
mean,0.500501
std,0.50025
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [11]:
#column dtypes, non-null counts and memory usage of the frame
df_sentiments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 2 columns):
comment    999 non-null object
label      999 non-null int64
dtypes: int64(1), object(1)
memory usage: 15.7+ KB


In [12]:
#per-label summary of the comment text: count, number of unique values, most frequent value and its frequency
df_sentiments.groupby('label').describe()


Unnamed: 0_level_0,comment,comment,comment,comment
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,499,498,Not recommended.,2
1,500,498,Definitely worth checking out.,2


In [13]:
#add the character count of every comment as a new column (feature)
#the vectorised .str accessor replaces the per-row Python call of apply(len)
#and gives NaN instead of raising if a comment is ever missing
df_sentiments['length']=df_sentiments['comment'].str.len()


In [14]:
#view the first 5 comments together with their new length column
df_sentiments.head()

Unnamed: 0,comment,label,length
0,Not sure who was more lost - the flat characte...,0,97
1,Attempting artiness with black & white and cle...,0,186
2,Very little music or anything to speak of.,0,42
3,The best scene in the movie was when Gerardo i...,1,106
4,"The rest of the movie lacks art, charm, meanin...",0,112


In [15]:
#view the full text of the first comment that is longer than 50 characters
df_sentiments.loc[df_sentiments['length'] > 50, 'comment'].iloc[0]

'Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.'

In [25]:
#start processing with vectorizer
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

In [26]:
#define a function to get rid of stopwords present in the messages
def message_text_process(mess, stop_words=None):
    """Strip punctuation from a message and drop its stopwords.

    Parameters
    ----------
    mess : str
        Raw message text.
    stop_words : iterable of str, optional
        Lower-case words to discard. When omitted, NLTK's English list
        (``stopwords.words('english')``) is used.

    Returns
    -------
    list of str
        The remaining whitespace-separated tokens, in their original order
        and original casing (only the stopword comparison is lower-cased).
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup once per call: the word list used to be re-evaluated
    # and scanned linearly for every single word of the message.
    stop_words = frozenset(stop_words)

    # Every character listed in string.punctuation is removed outright, so
    # "don't" becomes "dont" rather than being split into two tokens.
    no_punctuation = ''.join(char for char in mess if char not in string.punctuation)
    return [word for word in no_punctuation.split() if word.lower() not in stop_words]

#preview the tokenizer on the first 5 comments
df_sentiments['comment'].head(5).apply(message_text_process)

0    [sure, lost, flat, characters, audience, nearl...
1    [Attempting, artiness, black, white, clever, c...
2                     [little, music, anything, speak]
3    [best, scene, movie, Gerardo, trying, find, so...
4    [rest, movie, lacks, art, charm, meaning, empt...
Name: comment, dtype: object

In [30]:
#build the bag-of-words vocabulary: each comment is tokenised by message_text_process
#during fitting, then the number of distinct tokens learned is printed
bag_of_words_transformer = CountVectorizer(analyzer=message_text_process)
bag_of_words_transformer.fit(df_sentiments['comment'])
print(len(bag_of_words_transformer.vocabulary_))

3250


In [35]:
#turn every comment into a row of token counts using the fitted vocabulary
message_bagofwords=bag_of_words_transformer.transform(df_sentiments['comment'])

In [39]:
#fit a tfidf transformer on the bag-of-words count matrix
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(X=message_bagofwords)

In [40]:
#re-weight the bag-of-words counts with the fitted tfidf transformer and print the resulting (rows, columns) shape
message_tfidf=tfidf_transformer.transform(message_bagofwords)
print(message_tfidf.shape)

(999, 3250)


In [43]:
#train a multinomial Naive Bayes classifier on the tfidf features against the sentiment labels
from sklearn.naive_bayes import MultinomialNB
sentiment_detect_model = MultinomialNB().fit(
    X=message_tfidf,
    y=df_sentiments['label'],
)

In [48]:
#check the model's predicted label against the stored label for comment#1 and comment#5
#NOTE: both rows belong to the data the model was fitted on, so matching labels
#here are only a smoke test, not a measure of accuracy on unseen comments
def predict_comment_sentiment(text):
    """Return the model's prediction array for one raw comment string.

    The text passes through the fitted stages in order:
    bag-of-words counts -> tfidf weights -> classifier ``predict``.
    The single comment is wrapped in a list because the vectorizer is
    given a collection of documents; the result therefore holds exactly
    one predicted label.
    """
    text_bagofwords=bag_of_words_transformer.transform([text])
    text_tfidf=tfidf_transformer.transform(text_bagofwords)
    return sentiment_detect_model.predict(text_tfidf)

comment=df_sentiments['comment'][0]
print('Predicted of comment 1:',predict_comment_sentiment(comment) )
print('Expected of comment 1:',df_sentiments['label'][0])

comment=df_sentiments['comment'][4]
print('Predicted of comment 5:',predict_comment_sentiment(comment) )
print('Expected  of comment 5:',df_sentiments['label'][4])

Predicted of comment 1: [0]
Expected of comment 1: 0
Predicted of comment 5: [0]
Expected  of comment 5: 0
