In [44]:
# Ignore warnings :
# NOTE(review): with no category or module argument this filter silences
# every warning for the rest of the session, deprecation notices included.
import warnings
warnings.filterwarnings('ignore')

# Importing helpful packages to load and handle our data
import pandas as pd
import numpy as np
import nltk

# Stop words (is, am, are, this, a, an, the, ...) are treated as noise in the text.
# Besides the stop-word list, this notebook also calls nltk.sent_tokenize /
# nltk.word_tokenize (Punkt sentence models) and WordNetLemmatizer (WordNet
# corpus), so those resources are fetched too; without them a fresh environment
# stops with a LookupError in the later cells.
# 'punkt_tab' is the resource name newer NLTK releases look up for Punkt.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(nltk_resource)
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

#  Transforms text to feature vectors that can be used as input to estimator.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pansh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [46]:
# Load the tab-separated data set. Explicit column names are supplied, so the
# first field becomes 'review' and the second 'text'; then preview 5 random rows.
data = pd.read_csv(
    r'sentiment_classification.txt',
    sep='\t',
    names=['review', 'text'],
)
data.sample(5)

Unnamed: 0,review,text
4615,0,Da Vinci Code sucks.
6651,0,My dad's being stupid about brokeback mountain...
2458,1,I love Harry Potter.
6009,0,EVILPINKMUNKY IS GAY AND BROKEBACK MOUNTAIN SU...
743,1,The Da Vinci Code was absolutely AWESOME!


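Here, a label of 0 means negative sentiment and 1 means positive sentiment (the label is stored in the `review` column; the sentence itself is in `text`).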

In [47]:
# Dimensions of the data set as (number of rows, number of columns)
data.shape

(6918, 2)

In [48]:
# English stop words from NLTK, kept as a set under the original name.
stopset = set(stopwords.words('english'))
# TF-IDF vectorizer: lower-cases the text, strips accents via ASCII mapping and
# drops the stop words. scikit-learn validates `stop_words` as 'english', a
# list, or None, so the set is handed over as a list (sorted only to make the
# argument deterministic; the vectorizer does not depend on its order).
vectorize = TfidfVectorizer(
    use_idf=True,
    lowercase=True,
    strip_accents='ascii',
    stop_words=sorted(stopset),
)

In [49]:
# Target vector: the dependent variable is the 0/1 label held in column 'review'
y = data.review
y.shape

(6918,)

In [50]:
# Convert data['text'] to TF-IDF features: learn the vocabulary and idf weights,
# then return the term-document matrix (one row per sentence).
# NOTE(review): the vectorizer is fitted on the full data set before the
# train/test split in the next cell, so the idf statistics also see the
# held-out rows; fit on the training part only if a leak-free score is needed.
X = vectorize.fit_transform(data.text)
X.shape

(6918, 2011)

In [51]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=23,
)

In [52]:
# Multinomial Naive Bayes classifier, trained on the TF-IDF training features
clf = naive_bayes.MultinomialNB()
clf.fit(X=X_train, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [53]:
# ROC AUC on the held-out set (this is not accuracy): scored from column 1 of
# predict_proba, i.e. the predicted probability of the positive class (label 1)
roc_auc_score(y_test,clf.predict_proba(X_test)[:,1])

0.998825847956749

In [54]:
# Read the whole test data set file into one string.
# (A bare `test_data;` statement used to precede the read; the name is not
# bound yet on a fresh kernel, so it raised NameError under Restart & Run All.)
with open(r'sentiment_test.txt', encoding="utf8") as file:
    test_data = file.read()

In [55]:
# Split the raw test text into a list of individual sentences
sentences = nltk.sent_tokenize(test_data)
# print(sentences)

In [56]:
# Initialize the WordNet lemmatizer used to reduce each token to its lemma
lemmatizer = WordNetLemmatizer()

In [57]:
# List that will hold the cleaned sentences (stop words removed, tokens
# lemmatized), one space-joined string per input sentence
new_sentence = []

In [58]:
# Remove stop words and lemmatize the remaining tokens, sentence by sentence.
# - The stop-word list is loaded once into a set: it is loop-invariant, and the
#   previous code rebuilt the list and scanned it linearly for every token.
# - new_sentence is rebuilt from scratch here, so re-running this cell no
#   longer appends a second copy of every sentence.
# - Matching stays case-sensitive, exactly as before (e.g. a capital "I" is kept).
english_stopwords = set(stopwords.words('english'))
new_sentence = []
for sentence in sentences:
    words = [
        lemmatizer.lemmatize(word)
        for word in nltk.word_tokenize(sentence)
        if word not in english_stopwords
    ]
    new_sentence.append(' '.join(words))

In [59]:
# Inspect one cleaned sentence (fixed index 10, not a random pick) whose sentiment is checked next
print(new_sentence[10])

I think Angelina Jolie much beautiful Jennifer Anniston , , way , majorly OVERRATED .


In [60]:
# Map the cleaned sentences onto the vocabulary the vectorizer already learned
# (transform only, no refit)
test_data_vector = vectorize.transform(new_sentence)

In [61]:
# Row 10 corresponds to the sentence printed earlier, which reads as negative,
# so the expected prediction is 0
print(clf.predict(test_data_vector[10]))

[0]
