# Sentiment Analysis with scikit-learn

In [96]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import re
from nltk.stem.porter import PorterStemmer
import nltk 
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pickle
from sklearn.linear_model import LogisticRegressionCV

print('Modules are imported')


Modules are imported


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Yash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Loading the dataset


In [83]:
#df
movie_data = pd.read_csv("movie_data.csv")
movie_data.head()


Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


## Transforming Documents into Feature Vectors


In [60]:
count = CountVectorizer()
docs = np.array([movie_data["review"][1]])

bag = count.fit_transform(docs)

In [61]:
print((count.vocabulary_))

{'ok': 83, 'so': 103, 'really': 94, 'like': 62, 'kris': 59, 'kristofferson': 60, 'and': 5, 'his': 48, 'usual': 125, 'easy': 27, 'going': 38, 'delivery': 21, 'of': 82, 'lines': 63, 'in': 53, 'movies': 76, 'age': 3, 'has': 44, 'helped': 46, 'him': 47, 'with': 136, 'soft': 104, 'spoken': 107, 'low': 66, 'energy': 31, 'style': 110, 'he': 45, 'will': 135, 'steal': 108, 'scene': 99, 'effortlessly': 28, 'but': 12, 'disappearance': 23, 'is': 56, 'misstep': 71, 'holy': 50, 'moly': 72, 'this': 118, 'was': 129, 'bad': 7, 'movie': 75, 'br': 9, 'must': 77, 'give': 37, 'kudos': 61, 'to': 121, 'the': 116, 'cinematography': 16, 'actors': 1, 'including': 54, 'for': 34, 'trying': 122, 'their': 117, 'darndest': 20, 'make': 68, 'sense': 102, 'from': 35, 'goofy': 41, 'confusing': 19, 'story': 109, 'none': 80, 'it': 57, 'made': 67, 'probably': 92, 'didn': 22, 'understand': 123, 'either': 29, 'just': 58, 'through': 119, 'motions': 74, 'hoping': 51, 'someone': 106, 'would': 137, 'come': 17, 'up': 124, 'tell':

In [62]:
print(bag.toarray())

[[1 1 1 1 1 9 2 1 1 6 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 5 1
  1 1 2 1 1 1 1 2 1 2 1 3 4 1 1 1 1 1 1 1 2 4 1 3 1 1 1 1 1 1 2 1 1 2 1 1
  1 1 1 3 2 1 3 1 2 1 8 1 2 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1
  1 1 1 1 1 1 1 3 7 1 7 2 2 3 1 1 2 1 1 1 2 4 3 1 1 1 1 1 1 1 1 1 1 1]]


## Calculate the TF-IDF

In [63]:
tfidf = TfidfTransformer(use_idf = True,
                         norm = 'l2',
                         smooth_idf= True)
np.set_printoptions(precision=2)
print(tfidf.fit_transform(bag).toarray())

[[0.04 0.04 0.04 0.04 0.04 0.37 0.08 0.04 0.04 0.25 0.04 0.04 0.04 0.04
  0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.08 0.04 0.04
  0.04 0.04 0.04 0.04 0.04 0.04 0.2  0.04 0.04 0.04 0.08 0.04 0.04 0.04
  0.04 0.08 0.04 0.08 0.04 0.12 0.16 0.04 0.04 0.04 0.04 0.04 0.04 0.04
  0.08 0.16 0.04 0.12 0.04 0.04 0.04 0.04 0.04 0.04 0.08 0.04 0.04 0.08
  0.04 0.04 0.04 0.04 0.04 0.12 0.08 0.04 0.12 0.04 0.08 0.04 0.33 0.04
  0.08 0.04 0.12 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04
  0.04 0.04 0.04 0.04 0.08 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04
  0.04 0.04 0.04 0.12 0.29 0.04 0.29 0.08 0.08 0.12 0.04 0.04 0.08 0.04
  0.04 0.04 0.08 0.16 0.12 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04
  0.04 0.04]]


## Data Preparation

In [65]:
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticon = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', " ", text.lower()) +\
    " ".join(emoticon).replace('-', '')
    return text

In [66]:
preprocessor(movie_data["review"][1])

'ok so i really like kris kristofferson and his usual easy going delivery of lines in his movies age has helped him with his soft spoken low energy style and he will steal a scene effortlessly but disappearance is his misstep holy moly this was a bad movie i must give kudos to the cinematography and and the actors including kris for trying their darndest to make sense from this goofy confusing story none of it made sense and kris probably didn t understand it either and he was just going through the motions hoping someone would come up to him and tell him what it was all about i don t care that everyone on this movie was doing out of love for the project or some such nonsense i ve seen low budget movies that had a plot for goodness sake this had none zilcho nada zippo empty of reason a complete waste of good talent scenery and celluloid i rented this piece of garbage for a buck and i want my money back i want my 2 hours back i invested on this grade f waste of my time don t watch this 

In [67]:
preprocessor("blah blah :-) :( :-| blah </p>")

'blah blah blah :) :('

In [68]:
movie_data = movie_data["review"].apply(preprocessor)

In [69]:
## okenisation of textt

In [70]:
porter = PorterStemmer()

In [71]:
def tokenizer(text):
    return text.split()


In [72]:
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [78]:
tokenizer("I see seashells at a sea which has a shape c")

['I', 'see', 'seashells', 'at', 'a', 'sea', 'which', 'has', 'a', 'shape', 'c']

In [77]:
tokenizer_porter("I see seashells at a sea which has a shape c")

['I', 'see', 'seashel', 'at', 'a', 'sea', 'which', 'ha', 'a', 'shape', 'c']

In [79]:
stop = stopwords.words('english')
[w for w in tokenizer_porter("I see seashells at a sea which has a shape c")if w not in stop]

['I', 'see', 'seashel', 'sea', 'ha', 'shape', 'c']

## Transform text into data

In [107]:
tfidf = TfidfVectorizer(strip_accents= None,
                       lowercase= None,
                       tokenizer=tokenizer_porter,
                       use_idf=True,
                       norm = 'l2',
                       smooth_idf=True)
y = movie_data["sentiment"].values
x = tfidf.fit_transform(movie_data["review"])

In [108]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 0, test_size = 0.5, shuffle = 0)

In [111]:
clf = LogisticRegressionCV(cv = 5,
                          scoring = 'accuracy',
                          random_state = 0,
                          n_jobs = -1,
                          verbose = 3, max_iter = 300).fit(x_train, y_train)



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  5.6min remaining:  8.4min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  6.0min finished


In [117]:
saved_model = open('saved.sav', 'wb')
pickle.dump(clf, saved_model)
saved_model.close()

## Model Evaluation

In [118]:
filename = 'saved_model.sav'
saved_clf = pickle.load(open(filename, 'rb'))

In [120]:
acc = saved_clf.score(x_test, y_test)
acc

0.89208

In [123]:
print("Accuracy of the model is ", acc * 100, "%", sep="")

Accuracy of the model is 89.208%
