In [130]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix,classification_report
from bs4 import BeautifulSoup
import re
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction import stop_words
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import stem


In [51]:
df = pd.read_csv("IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [52]:
df.shape

(50000, 2)

In [68]:
# pre-processing of text
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

#Removing the unwanted text
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    text = remove_special_characters(text)
    return text
#Apply function on review column
df['review']=df['review'].apply(denoise_text)
df['review']

0        One of the other reviewers has mentioned that ...
1        A wonderful little production The filming tech...
2        I thought this was a wonderful way to spend ti...
3        Basically theres a family where a little boy J...
4        Petter Matteis Love in the Time of Money is a ...
5        Probably my alltime favorite movie a story of ...
6        I sure would like to see a resurrection of a u...
7        This show was an amazing fresh  innovative ide...
8        Encouraged by the positive comments about this...
9        If you like original gut wrenching laughter yo...
10       Phil the Alien is one of those quirky films wh...
11       I saw this movie when I was about 12 when it c...
12       So im not a big fan of Bolls work but then aga...
13       The cast played ShakespeareShakespeare lostI a...
14       This a fantastic movie of three prisoners who ...
15       Kind of drawn in by the erotic scenes only to ...
16       Some films just simply should not be remade Th.

In [95]:
train = df.review
test = df.sentiment
test.head()

0    positive
1    positive
2    positive
3    negative
4    positive
Name: sentiment, dtype: object

In [96]:
test = pd.get_dummies(test)
test = test.drop(["negative"],axis=1)
test= test.rename(columns = {"positive":"sentiment"})
test.head()

Unnamed: 0,sentiment
0,1
1,1
2,1
3,0
4,1


In [97]:
test.head()

Unnamed: 0,sentiment
0,1
1,1
2,1
3,0
4,1


In [56]:
print(train)


                                                  review
0      One of the other reviewers has mentioned that ...
1      A wonderful little production. <br /><br />The...
2      I thought this was a wonderful way to spend ti...
3      Basically there's a family where a little boy ...
4      Petter Mattei's "Love in the Time of Money" is...
5      Probably my all-time favorite movie, a story o...
6      I sure would like to see a resurrection of a u...
7      This show was an amazing, fresh & innovative i...
8      Encouraged by the positive comments about this...
9      If you like original gut wrenching laughter yo...
10     Phil the Alien is one of those quirky films wh...
11     I saw this movie when I was about 12 when it c...
12     So im not a big fan of Boll's work but then ag...
13     The cast played Shakespeare.<br /><br />Shakes...
14     This a fantastic movie of three prisoners who ...
15     Kind of drawn in by the erotic scenes, only to...
16     Some films just simply s

In [185]:
X_train, X_test, y_train, y_test = train_test_split(train, test, test_size=0.20, random_state=42,stratify=test)
X_train.shape

(40000,)

In [186]:
cv_model = CountVectorizer()
cv_model.fit(X_train[:5])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [187]:
len(cv_model.get_feature_names())

485

In [188]:
pipe = make_pipeline(CountVectorizer(),LinearSVC())
pipe.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


Pipeline(memory=None,
     steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
  ...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [189]:
pipe.score(X_test,y_test)

0.878

In [190]:
cv_model.fit(X_train)
words = cv_model.get_feature_names()
len(words)

192004

In [191]:
sk_stopwords = stop_words
nltk_stopwords = stopwords

In [192]:
cv_model = CountVectorizer(stop_words = sk_stopwords.ENGLISH_STOP_WORDS)
cv_model.fit(X_train)


CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=frozenset({'be', 'four', 'six', 'them', 'ever', 'up', 'its', 'noone', 'you', 'fifteen', 'must', 'first', 'yourselves', 'ten', 'back', 'etc', 'hundred', 'the', 'an', 'myself', 'me', 'where', 'per', 'ltd', 'mine', 'until', 'not', 'but', 'himself', 'otherwise', 'forty', 'eight', 'meanwhile',...ow', 'elsewhere', 'else', 'is', 'only', 'that', 'ie', 'there', 'cannot', 'from', 'anyhow', 'hasnt'}),
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [193]:
len(cv_model.get_feature_names())

191692

In [194]:
pipe = make_pipeline(CountVectorizer(stop_words = sk_stopwords.ENGLISH_STOP_WORDS),LinearSVC())
pipe.fit(X_train,y_train)
pipe.score(X_test,y_test)

0.8701

In [195]:
cv_model = CountVectorizer(stop_words = sk_stopwords.ENGLISH_STOP_WORDS,min_df=2)
cv_model.fit(X_train)


CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=2,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=frozenset({'be', 'four', 'six', 'them', 'ever', 'up', 'its', 'noone', 'you', 'fifteen', 'must', 'first', 'yourselves', 'ten', 'back', 'etc', 'hundred', 'the', 'an', 'myself', 'me', 'where', 'per', 'ltd', 'mine', 'until', 'not', 'but', 'himself', 'otherwise', 'forty', 'eight', 'meanwhile',...ow', 'elsewhere', 'else', 'is', 'only', 'that', 'ie', 'there', 'cannot', 'from', 'anyhow', 'hasnt'}),
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [196]:
len(cv_model.get_feature_names())

70284

In [197]:
pipe = make_pipeline(CountVectorizer(stop_words = sk_stopwords.ENGLISH_STOP_WORDS,min_df=2),LinearSVC())
pipe.fit(X_train,y_train)
pipe.score(X_test,y_test)

0.8686

In [198]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = stem.WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

In [199]:
cv_model = CountVectorizer(tokenizer=LemmaTokenizer(),stop_words=sk_stopwords.ENGLISH_STOP_WORDS,min_df=2)
cv_model.fit(X_train)

  'stop_words.' % sorted(inconsistent))


CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=2,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=frozenset({'be', 'four', 'six', 'them', 'ever', 'up', 'its', 'noone', 'you', 'fifteen', 'must', 'first', 'yourselves', 'ten', 'back', 'etc', 'hundred', 'the', 'an', 'myself', 'me', 'where', 'per', 'ltd', 'mine', 'until', 'not', 'but', 'himself', 'otherwise', 'forty', 'eight', 'meanwhile',...ow', 'elsewhere', 'else', 'is', 'only', 'that', 'ie', 'there', 'cannot', 'from', 'anyhow', 'hasnt'}),
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<__main__.LemmaTokenizer object at 0x000001E4C594E8D0>,
        vocabulary=None)

In [200]:
len(cv_model.get_feature_names())

63043

In [201]:
# CountVectorizer with linearsvc,tokenizer
pipe = make_pipeline(CountVectorizer(tokenizer=LemmaTokenizer(),stop_words=sk_stopwords.ENGLISH_STOP_WORDS,min_df=2),LinearSVC())
pipe.fit(X_train,y_train)
pipe.score(X_test,y_test)

0.8674

In [202]:
tf_pipe = make_pipeline(TfidfVectorizer(min_df=2),LinearSVC())
tf_pipe.fit(X_train,y_train)
tf_pipe.score(X_test,y_test)

0.9002

In [203]:
# with tokenizer,tfidf and naive bayes  
tf_pipe = make_pipeline(TfidfVectorizer(tokenizer = LemmaTokenizer(),min_df=2,stop_words = sk_stopwords.ENGLISH_STOP_WORDS),
                        MultinomialNB())
tf_pipe.fit(X_train,y_train)
tf_pipe.score(X_test,y_test)

0.8616

In [204]:
# removed tokenizer and added ngram

tf_pipe = make_pipeline(TfidfVectorizer(ngram_range=(1,3),min_df=2,stop_words = sk_stopwords.ENGLISH_STOP_WORDS),
                        MultinomialNB())
tf_pipe.fit(X_train,y_train)
tf_pipe.score(X_test,y_test)

  y = column_or_1d(y, warn=True)


0.8842

In [205]:
# tfid with linearsvc with c=1(best)

tf_pipe = make_pipeline(TfidfVectorizer(min_df=2,stop_words = sk_stopwords.ENGLISH_STOP_WORDS),LinearSVC(C=1)                       )
tf_pipe.fit(X_train,y_train)
tf_pipe.score(X_test,y_test)

  y = column_or_1d(y, warn=True)


0.8959

In [206]:
y_pred = pipe.predict(X_test)

In [207]:
print("CountVectorizer with Lemma Tokenizer,LinearSVC,min_df=2\n")
print(confusion_matrix(y_pred,y_test))
print(classification_report(y_pred,y_test))

CountVectorizer with Lemma Tokenizer,LinearSVC,min_df=2

[[4342  668]
 [ 658 4332]]
              precision    recall  f1-score   support

           0       0.87      0.87      0.87      5010
           1       0.87      0.87      0.87      4990

   micro avg       0.87      0.87      0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [208]:
tfy_pred = tf_pipe.predict(X_test)

In [209]:
print("Tfidf with min_df=2,LinearSVC \n")
print(confusion_matrix(tfy_pred,y_test))
print(classification_report(tfy_pred,y_test))

Tfidf with min_df=2,LinearSVC 

[[4450  491]
 [ 550 4509]]
              precision    recall  f1-score   support

           0       0.89      0.90      0.90      4941
           1       0.90      0.89      0.90      5059

   micro avg       0.90      0.90      0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



a
a
a
a
a
a
a
a
b
b
a
a
a
a
4
