### obtaining the movie review dataset

#### http://ai.stanford.edu/~amaas/data/sentiment/

In [1]:
import pandas as pd
import os

In [19]:
basepath = 'D:\pythonducat\Project\sentiment_analysis\sentiment'

In [20]:
labels = {'pos':1,'neg':0}

In [21]:
df = pd.DataFrame()

In [22]:
for s in ('test','train'):
    for l in ('pos','neg'):
        path = os.path.join(basepath,s,l)
        
        for fil in os.listdir(path):
            
            with open(os.path.join(path,fil),'r') as f:
                txt = f.read()
            df = df.append([[txt,labels[l]]],ignore_index=True)
            
df.columns=['review','sentiment']

In [23]:
df.head()

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,1
1,My boyfriend and I went to watch The Guardian....,1
2,My yardstick for measuring a movie's watch-abi...,1
3,How many movies are there that you can think o...,1
4,This movie was sadly under-promoted but proved...,1


In [24]:
df.sentiment.value_counts()

1    20
0    20
Name: sentiment, dtype: int64

In [25]:
import numpy as np

In [26]:
from sklearn.utils import shuffle

In [27]:
df = shuffle(df)

In [28]:
df.head()

Unnamed: 0,review,sentiment
11,This is a pale imitation of 'Officer and a Gen...,0
27,Although I didn't like Stanley & Iris tremendo...,1
28,Very good drama although it appeared to have a...,1
36,"From the beginning of the movie, it gives the ...",0
6,I was fortunate enough to see this movie on pr...,1


In [29]:
df.to_csv('movie_reviews.csv',index=False)

In [30]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords

In [31]:
stop = stopwords.words('english')

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  Searched in:
    - 'C:\\Users\\Rinkoo/nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - 'C:\\anaconda3\\nltk_data'
    - 'C:\\anaconda3\\share\\nltk_data'
    - 'C:\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\Rinkoo\\AppData\\Roaming\\nltk_data'
**********************************************************************


In [2]:
df_new = pd.read_csv('movie_reviews.csv')

In [3]:
df_new.head()

Unnamed: 0,review,sentiment
0,"""Bon Voyage"" has the fast pace that in some wa...",1
1,"I'm sorry, but this may have been scary in 197...",0
2,Out of all the Mafia movies i have ever seen t...,1
3,Just watched this after my mother brought it b...,1
4,"Bad, bad, movie, so bad it is worth watching. ...",0


In [30]:
df_new.loc[0,'review'][-50:]

' scene. Good advice.<br /><br />I gave it 9 of 10.'

In [31]:
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)',text)
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

# \W =matches any non-alphanumeric character;
# \D = matches any non-digit character
#<[^>]*> :all tag  ex..<br />, <a>
# [^>] :except '>' 

In [32]:
preprocessor("</a>This is a test :-)!</a>")

'this is a test :'

In [33]:
df_new['review'] = df_new['review'].apply(preprocessor)

In [34]:
X_train = df_new.loc[:2500, 'review'].values
y_train = df_new.loc[:2500, 'sentiment'].values
X_test = df_new.loc[2500:5000, 'review'].values
y_test = df_new.loc[2500:5000, 'sentiment'].values

In [35]:
print np.bincount(y_test)
print np.unique(y_test)

[1226 1275]
[0 1]


In [6]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [36]:
tfidf = TfidfVectorizer(stop_words='english')

param_grid = {'clf__C': [1.0, 10.0, 100.0]}

In [37]:
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression())])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',)

#lr_tfidf.steps

In [38]:
gs_lr_tfidf.fit(X_train, y_train)
#print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)
#print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'clf__C': [1.0, 10.0, 100.0]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring='accuracy',
       verbose=0)

In [39]:
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)

Best parameter set: {'clf__C': 100.0} 
CV Accuracy: 0.847


In [40]:
clf = gs_lr_tfidf.best_estimator_

In [41]:
clf.score(X_test,y_test)

0.85405837664934026

In [42]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

In [43]:
nb = Pipeline([('vect', tfidf),
               ('clf', MultinomialNB())])


In [44]:
nb.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True...rue,
        vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [45]:
nb.score(X_train,y_train)

0.97600959616153538

In [46]:
nb.score(X_test,y_test)

0.83086765293882447

In [23]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [24]:
#cv = CountVectorizer(stop_words='english')
cv = TfidfVectorizer(stop_words='english')

In [25]:
new_data = cv.fit_transform(X_train)
new_test = cv.transform(X_test)

In [26]:
new_data.shape

(2501, 28145)

In [27]:
nb = MultinomialNB()
nb.fit(new_data,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [28]:
nb.score(new_test,y_test)

0.83046781287485005

In [29]:
nb.score(new_data,y_train)

0.97560975609756095