## Applying Machine Learning To Sentiment Analysis

1. Cleaning and preparing text data
2. Building feature vectors from text documents
3. Training a machine learning model to classify positive and negative movie reviews
4. Working with large text datasets using out-of-core learning
5. Inferring topics from document collections for categorization

In [1]:
import pyprind
import pandas as pd
import numpy as np
import os

In [4]:
basepath = './Data/aclImdb'

labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)
df = pd.DataFrame()
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
            pbar.update()
df.columns = ['review', 'sentiment']

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:03:24


In [5]:
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))

In [6]:
df.to_csv('movie_data.csv',index=False, encoding='utf-8')

In [7]:
df = pd.read_csv('movie_data.csv', encoding='utf-8')
df.head(3)

Unnamed: 0,review,sentiment
0,My family and I normally do not watch local mo...,1
1,"Believe it or not, this was at one time the wo...",0
2,"After some internet surfing, I found the ""Home...",0


### Transforming words into Feature Vectors

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
docs = np.array([
        'The Arrow is green.'
        'The Flash is red'
        'Arrow is green and Flash is red'
    ])

In [17]:
bag = count.fit_transform(docs)

In [18]:
print(count.vocabulary_)

{u'and': 0, u'redarrow': 6, u'is': 4, u'flash': 2, u'green': 3, u'arrow': 1, u'the': 7, u'red': 5}


In [19]:
print(bag.toarray())

[[1 1 2 2 4 1 1 2]]


In [20]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer(use_idf=True,
                         norm='l2',
                         smooth_idf=True)

In [21]:
np.set_printoptions(precision=2)

In [22]:
print(tfidf.fit_transform(count.fit_transform(docs)))

  (0, 7)	0.35355339059327373
  (0, 1)	0.17677669529663687
  (0, 4)	0.7071067811865475
  (0, 3)	0.35355339059327373
  (0, 2)	0.35355339059327373
  (0, 6)	0.17677669529663687
  (0, 0)	0.17677669529663687
  (0, 5)	0.17677669529663687


In [23]:
df.loc[0,'review'][-50:]

u'to Star Cinema!! Way to go, Jericho and Claudine!!'

In [24]:
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    return text

In [25]:
preprocessor(df.loc[0,'review'][-50:])

u'to star cinema way to go jericho and claudine '

In [26]:
df['review'] = df['review'].apply(preprocessor)

In [30]:
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()

def tokenizer(text):
    return text.split()


def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]


from nltk.corpus import stopwords

stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:]
if w not in stop]

['runner', u'like', u'run', u'run', 'lot']

In [31]:
Xtrain = df.loc[: 25000, 'review'].values
Xtest = df.loc[: 25000, 'sentiment'].values
ytrain = df.loc[25000: , 'review'].values
ytest = df.loc[25000: , 'sentiment'].values

In [36]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              ]

lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=-1)

In [None]:
gs_lr_tfidf.fit(X_train, y_train)

In [None]:
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)

In [None]:
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))