In [2]:
import pyprind
import pandas as pd
import os

# Root directory of the extracted dataset; it is expected to contain
# <split>/<label>/ sub-directories with one review per file.
basepath = './aclImdb'

# Map the label directory name to the numeric sentiment stored in the frame.
labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)

# Collect (review, sentiment) rows in a plain list and build the DataFrame
# once at the end. Appending to a DataFrame inside the loop copies the whole
# frame on every iteration (quadratic time), and DataFrame.append no longer
# exists in pandas >= 2.0.
records = []
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        # os.listdir returns entries in arbitrary order; sorting makes the
        # row order (and therefore any seeded shuffle applied later)
        # reproducible across machines.
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            records.append((txt, labels[l]))
            pbar.update()
df = pd.DataFrame(records, columns=['review', 'sentiment'])

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:03:30


In [10]:
import numpy as np
# Seed NumPy's global RNG so the permutation below is the same on every run.
np.random.seed(0)

# Draw a random ordering of the existing index labels and reorder the rows
# accordingly, then persist the shuffled frame without the index column.
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)
df.to_csv(path_or_buf='./movie_data.csv', index=False)

In [11]:
# Load the CSV written by the previous cell back into `df`
# and display its first three rows.
df = pd.read_csv(filepath_or_buffer='./movie_data.csv')
df.head(n=3)

Unnamed: 0,review,sentiment
0,at a Saturday matinee in my home town. I went ...,0
1,I love this movie. It is the first film Master...,1
2,"In the voice over which begins the film, Hughi...",1


# Bag-of-Words Model

In [12]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
# Three toy sentences used to illustrate the bag-of-words representation.
docs = np.array([
    'The sun is shining.',
    'The weather is sweet.',
    'The sun is shining and the weather is sweet.',
])

# Fit the vectorizer on the toy corpus and get the sparse count matrix.
count = CountVectorizer()
bag = count.fit_transform(docs)

In [14]:
# Show the term -> column-index mapping learned by the fitted vectorizer.
vocabulary = count.vocabulary_
print(vocabulary)

{'is': 1, 'and': 0, 'the': 5, 'sun': 3, 'weather': 6, 'sweet': 4, 'shining': 2}


In [15]:
# Convert the sparse count matrix to a dense array so it can be printed.
dense_counts = bag.toarray()
print(dense_counts)

[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


In [16]:
# tf-idf: term frequency-inverse document frequency
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()

# Limit printed floats to two decimals (this changes NumPy's global print
# options, so it also affects arrays printed after this cell).
np.set_printoptions(precision=2)

# Re-fit the count vectorizer on the toy corpus, feed the raw counts to the
# tf-idf transformer, and print the resulting weights as a dense array.
term_counts = count.fit_transform(docs)
tfidf_weights = tfidf.fit_transform(term_counts)
print(tfidf_weights.toarray())

[[ 0.    0.43  0.56  0.56  0.    0.43  0.  ]
 [ 0.    0.43  0.    0.    0.56  0.43  0.56]
 [ 0.4   0.48  0.31  0.31  0.31  0.48  0.31]]


# Data Cleaning

In [18]:
# Display the last 50 characters of the review stored at index label 0.
first_review = df.loc[0, 'review']
first_review[-50:]

"ppies only...or if you're stoned. I give this a 1."

In [19]:
import re
def preprocessor(text):
    """Normalize a raw review string for bag-of-words feature extraction.

    Steps performed:
      1. Remove anything that looks like an HTML tag (``<...>``).
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D``, ``:P`` from the
         tag-free text (before lowercasing, so ``D``/``P`` keep their case).
      3. Lowercase the text and collapse every run of non-word characters
         into a single space.
      4. Append the collected emoticons, space-separated and with the
         ``-`` "nose" removed, to the end of the cleaned text.

    Parameters
    ----------
    text : str
        The raw document.

    Returns
    -------
    str
        The cleaned text followed by the emoticons found in it.
    """
    # Raw strings keep `\)`, `\(` and `\W` as regex escapes instead of
    # (invalid) Python string escapes, which raise warnings on modern Python.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    suffix = ' '.join(emoticons).replace('-', '')
    # When the text ends in a word character there is no trailing space, and
    # the first emoticon would be glued onto the last word ("there:)").
    # Insert a separator only in that case so existing outputs are unchanged.
    if suffix and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + suffix

In [20]:
# Run the cleaner on the last 50 characters of the review at index label 0.
review_tail = df.loc[0, 'review'][-50:]
preprocessor(review_tail)

'ppies only or if you re stoned i give this a 1 '

In [21]:
# Sanity check on a small string containing an HTML tag and emoticons.
sample_markup = "</a>This :) is :( a test :-)!"
preprocessor(sample_markup)

'this is a test :) :( :)'

In [22]:
# Apply the cleaner to every review and overwrite the column with the result.
cleaned_reviews = df['review'].apply(preprocessor)
df['review'] = cleaned_reviews

# Stemming