In [1]:
import pandas as pd
import scipy as sp
import numpy as np

In [2]:
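# Download data (this notebook uses the first dataset, extracted to ./review_polarity):
# 1000 positive, 1000 negative reviews:
# http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz
#
# 25000 positive, 25000 negative reviews (larger alternative):
# http://ai.stanford.edu/~amaas/data/sentiment/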

In [3]:
from string import punctuation
from os import listdir
from collections import Counter

In [4]:
# load doc into memory
def load_doc(filename, encoding=None):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding passed to ``open``. The default ``None``
            keeps the platform's default encoding, as before.

    Returns:
        The full text of the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises,
    # which the manual open()/close() pair did not guarantee.
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()

In [5]:
# turn a doc into clean tokens
def clean_doc(doc):
    """Split a document into cleaned word tokens.

    The text is split on whitespace, every punctuation character is
    deleted from each piece, and only pieces that are purely alphabetic
    and longer than one character are kept.

    Args:
        doc: The raw document text.

    Returns:
        A list of cleaned tokens in their original order.
    """
    # translation table that deletes every punctuation character
    strip_punct = str.maketrans('', '', punctuation)
    # lazily strip punctuation from each whitespace-separated piece
    stripped = (piece.translate(strip_punct) for piece in doc.split())
    # keep alphabetic tokens of at least two characters
    return [word for word in stripped if word.isalpha() and len(word) > 1]

In [6]:
# save list to file
def save_list(lines, filename, encoding=None):
    """Write strings to a file, one per line.

    The items are joined with a newline; no trailing newline is added.
    An existing file is overwritten.

    Args:
        lines: Iterable of strings to write.
        filename: Path of the file to create or overwrite.
        encoding: Text encoding passed to ``open``. The default ``None``
            keeps the platform's default encoding, as before.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    data = '\n'.join(lines)
    # The context manager flushes and closes the handle even when
    # write() raises, which the manual open()/close() pair did not.
    with open(filename, 'w', encoding=encoding) as file:
        file.write(data)

In [7]:
# load doc, clean and return line of tokens
def doc_to_line(filename, vocab):
    """Load one document and reduce it to a single line of vocabulary tokens.

    Args:
        filename: Path of the document to load.
        vocab: Container of allowed tokens; a token is kept only if
            ``token in vocab`` is true.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # read the file and tokenise it in one step
    cleaned = clean_doc(load_doc(filename))
    # drop out-of-vocabulary tokens while joining
    return ' '.join(token for token in cleaned if token in vocab)

In [8]:
# load all docs in a directory
def process_docs(directory, vocab):
    """Convert every ``.txt`` file in a directory into a line of tokens.

    Args:
        directory: Folder whose files are processed.
        vocab: Container of allowed tokens, forwarded to ``doc_to_line``.

    Returns:
        A list with one cleaned, vocabulary-filtered line per ``.txt``
        file, in the order ``listdir`` yields the files.
    """
    return [
        # build the full path and clean the document
        doc_to_line(directory + '/' + entry, vocab)
        for entry in listdir(directory)
        # only files with the right extension are processed
        if entry.endswith(".txt")
    ]

In [9]:
# load the vocabulary file and keep its whitespace-separated entries as a set
vocab_filename = 'vocab.txt'
vocab = set(load_doc(vocab_filename).split())

# negative reviews: clean each file against the vocabulary, then save one review per line
negative_lines = process_docs('review_polarity/txt_sentoken/neg', vocab)
save_list(negative_lines, 'negative.txt')

# positive reviews: same treatment
positive_lines = process_docs('review_polarity/txt_sentoken/pos', vocab)
save_list(positive_lines, 'positive.txt')

In [10]:
# report how many reviews were loaded for each class
negative_count = len(negative_lines)
positive_count = len(positive_lines)
print("negative_lines:   %0.0f" % negative_count)
print("positive_lines:   %0.0f" % positive_count)

negative_lines:   1000
positive_lines:   1000


In [11]:
# sanity check: display one cleaned, vocabulary-filtered positive review (list index 4)
positive_lines[4]

'good will hunting is two movies in one an independent take on the struggle of four boston pals and traditional hollywood prodigy child film complete with sporadically moving situations and plenty plenty of shtick unusually directed by gus van sant good will hunting the of its story by the of fresh new talent the film stars matt damon as will hunting as mathematical rebellious whiz kid discovered by college professor stellan skarsgard who places him under psychological supervision with robin williams in nutshell thats it the core of the good will hunting is damon who the script co written by chasing amys ben affleck with just the right amount of warmth sensitivity and humanity to his position as refreshing talented performer but its the acting that hits the mark and damon hits all the right notes flying over robin williams role awakenings was written all over this as devastated shrink who has closed all contact with society due to his wifes tragic death damon effortlessly blends the of

In [12]:
# wrap each list of review strings in a one-column DataFrame
# and tag every row with its class label
positive_reviews = pd.DataFrame(positive_lines).assign(sentiment='positive')
negative_reviews = pd.DataFrame(negative_lines).assign(sentiment='negative')

In [13]:
# stack positive and negative reviews and give the two columns readable names
labelled_reviews = pd.concat([positive_reviews, negative_reviews])
labelled_reviews.columns = ['review', 'sentiment']

# add a 1-based running id (kept as a column and also used as the index)
# and a numeric label: 1 for positive, 0 for negative
all_reviews = (
    labelled_reviews
    .assign(id=range(1, len(labelled_reviews) + 1))
    .set_index('id', drop=False)
    .assign(sentiment_binary=lambda frame: np.where(frame['sentiment'] == 'positive', 1, 0))
)
all_reviews.head()

Unnamed: 0_level_0,review,sentiment,id,sentiment_binary
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,assume nothing the phrase is perhaps one of th...,positive,1,1
2,plot derek zoolander is male model he is also ...,positive,2,1
3,actually am fan of the original or so flick of...,positive,3,1
4,movie thats been as highly built up as the tru...,positive,4,1
5,good will hunting is two movies in one an inde...,positive,5,1


In [14]:
# turn every review into a TF-IDF vector, ignoring English stop words
# NOTE(review): the vectorizer is fit on all reviews, including the ones that
# are later held out as test data by train_test_split -- confirm this is intended.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english')
# learn the vocabulary/IDF weights and vectorise in a single pass
_tf = vectorizer.fit_transform(all_reviews.review)

In [15]:
# densify the TF-IDF matrix into a DataFrame (one row per review,
# one column per term) and label the rows with the review ids
tf = pd.DataFrame(_tf.todense(), index=all_reviews.id)
tf.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,14620,14621,14622,14623,14624,14625,14626,14627,14628,14629
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.420118,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Linear support-vector classifier; only constructed here, it is trained
# on the TF-IDF features in a later cell.
from sklearn.svm import LinearSVC
clf = LinearSVC(dual=False, tol=1e-3)

In [22]:
# split in training and test data (20% of the reviews are held out for testing)
from sklearn.model_selection import train_test_split
# A fixed seed makes the shuffle -- and therefore the accuracy computed from
# this split -- reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    tf, all_reviews.sentiment_binary, test_size=0.2, random_state=42)

In [23]:
# train the classifier on the training split (TF-IDF rows and their 0/1 sentiment labels)
clf.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
     verbose=0)

In [24]:
# predict a 0/1 sentiment label for every review in the held-out test split
pred = clf.predict(X_test)

In [25]:
# compare the predictions with the true test labels and report the
# accuracy score, printed with three decimals
from sklearn import metrics
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

accuracy:   0.820
