### Movie Review Sentiment Analysis Using TF-IDF
We use term frequency–inverse document frequency (TF-IDF) to weight the words in each review, build a vocabulary from the training set, and represent every review as a feature vector for a linear SVM classifier.

#### Import statements

In [1]:
import sys
import os

from time import time
# Use scikit-learn to do the vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
# SVM as the classifier
from sklearn import svm
# Just for reporting purposes
from sklearn.metrics import classification_report

#### Data read

In [2]:
# Folder with movie review files. Set the REVIEW_POLARITY_DIR environment
# variable to point at another copy of the dataset without editing this cell.
data_dir = os.environ.get(
    "REVIEW_POLARITY_DIR",
    "/Users/<>/Documents/Data/review_polarity/txt_sentoken")
# Class names for classification; each one is also a sub-folder of data_dir
classes = ['pos', 'neg']

# Read the data from the path
train_data = []
train_labels = []
test_data = []
test_labels = []
for curr_class in classes:
    dirname = os.path.join(data_dir, curr_class)
    # os.listdir() returns entries in arbitrary order; sort them so the sample
    # order (and any fixed index into these lists) is reproducible across runs.
    for fname in sorted(os.listdir(dirname)):
        # File names are of the form cvxxx_xxxxx.txt; skip anything else
        # (e.g. hidden OS metadata files) so it is not treated as a review.
        if not fname.endswith('.txt'):
            continue
        # Decode explicitly rather than with the platform default encoding;
        # undecodable bytes are replaced instead of aborting the whole load.
        with open(os.path.join(dirname, fname), 'r',
                  encoding='utf-8', errors='replace') as f:
            content = f.read()
        # Use files that start with cv9 (cv9xx_xxxxx.txt) for test,
        # everything else for training.
        if fname.startswith('cv9'):
            test_data.append(content)
            test_labels.append(curr_class)
        else:
            train_data.append(content)
            train_labels.append(curr_class)

In [3]:
# Report the size of each split, then preview one review from each
banner = "**" * 30
n_train, n_test = len(train_data), len(test_data)

print(banner)
print("# of train data samples:\t%d\n# of test  data samples:\t%d" % (n_train, n_test))
print(banner)

# Fixed sample position; it must be a valid index into both splits
idx = 150

# Show only the first 100 characters of the chosen training review
train_preview = train_data[idx][:100]
print("Train review: [%s] with sentiment: [%s]" % (train_preview, train_labels[idx]))

# Same preview for the test review at that position
test_preview = test_data[idx][:100]
print("Test review: [%s] with sentiment: [%s]" % (test_preview, test_labels[idx]))

************************************************************
# of train data samples:	1800
# of test  data samples:	200
************************************************************
Train review: [in recent years , harrison ford has been such a grave screen presence , scowling through the likes o] with sentiment: [pos]
Test review: [the most interesting part of " can't hardly wait " just happens to be not only the most human , but ] with sentiment: [neg]


#### Build vocabulary and vectorize the data

In [4]:
# Settings for the TF-IDF feature extractor
tfidf_options = dict(
    min_df=5,            # integer: drop terms found in fewer than 5 documents
    max_df=0.8,          # float: drop terms found in more than 80% of documents
    sublinear_tf=True,   # scale term frequency sub-linearly (1 + log(tf))
    use_idf=True,        # apply inverse-document-frequency reweighting
)
vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary and idf weights from the training reviews only ...
train_vectors = vectorizer.fit_transform(train_data)
# ... then map the test reviews onto that same vocabulary
test_vectors = vectorizer.transform(test_data)

In [5]:
# Number of terms in the vocabulary learned by the fitted vectorizer
vocab_size = len(vectorizer.vocabulary_)
print("Vocabulary length: %d" % vocab_size)

Vocabulary length: 12495


#### Train a linear SVM classifier 

In [6]:
# Support vector classifier with a linear kernel
classifier = svm.SVC(kernel='linear')

# Take a wall-clock reading around each phase: fit, then predict
t0 = time()
classifier.fit(train_vectors, train_labels)
t1 = time()
prediction = classifier.predict(test_vectors)
t2 = time()

# Elapsed seconds for training and for prediction
time_train, time_predict = t1 - t0, t2 - t1

#### Check classification performance

In [7]:
# Show the timings, followed by the classification report for the test split
print("Classification Results")
timings = (time_train, time_predict)
print("Training time: %.3fs; Prediction time: %.3fs" % timings)

report = classification_report(test_labels, prediction)
print(report)

Classification Results
Training time: 7.215s; Prediction time: 0.679s
             precision    recall  f1-score   support

        neg       0.91      0.92      0.92       100
        pos       0.92      0.91      0.91       100

avg / total       0.92      0.92      0.91       200

