## Loading Data and Preprocessing

In [1]:
import numpy as np
import os, sys
import pdb
import matplotlib
import matplotlib.pyplot as plt
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer

TRAIN_PATH = '../aclImdb/train/'
TEST_PATH = '../aclImdb/test/'

# each data sample (review) is a document of words
# each review is either negative (0) or positive (1)
reviews_train = load_files(TRAIN_PATH)
data_train, targets_train = reviews_train.data, reviews_train.target
print("Number of reviews: {}".format(type(data_train))) # 25000
print("Samples per class (training): {}".format(np.bincount(targets_train)))

reviews_test = load_files(TEST_PATH)
data_test, targets_test = reviews_test.data, reviews_test.target
print("Number of reviews: {}".format(type(data_train))) # 25000
print("Samples per class (training): {}".format(np.bincount(targets_train)))

# remove line breakers
data_train = [review.replace(b"<br />", b" ") for review in data_train]
data_test = [review.replace(b"<br />", b" ") for review in data_test]

print("Loading Completed!")

Number of reviews: <type 'list'>
Samples per class (training): [12500 12500]
Number of reviews: <type 'list'>
Samples per class (training): [12500 12500]
Loading Completed!


## Vectorize reviews

In [2]:
vect = CountVectorizer()
vect.fit(data_train)
feature_names = vect.get_feature_names()
print("Size of vocabulary: {}".format(len(feature_names)))
print("Features 20010 to 20030:\n{}".format(feature_names[20010:20030]))

X_train = vect.transform(data_train) # 25000 x 74849

Size of vocabulary: 74849
Features 20010 to 20030:
[u'dratted', u'draub', u'draught', u'draughts', u'draughtswoman', u'draw', u'drawback', u'drawbacks', u'drawer', u'drawers', u'drawing', u'drawings', u'drawl', u'drawled', u'drawling', u'drawn', u'draws', u'draza', u'dre', u'drea']


## Baseline (Logistic regresssion)

In [3]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
scores = cross_val_score(LogisticRegression(solver='lbfgs',max_iter=200), X_train, targets_train, cv=5)
print("Mean cross-validation accuracy: {:.2f}".format(np.mean(scores)))



Mean cross-validation accuracy: 0.88


## Visualization

In [4]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import pandas as pd
from PIL import Image

In [5]:
vect_fit = vect.fit_transform(data_train)

neg_include = targets_train == 0
pos_include = targets_train == 1
neg_word_freq = np.array(vect_fit[neg_include].sum(axis=0))[0]
pos_word_freq = np.array(vect_fit[pos_include].sum(axis=0))[0]
assert neg_include.sum() + pos_include.sum() == len(targets_train)

neg_sorted_names = [name for _, name in sorted(zip(neg_word_freq, feature_names))]
pos_sorted_names = [name for _, name in sorted(zip(pos_word_freq, feature_names))]
neg_sorted_freq = sorted(neg_word_freq)
pos_sorted_freq = sorted(pos_word_freq)

# Some wrongly spelled words in reviews
print(neg_sorted_names[20000:20010])
print(pos_sorted_names[20000:20010])

[u'whelan', u'whelming', u'whereever', u'whetted', u'whic', u'whick', u'whidbey', u'whig', u'whigham', u'whiile']
[u'anbuselvan', u'anchorpoint', u'anch\xeda', u'ancona', u'anda', u'andaaz', u'andalusia', u'anddd', u'andelou', u'andersen']


In [None]:
# make plot
# print(list(range(len(neg_sorted_freq[0:50]))))
fig, axs = plt.subplots(1,2)
axs[0].bar(list(range(len(neg_sorted_freq[0:50]))), neg_sorted_freq[0:50])
axs[0].set_xticks(list(range(len(neg_sorted_freq[0:50]))), neg_sorted_names[0:50])
axs[1].bar(list(range(len(pos_sorted_freq[0:50]))), pos_sorted_freq[0:50])
axs[1].set_xticks(list(range(len(pos_sorted_freq[0:50]))), pos_sorted_names[0:50])
plt.show()