## Loading Data and Preprocessing

In [17]:
import numpy as np
import os, sys
import pdb
import matplotlib
import matplotlib.pyplot as plt
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer

TRAIN_PATH = '../aclImdb/train/'
TEST_PATH = '../aclImdb/test/'

# each data sample (review) is a document of words
# each review is either negative (0) or positive (1)
reviews_train = load_files(TRAIN_PATH)
data_train, targets_train = reviews_train.data, reviews_train.target
print("Number of reviews: {}".format(type(data_train))) # 25000
print("Samples per class (training): {}".format(np.bincount(targets_train)))

reviews_test = load_files(TEST_PATH)
data_test, targets_test = reviews_test.data, reviews_test.target
print("Number of reviews: {}".format(type(data_train))) # 25000
print("Samples per class (training): {}".format(np.bincount(targets_train)))

# remove line breakers
data_train = [review.replace(b"<br />", b" ") for review in data_train]
data_test = [review.replace(b"<br />", b" ") for review in data_test]

print("Loading Completed!")

Number of reviews: <class 'list'>
Samples per class (training): [12500 12500]
Number of reviews: <class 'list'>
Samples per class (training): [12500 12500]
Loading Completed!


## Vectorize reviews

In [14]:
vect = CountVectorizer()
vect.fit(data_train)
feature_names = vect.get_feature_names()
print("Size of vocabulary: {}".format(len(feature_names)))
print("Features 20010 to 20030:\n{}".format(feature_names[20010:20030]))

X_train = vect.transform(data_train) # 25000 x 74849
X_test = vect.transform(data_test)
print(X_train.shape, X_test.shape)

Size of vocabulary: 74849
Features 20010 to 20030:
['dratted', 'draub', 'draught', 'draughts', 'draughtswoman', 'draw', 'drawback', 'drawbacks', 'drawer', 'drawers', 'drawing', 'drawings', 'drawl', 'drawled', 'drawling', 'drawn', 'draws', 'draza', 'dre', 'drea']
(25000, 74849) (25000, 74849)


## Training Validation Set Split

In [20]:
from sklearn.model_selection import train_test_split
X_train_real, X_val, targets_train_real, targets_val = train_test_split(X_train, targets_train, test_size=0.2, random_state=42)
print(X_train_real.shape, X_val.shape)

(20000, 74849) (5000, 74849)


## Baseline (Logistic regresssion)

In [22]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
# scores = cross_val_score(LogisticRegression(solver='lbfgs',max_iter=200), X_train, targets_train, cv=5)
# print("Mean cross-validation accuracy: {:.2f}".format(np.mean(scores)))
clf = LogisticRegression(random_state=0).fit(X_train_real, targets_train_real)
pred = clf.predict(X_test)
# print((targets_test == pred)
print((pred == targets_test).sum() / len(pred))

0.86432


## Neural Network

In [26]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(10, 10, 20, 20, 10), random_state=1)
clf.fit(X_train_real, targets_train_real)
pred = clf.predict(X_test)
print((pred == targets_test).sum() / len(pred))

0.85832


## MLP Architecture Search

In [34]:
arch_list = [[], [], [], [], [] ,[]]
clf = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(8,8,16,16,8,8), activation='identity', random_state=1)
clf.fit(X_train_real, targets_train_real)
pred = clf.predict(X_val)
print("Validation accuracy:", (pred == targets_val).sum() / len(pred))
pred = clf.predict(X_test)
print("Test accuracy:", (pred == targets_test).sum() / len(pred))

Validation accuracy: 0.8666
Test accuracy: 0.84684


## PCA

In [43]:
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import random as sparse_random
from sklearn.random_projection import sparse_random_matrix

svd = TruncatedSVD(n_components=1000, n_iter=7, random_state=42)
svd.fit(X_train_real)
print(svd.explained_variance_ratio_.sum())
X_train_PCA = svd.transform(X_train_real)
print(X_train_PCA.shape)

svd = TruncatedSVD(n_components=1000, n_iter=7, random_state=42)
svd.fit(X_val)
print(svd.explained_variance_ratio_.sum())
X_val_PCA = svd.transform(X_val)
print(X_val_PCA.shape)

svd = TruncatedSVD(n_components=1000, n_iter=7, random_state=42)
svd.fit(X_test)
print(svd.explained_variance_ratio_.sum())
X_test_PCA = svd.transform(X_test)
print(X_test_PCA.shape)

0.8867670410843221
(20000, 1000)
0.9272125141210419
(5000, 1000)
0.8876231102154288
(25000, 1000)


In [None]:
c = np.random.randint(1, 5, size=2)

fig = plt.figure()
for i, label in enumerate([0, 1]):
    print(label)
    plt.scatter(X_train_PCA[targets_train_real == label, 0], X_train_PCA[targets_train_real == label, 1], label=str(label, 'utf-8'), cmap="Spectral", marker='.', alpha=0.5)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.legend()
plt.grid(linestyle='--')
plt.title('PCA of Pancancer dataset')
plt.show();
# fig.savefig("pca_larger.png", dpi=fig.dpi)

## Visualization

In [4]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import pandas as pd
from PIL import Image

In [None]:
vect_fit = vect.fit_transform(data_train)

neg_include = targets_train == 0
pos_include = targets_train == 1
neg_word_freq = np.array(vect_fit[neg_include].sum(axis=0))[0]
pos_word_freq = np.array(vect_fit[pos_include].sum(axis=0))[0]
assert neg_include.sum() + pos_include.sum() == len(targets_train)

neg_sorted_names = [name for _, name in sorted(zip(neg_word_freq, feature_names))]
pos_sorted_names = [name for _, name in sorted(zip(pos_word_freq, feature_names))]
neg_sorted_freq = sorted(neg_word_freq)
pos_sorted_freq = sorted(pos_word_freq)

# Some wrongly spelled words in reviews
print(neg_sorted_names[20000:20010])
print(pos_sorted_names[20000:20010])

[u'whelan', u'whelming', u'whereever', u'whetted', u'whic', u'whick', u'whidbey', u'whig', u'whigham', u'whiile']
[u'anbuselvan', u'anchorpoint', u'anch\xeda', u'ancona', u'anda', u'andaaz', u'andalusia', u'anddd', u'andelou', u'andersen']


In [None]:
# make plot
# print(list(range(len(neg_sorted_freq[0:50]))))
fig, axs = plt.subplots(1,2)
axs[0].bar(list(range(len(neg_sorted_freq[0:10]))), neg_sorted_freq[0:10])
# axs[0].set_xticks(list(range(len(neg_sorted_freq[0:50]))), neg_sorted_names[0:50])
# axs[1].bar(list(range(len(pos_sorted_freq[0:50]))), pos_sorted_freq[0:50])
# axs[1].set_xticks(list(range(len(pos_sorted_freq[0:50]))), pos_sorted_names[0:50])
plt.show()