In [30]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

import pyprind
import os
import sys
import re

In [2]:
# the IMDB dataset is split across 4 folders: train/pos, train/neg, test/pos, test/neg
# each folder contains many txt files, and each file holds the text of one review
# this cell reads every file from every folder and concatenates them into a single csv

# # path of data folder
# basepath = '../data/aclImdb'

# # encode pos folder to 1, neg folder to 0
# labels = {'pos':1, 'neg': 0}

# # init progress to 50000 iterations (number of sample data)
# pbar = pyprind.ProgBar(50000, stream=sys.stdout)

# df = pd.DataFrame()

# for dataset in ('test', 'train'):
#     for label in ('pos', 'neg'):
#         # construct the folder path of the data
#         path = os.path.join(basepath, dataset, label)

#         for file in sorted(os.listdir(path)):
#             with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
#                 # read the review text of each file
#                 txt = infile.read()

#             # append the review text to df, with the encoded label
#             df = df.append([[txt, labels[label]]], ignore_index=True)

#             pbar.update()  # progress visualization

# df.columns = ['review', 'sentiment']

# # shuffle data
# np.random.seed(0)
# df = df.reindex(np.random.permutation(df.index))

# # save to csv
# df.to_csv('../data/aclImdb/imdb.csv', index=False, encoding='utf-8')

In [3]:
# load the combined review dataset from disk and preview its first rows
df = pd.read_csv(
    '../data/aclImdb/imdb.csv',
    encoding='utf-8',
)
df.head(n=3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


# Text Word Vectorization

In [4]:
# toy corpus of three short documents to vectorize
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet, and one and one is two',
])

# bag-of-words: learn the vocabulary (word -> integer column index) and
# represent every document by how often each word occurs in it
count = CountVectorizer()
bag = count.fit_transform(docs)

# vocabulary mapping first, then the per-document count matrix
print(count.vocabulary_)
print(bag.toarray())

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}
[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


In [5]:
# print floats with two decimals so the weight matrix stays readable
np.set_printoptions(precision=2)

# words that show up in every text say little about any single label;
# tf-idf gives such words a lower weight
tfidf = TfidfTransformer(
    use_idf=True,
    norm='l2',
    smooth_idf=True,
)

# re-weight the raw word counts and show the dense result
print(tfidf.fit_transform(bag).toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


# Text Data Preprocessing

In [6]:
# raw reviews still carry non-text content such as HTML tags; look at the end of the first review
df.at[0, 'review'][-50:]

'is seven.<br /><br />Title (Brazil): Not Available'

In [7]:
# function to remove these
def preprocessor(text):
    """Clean a raw review into lowercase, space-separated words plus emoticons.

    HTML tags are stripped, every run of non-word characters is collapsed to a
    single space, the text is lowercased, and the emoticons found in the text
    are appended at the end with the '-' "nose" removed (':-)' becomes ':)').

    Args:
        text: raw review string.

    Returns:
        The cleaned string.
    """
    # remove html tags (raw strings avoid invalid-escape warnings for the regexes)
    text = re.sub(r'<[^>]*>', '', text)

    # collect emoticons before the punctuation they are made of is stripped
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)

    # collapse all non-word characters to single spaces and lowercase the text
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    suffix = ' '.join(emoticons).replace('-', '')

    # keep the emoticons apart from the last word: without a separator, a text
    # that ends in a word character would produce a glued token like 'world:)'
    if suffix and not cleaned.endswith(' '):
        cleaned += ' '

    return cleaned + suffix

# sanity check: clean the last 50 characters of the first review
preprocessor(df.at[0, 'review'][-50:])

'is seven title brazil not available'

In [8]:
# run the cleaning function over every review in the dataset
df['review'] = df['review'].map(preprocessor)

In [9]:
# tokenize the texts into words
def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the list of words."""
    words = text.split()
    return words

# tokenize and reduce every word to its Porter stem
porter = PorterStemmer()
def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each word."""
    stems = []
    for word in text.split():
        stems.append(porter.stem(word))
    return stems

# quick check
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [10]:
# stop words (common words without specific meaning) can be removed as well;
# tf-idf does something similar by assigning common words less weight
nltk.download('stopwords')
stop = stopwords.words('english')
list(filter(lambda w: w not in stop, tokenizer_porter('a runner likes running and runs a lot')))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['runner', 'like', 'run', 'run', 'lot']

# Training

In [11]:
# split train/test sets: the first 25000 rows train the model, the remaining rows test it
# NOTE: label-based .loc slices include BOTH endpoints, so `.loc[:25000]` and
# `.loc[25000:]` would both contain row 25000 (a test sample leaking into the
# training set). Positional .iloc slices are half-open, keeping the sets disjoint.
n_train = 25000
train_df, test_df = df.iloc[:n_train], df.iloc[n_train:]

X_train = train_df['review'].values
y_train = train_df['sentiment'].values
X_test = test_df['review'].values
y_test = test_df['sentiment'].values

In [12]:
# TfidfVectorizer = CountVectorizer + TfidfTransformer: it counts the words and
# weights them by importance (occurrence in each sample)
tfidf = TfidfVectorizer(
    strip_accents=None,
    lowercase=False,
    preprocessor=None,
)

# training pipeline: tf-idf vectorize the words first, then logistic regression
lr_tfidf = Pipeline(steps=[
    ('vect', tfidf),
    # liblinear is used instead of the default lbfgs solver
    # (original note: it does better on large datasets -- TODO confirm)
    ('clf', LogisticRegression(solver='liblinear')),
])

# hyperparameters to optimize for the vectorizer and the classifier
param_grid = [
    # grid 1: full tf-idf weighting, comparing plain tokenizing against tokenizing + stemming
    {
        'vect__ngram_range': [(1, 1)],
        'vect__stop_words': [None],
        'vect__tokenizer': [tokenizer, tokenizer_porter],
        'clf__penalty': ['l2'],
        'clf__C': [1.0, 10.0]
    },
    # grid 2: idf and normalization switched off (raw term frequencies only),
    # so also check whether removing stop words helps
    {
        'vect__ngram_range': [(1, 1)],
        'vect__stop_words': [stop, None],
        'vect__tokenizer': [tokenizer],
        'vect__use_idf': [False],
        'vect__norm': [None],
        'clf__penalty': ['l2'],
        'clf__C': [1.0, 10.0]
    },
]

# tune the whole pipeline with 5-fold cross-validated grid search
gs_lr_tfidf = GridSearchCV(
    estimator=lr_tfidf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=2,
    n_jobs=-1,
)
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [13]:
# hyperparameter combination the grid search selected (scored by accuracy)
print(f'Best parameter set: {gs_lr_tfidf.best_params_}')

Best parameter set: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x000001D1F5C133A0>}


In [14]:
# best pipeline found by the grid search
clf = gs_lr_tfidf.best_estimator_

# cross-validated accuracy of the best parameters, then accuracy on the test set
print(f'CV Accuracy: {gs_lr_tfidf.best_score_:.3f}')
print(f'Test Accuracy: {clf.score(X_test, y_test):.3f}')

CV Accuracy: 0.897
Test Accuracy: 0.899


# Out-of-Core Learning

In [24]:
# tokenizer function that removes non-words and stop words
def tokenizerFull(text, stop_words=None):
    """Clean a raw review and return its tokens with stop words removed.

    HTML tags are stripped, the text is lowercased and split on runs of
    non-word characters, and the emoticons found in the text are appended as
    separate tokens with the '-' "nose" removed (':-)' becomes ':)').

    Args:
        text: raw review string.
        stop_words: optional iterable of words to drop. Defaults to the
            notebook-level ``stop`` list when omitted.

    Returns:
        List of remaining tokens: the words in order, followed by the emoticons.
    """
    if stop_words is None:
        stop_words = stop
    # set membership is O(1) per token instead of scanning the list every time
    stop_words = frozenset(stop_words)

    # remove html tags (raw strings avoid invalid-escape warnings for the regexes)
    text = re.sub(r'<[^>]*>', '', text)

    # collect emoticons before the punctuation they are made of is stripped
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)

    # collapse all non-word characters to single spaces and lowercase the text
    cleaned = re.sub(r'[\W]+', ' ', text.lower())

    # emoticons are added as their own tokens, so they can never be glued to the
    # last word (string concatenation produced tokens like 'world:)')
    tokens = cleaned.split() + [emo.replace('-', '') for emo in emoticons]

    # remove stop words
    return [w for w in tokens if w not in stop_words]

# function to read one sample at a time from data
def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from the review csv, one per line.

    Each data line is expected to look like ``<review text>,<label>``. The line
    is split at its LAST comma, so commas inside the review are preserved. The
    text is returned exactly as stored in the file (csv quoting is not decoded).

    Args:
        path: path to the csv file; the first line is treated as a header.

    Yields:
        Tuple of the raw review text (str) and its integer label.
    """
    with open(path, 'r', encoding='utf-8') as csv_file:
        next(csv_file, None)  # skip header; the default tolerates an empty file

        # read one sample at a time
        for line in csv_file:
            # strip the newline rather than slicing fixed offsets, so a final
            # line without a trailing newline is parsed correctly too
            line = line.rstrip('\n')
            if not line:
                continue  # ignore blank lines

            text, label = line.rsplit(',', 1)
            yield text, int(label)  # stream out

# test streaming: pull only the first (text, label) pair from a fresh generator
next(stream_docs(path='../data/aclImdb/imdb.csv'))

('"In 1974, the teenager Martha Moxley (Maggie Grace) moves to the high-class area of Belle Haven, Greenwich, Connecticut. On the Mischief Night, eve of Halloween, she was murdered in the backyard of her house and her murder remained unsolved. Twenty-two years later, the writer Mark Fuhrman (Christopher Meloni), who is a former LA detective that has fallen in disgrace for perjury in O.J. Simpson trial and moved to Idaho, decides to investigate the case with his partner Stephen Weeks (Andrew Mitchell) with the purpose of writing a book. The locals squirm and do not welcome them, but with the support of the retired detective Steve Carroll (Robert Forster) that was in charge of the investigation in the 70\'s, they discover the criminal and a net of power and money to cover the murder.<br /><br />""Murder in Greenwich"" is a good TV movie, with the true story of a murder of a fifteen years old girl that was committed by a wealthy teenager whose mother was a Kennedy. The powerful and rich f

In [18]:
# function to get a mini batch of data
def get_minibatch(doc_stream, batch_size):
    """Collect up to ``batch_size`` samples from ``doc_stream``.

    Args:
        doc_stream: iterator yielding ``(text, label)`` pairs.
        batch_size: maximum number of samples to pull from the stream.

    Returns:
        ``(X, y)``: list of texts and list of labels. If the stream runs out
        before the batch is full, the samples read so far are returned as a
        shorter batch. ``(None, None)`` is returned only when the stream was
        already exhausted and nothing could be read.
    """
    X, y = [], []

    try:
        # go through each sample, append to batch feature and label
        for _ in range(batch_size):
            text, label = next(doc_stream)
            X.append(text)
            y.append(label)
    except StopIteration:
        # stream ran dry: keep the partial batch instead of discarding samples
        # that were already consumed from the stream
        if not X:
            return None, None

    return X, y

In [27]:
# CountVectorizer and tf-idf both need the whole dataset, so they are not usable
# on mini batches in out-of-core learning; HashingVectorizer vectorizes words by
# hashing instead and does not need the whole dataset
vect = HashingVectorizer(
    decode_error='ignore',
    n_features=2 ** 21,
    preprocessor=None,
    tokenizer=tokenizerFull,
)

# logistic regression classifier trained with SGD on mini batches of data
clf = SGDClassifier(loss='log_loss', random_state=1)

# data streaming pipeline
doc_stream = stream_docs(path='../data/aclImdb/imdb.csv')

pbar = pyprind.ProgBar(45)

# label set handed to every partial_fit call
classes = np.array([0, 1])

# 45 iterations of 1000 samples each = 45000 training samples
for _ in range(45):
    # pull the next mini batch; stop early once the stream has nothing left
    X_train, y_train = get_minibatch(doc_stream, batch_size=1000)
    if not X_train:
        break

    # HashingVectorizer is only transformed, never fitted
    X_train = vect.transform(X_train)

    # incremental update on this mini batch
    clf.partial_fit(X_train, y_train, classes=classes)

    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:20


In [28]:
# use the next 5000 samples of the stream (the ones left after training) as test set
X_test, y_test = get_minibatch(doc_stream, batch_size=5000)

# vectorize the test texts with the same hashing vectorizer used for training
X_test = vect.transform(X_test)

print(f'Accuracy: {clf.score(X_test, y_test):.3f}')

Accuracy: 0.868


In [29]:
# evaluation is finished, so the test samples can now be used to update the model as well
clf = clf.partial_fit(X_test, y_test)

# Topic Modeling

In [31]:
# Topic modeling: an unsupervised task that discovers topics in a collection of
# texts (similar to clustering)

# read the review csv again from disk
df = pd.read_csv('../data/aclImdb/imdb.csv', encoding='utf-8')

# word counts; max_df and max_features are hyperparameters that can be tuned
count = CountVectorizer(
    stop_words='english',
    max_df=.1,
    max_features=5000,
)
X = count.fit_transform(df['review'].values)

# latent dirichlet allocation with 10 topics
lda = LatentDirichletAllocation(
    n_components=10,
    random_state=123,
    learning_method='batch',  # batch means train on the whole dataset at once
)
X_topics = lda.fit_transform(X)

In [32]:
# list the highest-weighted words of every learned topic
n_top_words = 5
feature_names = count.get_feature_names_out()

for topic_idx, topic in enumerate(lda.components_):
    # indices of the n_top_words largest weights, largest first
    top_indices = topic.argsort()[::-1][:n_top_words]
    top_words = [feature_names[i] for i in top_indices]

    print(f'Topic {(topic_idx + 1)}:')
    print(' '.join(top_words))

Topic 1:
worst minutes awful script stupid
Topic 2:
family mother father children girl
Topic 3:
american war dvd music tv
Topic 4:
human audience cinema art sense
Topic 5:
police guy car dead murder
Topic 6:
horror house sex girl woman
Topic 7:
role performance comedy actor performances
Topic 8:
series episode war episodes tv
Topic 9:
book version original read novel
Topic 10:
action fight guy guys cool
