# Preprocessing notebook

The IMDB dataset http://ai.stanford.edu/~amaas/data/sentiment/ is composed by 50k labaled reviews, splt evenly, and 50 unlabeled reviews. The training and test sets both consist of 25k text comments. The reviews are split up in single files each containing only one line.

In this notebook we will go through the review files and generate a unified dataframe. Then we will preprocess it for BERT training.

In [1]:
import os
import time

from multiprocessing import Pool

os.chdir('../')

import numpy as np
import pandas as pd
import seaborn as sns
import tqdm.notebook as tnb
import matplotlib.pyplot as plt

In [2]:
dataset_dir = '/media/gio/storage/data/imbd_reviews/aclImdb'
train_dir = '/media/gio/storage/data/imbd_reviews/aclImdb/train'
test_dir = '/media/gio/storage/data/imbd_reviews/aclImdb/test'

n_cores = 4

positive_class = 'pos'
negative_class = 'neg'
positive = 1
negative = 0
class_map = {positive_class: positive, negative_class: negative}

train_dir_pos = os.path.join(train_dir, positive_class)
train_dir_neg = os.path.join(train_dir, negative_class)
test_dir_pos = os.path.join(test_dir, positive_class)
test_dir_neg = os.path.join(test_dir, negative_class)

sets = ['train', 'train', 'test', 'test']
classes = [class_map[positive_class], class_map[negative_class], class_map[positive_class], class_map[negative_class]]
file_dirs = [train_dir_pos, train_dir_neg, test_dir_pos, test_dir_neg]
file_dirs_listed = [[os.path.join(i, j) for j in os.listdir(i)] for i in file_dirs]

assert len(file_dirs) == len(file_dirs_listed)

In [3]:
def imdb_read_worker(dir_list, dir_set, dir_class):
    """ Worker method to read the imdb data files and clean them.
    """
    
    cols = ['id', 'comment_text', 'set', 'class', 'score']
    result_df = pd.DataFrame(columns=cols)
    
    sets = [dir_set] * len(dir_list)
    classes = [dir_class] * len(dir_list)
    textids = []
    scores = []
    comments = []
    order_append = [textids, comments, sets, classes, scores]
    
    for filename in dir_list:
        textid, score = os.path.split(filename)[1].split('_')
        textid = int(textid)
        score = int(score.split('.')[0])

        with open(filename, 'r', encoding='utf-8') as infile:
            line = infile.readlines()
            assert len(line) == 1

            line = line[0].strip().replace("<br>", "")
    
        textids.append(textid)
        scores.append(score)
        comments.append(line)
    
    for i in range(len(order_append)):
        result_df[cols[i]] = order_append[i]
    
    return result_df

In [4]:
def imdb_parallel_reader(dir_list, dir_set, dir_class, n_workers=4):
    
    dir_lists = np.array_split(dir_list, n_workers)
    
    inputs = list(zip(dir_lists, dir_set, dir_class))
    assert len(inputs) == n_workers
    
    pool = Pool(processes=n_workers)
    results = [pool.apply(imdb_read_worker, args=inputs[i]) for i in range(n_workers)]
    
    return pd.concat(results)

In [5]:
start_time = time.time()

res = pd.concat([imdb_parallel_reader(file_dirs_listed[i], [sets[i]]*n_cores, [classes[i]]*n_cores, n_workers=n_cores) for i in range(len(file_dirs_listed))])
res = res.set_index(['id', 'set', 'class'], verify_integrity=True)

print('Processing took {:.2f} seconds'.format(time.time() - start_time))

Processing took 1.28 seconds


In [6]:
res

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,comment_text,score
id,set,class,Unnamed: 3_level_1,Unnamed: 4_level_1
3261,train,1,Nay Sayers of this film are likely bitter from...,9
8907,train,1,I saw this movie at midnight on On Demand the ...,8
12059,train,1,"I love ghost stories in general, but I PARTICU...",9
5284,train,1,Father and son communicate very little. IN fac...,9
11807,train,1,After seeing Dick Tracy in the 6.99$ bin at Fu...,7
...,...,...,...,...
6924,test,0,Owen (David Krumholtz) and Chloe (Denise Richa...,1
6361,test,0,Basically the first two Critters movie were al...,4
10184,test,0,Movies about U.F.O.'s are always a nice way to...,1
7081,test,0,"As a documentary, this is laughable in a campy...",4


In [7]:
res.to_csv(
    os.path.join(dataset_dir, 'labeled_data.csv'),
    encoding='utf-8'
)

In [8]:
test = pd.read_csv(os.path.join(dataset_dir, 'labeled_data.csv'), )

In [9]:
test

Unnamed: 0,id,set,class,comment_text,score
0,3261,train,1,Nay Sayers of this film are likely bitter from...,9
1,8907,train,1,I saw this movie at midnight on On Demand the ...,8
2,12059,train,1,"I love ghost stories in general, but I PARTICU...",9
3,5284,train,1,Father and son communicate very little. IN fac...,9
4,11807,train,1,After seeing Dick Tracy in the 6.99$ bin at Fu...,7
...,...,...,...,...,...
49995,6924,test,0,Owen (David Krumholtz) and Chloe (Denise Richa...,1
49996,6361,test,0,Basically the first two Critters movie were al...,4
49997,10184,test,0,Movies about U.F.O.'s are always a nice way to...,1
49998,7081,test,0,"As a documentary, this is laughable in a campy...",4
