## Create a dictionary from the textual aclImdb dataset and convert the texts into a numerical dataset in which every word is replaced by its id in that dictionary.
Borrowed from http://deeplearning.net/tutorial/code/imdb_preprocess.py

In [1]:
import numpy
import pickle as pkl

from collections import OrderedDict

import glob
import os
import sys

from subprocess import Popen, PIPE

In [2]:
def tokenize(sentences):
    """Tokenize a list of sentences with the external tokenizer script.

    The sentences are joined with newlines, piped as UTF-8 through the
    command stored in the module-level ``tokenizer_cmd`` and the tokenizer's
    standard output is split back into one string per line.

    Args:
        sentences: iterable of str, one sentence/document per item. Items
            must not contain newlines themselves, otherwise the output lines
            no longer correspond one-to-one to the input items.

    Returns:
        list of str: the tokenized lines, in the order they were produced.

    Raises:
        RuntimeError: if the tokenizer process exits with a non-zero status
            (e.g. the script or its prefix files are missing). Previously
            such a failure went unnoticed and yielded an empty/partial list.
    """
    print('Tokenizing..')
    text = "\n".join(sentences)

    tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE)
    tok_text, _ = tokenizer.communicate(text.encode('utf-8'))
    if tokenizer.returncode != 0:
        raise RuntimeError('tokenizer command %r exited with status %d'
                           % (tokenizer_cmd, tokenizer.returncode))

    # The output is assumed to end with a newline, so the last element of the
    # split is an empty string and is dropped.
    toks = tok_text.decode('utf-8').split('\n')[:-1]
    return toks

The tokenizer script is downloaded from https://github.com/moses-smt/mosesdecoder/tree/master/scripts/tokenizer.

**Note:** you also need to download the relevant nonbreaking_prefixes files from https://github.com/moses-smt/mosesdecoder/tree/master/scripts/share/nonbreaking_prefixes. If only English is needed, downloading nonbreaking_prefix.en is enough.

**The Moses tokenizer adds a space in front of punctuation marks (e.g. '.', '!').**

In [3]:
# Command line that tokenize() launches as a subprocess: the locally
# downloaded tokenizer script with language 'en'.
# NOTE(review): '-q' is presumably the script's quiet flag and the trailing
# '-' is passed through as-is -- confirm against the tokenizer.perl usage text.
tokenizer_cmd = ['./script/tokenizer.perl', '-l', 'en', '-q', '-' ] 
# Two sample review snippets used as a quick smoke test of the tokenizer.
# The backslash at the end of the first line continues the string literal,
# so the first item is a single line of text.
text = ['For a movie that gets no respect there sure are a lot of memorable quotes listed for this gem. \
Imagine a movie where Joe Piscopo is actually funny! Maureen Stapleton is a scene stealer.',
'The Moroni character is an absolute scream. Watch for Alan "The Skipper" Hale jr. as a police Sgt.']
# Expect one tokenized string per input item.
testOutput = tokenize(text)
print(testOutput)

Tokenizing..
['For a movie that gets no respect there sure are a lot of memorable quotes listed for this gem . Imagine a movie where Joe Piscopo is actually funny ! Maureen Stapleton is a scene stealer .', 'The Moroni character is an absolute scream . Watch for Alan &quot; The Skipper &quot; Hale jr. as a police Sgt.']


In [4]:
def build_dict(path):
    """Build a word -> id dictionary from the training texts.

    Reads the first line of every ``*.txt`` file under ``<path>/pos`` and
    ``<path>/neg``, tokenizes the texts with ``tokenize`` and ranks the
    lower-cased, whitespace-separated tokens by frequency.

    Args:
        path: directory containing the ``pos`` and ``neg`` sub-directories.
            May be absolute or relative to the current working directory.

    Returns:
        dict mapping each word to an integer id. The most frequent word gets
        id 2, the next one 3, and so on; ids 0 and 1 are left unassigned
        (1 is reserved for unknown words).
    """
    texts = []
    # Put the directory into the glob pattern instead of os.chdir()-ing into
    # it: the working directory is never changed, so a relative ``path``
    # works for both sub-directories and nothing has to be restored if
    # reading a file fails. glob.escape() keeps special characters in the
    # directory name from being treated as wildcards.
    for label in ('pos', 'neg'):
        label_dir = glob.escape(os.path.join(path, label))
        for ff in glob.glob(os.path.join(label_dir, '*.txt')):
            # Read as UTF-8 explicitly; tokenize() encodes the text as UTF-8.
            with open(ff, 'r', encoding='utf-8') as f:
                texts.append(f.readline().strip())

    sentences = tokenize(texts)

    print('Building dictionary..')
    wordcount = dict()
    for ss in sentences:
        for w in ss.strip().lower().split():
            wordcount[w] = wordcount.get(w, 0) + 1

    counts = list(wordcount.values())
    keys = list(wordcount.keys())

    # Positions of the words ordered from most to least frequent.
    sorted_idx = numpy.argsort(counts)[::-1]

    worddict = dict()

    for idx, ss in enumerate(sorted_idx):
        worddict[keys[ss]] = idx+2  # leave 0 and 1 (UNK)

    print(numpy.sum(counts), ' total words ', len(keys), ' unique words')

    return worddict

In [5]:
# Root directory of the extracted aclImdb dataset. Set the ACLIMDB_DIR
# environment variable to point at your own copy; the original author's
# location is kept as the fallback so existing runs behave as before.
# os.path.join(..., '') guarantees a trailing separator, so sub-paths built
# by plain concatenation (data_path + 'train/pos') stay valid.
data_path = os.path.join(
    os.environ.get(
        'ACLIMDB_DIR',
        '/Users/lifa08/Local_documents/Machine_Learning/Miniproject_test/aclImdb/'),
    '')
dictionary = build_dict(os.path.join(data_path, 'train'))

Tokenizing..
Building dictionary..
7113750  total words  101743  unique words


In [6]:
def word_to_id(path, dictionary):
    """Convert the texts in a directory into lists of dictionary ids.

    Reads the first line of every ``*.txt`` file in ``path``, tokenizes the
    texts with ``tokenize`` and replaces each lower-cased,
    whitespace-separated token by its id.

    Args:
        path: directory holding the ``*.txt`` files (absolute or relative).
        dictionary: dict mapping word -> integer id, as returned by
            ``build_dict``.

    Returns:
        list of list of int: one id sequence per tokenized text. Words that
        are not in ``dictionary`` are encoded as 1 (UNK).
    """
    texts = []
    # Glob with the directory in the pattern rather than os.chdir()-ing into
    # it, so the working directory is left untouched even if a read fails.
    # glob.escape() keeps special characters in ``path`` from acting as
    # wildcards.
    for ff in glob.glob(os.path.join(glob.escape(path), '*.txt')):
        # Read as UTF-8 explicitly; tokenize() encodes the text as UTF-8.
        with open(ff, 'r', encoding='utf-8') as f:
            texts.append(f.readline().strip())

    sentences = tokenize(texts)

    seqs = [None] * len(sentences)
    for idx, ss in enumerate(sentences):
        words = ss.strip().lower().split()
        seqs[idx] = [dictionary.get(w, 1) for w in words]

    return seqs

In [7]:
train_x_pos = word_to_id(os.path.join(data_path, 'train', 'pos'), dictionary)
# The id sequences have different lengths, so this is a ragged list of lists;
# numpy.array() on such input raises ValueError on NumPy >= 1.24. Report the
# number of sequences directly, formatted like a 1-D shape tuple.
print((len(train_x_pos),))
print(train_x_pos[0])

Tokenizing..
(12500,)
[23, 6, 25, 18, 225, 70, 1179, 50, 263, 35, 6, 189, 7, 902, 4332, 3514, 23, 17, 1520, 4, 834, 6, 25, 132, 882, 14843, 10, 178, 180, 40, 7230, 14931, 10, 6, 151, 20385, 4, 2, 33234, 120, 10, 45, 1531, 1967, 4, 121, 23, 1537, 19, 2, 18972, 19, 8530, 4407, 22, 6, 570, 7976]


In [8]:
def text_to_ids(path):
    """Convert the whole dataset to id sequences and pickle the result.

    Builds the dictionary from ``<path>/train``, converts the ``pos`` and
    ``neg`` texts of both the ``train`` and ``test`` splits to id sequences
    and writes two files into the current working directory:

    * ``imdb.pkl`` -- two consecutive pickles: ``(train_x, train_y)``
      followed by ``(test_x, test_y)``. Labels are 1 for ``pos`` and 0 for
      ``neg``; within each split the positive samples come first.
    * ``imdb.dict.pkl`` -- the word -> id dictionary.

    Args:
        path: root directory of the dataset, containing ``train`` and
            ``test``. A trailing path separator is optional.

    Returns:
        None.
    """
    # os.path.join() works with or without a trailing separator on ``path``;
    # plain string concatenation silently produced '<path>train/pos' when the
    # separator was missing.
    train_dir = os.path.join(path, 'train')
    test_dir = os.path.join(path, 'test')

    dictionary = build_dict(train_dir)

    train_x_pos = word_to_id(os.path.join(train_dir, 'pos'), dictionary)
    train_x_neg = word_to_id(os.path.join(train_dir, 'neg'), dictionary)
    train_x = train_x_pos + train_x_neg
    train_y = [1] * len(train_x_pos) + [0] * len(train_x_neg)

    test_x_pos = word_to_id(os.path.join(test_dir, 'pos'), dictionary)
    test_x_neg = word_to_id(os.path.join(test_dir, 'neg'), dictionary)
    test_x = test_x_pos + test_x_neg
    test_y = [1] * len(test_x_pos) + [0] * len(test_x_neg)

    # ``with`` closes the files even if pickling raises.
    # Protocol -1 selects the highest pickle protocol available.
    with open('imdb.pkl', 'wb') as f:
        pkl.dump((train_x, train_y), f, -1)
        pkl.dump((test_x, test_y), f, -1)

    with open('imdb.dict.pkl', 'wb') as f:
        pkl.dump(dictionary, f, -1)

In [9]:
# Run the full conversion: rebuilds the dictionary from the train split and
# writes imdb.pkl and imdb.dict.pkl into the current working directory.
text_to_ids(data_path)

Tokenizing..
Building dictionary..
7113750  total words  101743  unique words
Tokenizing..
Tokenizing..
Tokenizing..
Tokenizing..
