Effective Use of Word Order for Text Categorization with Convolutional Neural Networks
 
https://arxiv.org/abs/1412.1058


In [1]:
import tarfile,os,sys, re
is_eager_exec_init=False

In [3]:
import numpy as np
import tensorflow as tf
if not is_eager_exec_init:
    tf.enable_eager_execution()
    is_eager_exec_init=True
from tensorflow.python.keras.preprocessing.text import Tokenizer, one_hot
import tensorflow.contrib.eager as tfe

In [4]:
test_neg_files = []
test_pos_files = []
train_neg_files = []
train_pos_files = []

with tarfile.open('/data/datasets/aclImdb_v1.tar') as tar:
    for mbr in tar.getmembers():
        matches = re.findall('/test/neg/.*\.txt',mbr.name)
        if len(matches)==1:
            f=tar.extractfile(mbr)
            content=f.read()        
            test_neg_files.append((mbr,content))
        matches = re.findall('/test/pos/.*\.txt',mbr.name)
        if len(matches)==1:
            f=tar.extractfile(mbr)
            content=f.read()  
            test_pos_files.append((mbr, content))
        matches = re.findall('/train/neg/.*\.txt',mbr.name)
        if len(matches)==1:
            f=tar.extractfile(mbr)
            content=f.read()        
            train_neg_files.append((mbr,content))
        matches = re.findall('/train/pos/.*\.txt',mbr.name)
        if len(matches)==1:
            f=tar.extractfile(mbr)
            content=f.read()  
            train_pos_files.append((mbr, content))


In [5]:
train_neg = [txt.decode("utf-8")  for file,txt in train_neg_files]

In [6]:
train_neg[:1]

['I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, e

In [7]:
# Use spacy to remove stop words first
import spacy
nlp = spacy.load('en')

In [8]:
# test on toy dataset

In [9]:
vocab_nb = 10000

def clean_docs(docs):
    # clean docs by:
    # - removing stop words
    docs_wtho_stop = []
    for raw_doc in docs:
        doc = nlp(raw_doc)
        doc_wtho_stop = ""
        for tok in doc:
            if not tok.is_stop:
                doc_wtho_stop = doc_wtho_stop +tok.text+ " "
        docs_wtho_stop.append(doc_wtho_stop)
    return docs_wtho_stop

In [10]:
def docs_to_sequences(docs, vocab_nb=10000):
    # create the tokenizer
    t = Tokenizer(num_words=vocab_nb)
    # fit the tokenizer on the documents
    t.fit_on_texts(docs)
    # create each doc as a list of integer indices
    docs_seq_int = t.texts_to_sequences(docs)
    doc_nb = len(docs)
    max_length = len(sorted(docs_seq_int,key=len, reverse=True)[0])
    
    # create a dense array of shape (doc_nb_max_length). Each cell contains a int (the indice of word)
    docs_seq_dense = np.array([seq_int+[0]*(max_length-len(seq_int)) for seq_int in docs_seq_int])
    
    
    
    return docs_seq_dense
    

In [11]:
do_test=True
if do_test:
    docs = ['Well done!',
            'Good work, good effort',
            'Great effort guys',
            'nice work my friend',
            'Excellent job my friend!']
    cleaned_docs = clean_docs(docs)
    print(cleaned_docs)
    doc_sequences = docs_to_sequences(cleaned_docs)
    print(doc_sequences)
    

['Well ! ', 'Good work , good effort ', 'Great effort guys ', 'nice work friend ', 'Excellent job friend ! ']
[[ 5  0  0  0]
 [ 1  2  1  3]
 [ 6  3  7  0]
 [ 8  2  4  0]
 [ 9 10  4  0]]


In [12]:
tf.executing_eagerly()
docs_one_hot = tf.one_hot(doc_sequences,depth=10)
docs_one_hot

<tf.Tensor: id=4, shape=(5, 4, 10), dtype=float32, numpy=
array([[[0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0

In [13]:


sess = tf.InteractiveSession()
sess.run(docs_one_hot)

RuntimeError: The Session graph is empty.  Add operations to the graph before calling run().