In [1]:
import numpy as np
import pandas as pd
%matplotlib inline

In [2]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

Using TensorFlow backend.


In [5]:
# locations of the two dataset splits, relative to this notebook
train_input_dir, eval_input_dir = (
    '../../../challenges_data/kaggle/20_newsgroups_ciphertext_challenge/20news-bydate-train/',
    '../../../challenges_data/kaggle/20_newsgroups_ciphertext_challenge/20news-bydate-test/',
)

# preprocessing limits:
#   vocab_size  -> handed to the tokenizer as num_words
#   max_doc_len -> length every document sequence is padded/truncated to
vocab_size, max_doc_len = 10000, 1000

In [6]:
import os
# each folder contains documents about the same topic. We'll use folder names as classes
# (union of both splits, so a class that is present in only one of them still gets an index)
class_names = {name for name in os.listdir(train_input_dir) if os.path.isdir(os.path.join(train_input_dir, name))}
class_names |= {name for name in os.listdir(eval_input_dir) if os.path.isdir(os.path.join(eval_input_dir, name))}

# map class to indexes and vice versa
# Iterating a set of strings directly gives an order that can change between
# interpreter runs (string hash randomisation), which would silently reshuffle
# the label indices after every kernel restart. Sorting pins the mapping.
class_to_idx = {name: idx for idx, name in enumerate(sorted(class_names))}
idx_to_class = {idx: name for name, idx in class_to_idx.items()}

In [7]:
# per-split accumulators: file name (example id) and class label of every example
train_file_names, train_classes = [], []
eval_file_names, eval_classes = [], []

# raw document contents of both splits, in the order they are read
texts = []

In [8]:
# read train files
# NOTE: this cell appends to train_file_names / train_classes / texts, which are
# created in an earlier cell; re-run that cell first before re-running this one,
# otherwise every training document is added twice.
for dirname, subdirs, filenames in os.walk(train_input_dir):
    # os.walk yields entries in filesystem order; sorting in place (allowed for
    # top-down walks) makes the example order reproducible across machines
    subdirs.sort()
    for filename in sorted(filenames):
        train_file_names.append(filename)
        with open(os.path.join(dirname, filename), 'r', encoding='latin-1') as text_file:
            texts.append(text_file.read())
        # the class is the name of the folder holding the file; basename works
        # with whatever path separator the OS uses, unlike splitting on '/'
        train_classes.append(os.path.basename(dirname))

In [9]:
# read eval files
# NOTE: this cell appends to eval_file_names / eval_classes / texts, which are
# created in an earlier cell; re-run that cell (and the train-reading cell)
# first before re-running this one, otherwise documents are added twice and
# `texts` no longer holds the train documents followed by the eval documents.
for dirname, subdirs, filenames in os.walk(eval_input_dir):
    # os.walk yields entries in filesystem order; sorting in place (allowed for
    # top-down walks) makes the example order reproducible across machines
    subdirs.sort()
    for filename in sorted(filenames):
        eval_file_names.append(filename)
        with open(os.path.join(dirname, filename), 'r', encoding='latin-1') as text_file:
            texts.append(text_file.read())
        # the class is the name of the folder holding the file; basename works
        # with whatever path separator the OS uses, unlike splitting on '/'
        eval_classes.append(os.path.basename(dirname))

In [10]:
# The tokenizer replaces each of the most frequently occurring words with a
# corresponding integer index. It is fitted on every document in `texts`
# (train and eval documents together, since both loops append to that list).
# num_words caps the vocabulary used when texts are later turned into
# sequences; per the Keras docs only the num_words - 1 most common words are
# kept, and rarer words are dropped.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(texts)

In [11]:
# `texts` holds the train documents first, followed by the eval documents, so
# the number of train files is the split point. Each slice is converted from
# raw text into a list of word indices.
train_ex = tokenizer.texts_to_sequences(texts[:len(train_file_names)])
eval_ex = tokenizer.texts_to_sequences(texts[len(train_file_names):])

# bring every sequence to exactly max_doc_len entries: shorter documents are
# padded at the end, longer ones lose their leading tokens
train_X, eval_X = (
    pad_sequences(sequences, maxlen=max_doc_len, padding='post', truncating='pre')
    for sequences in (train_ex, eval_ex)
)

In [12]:
# transform classes from text to integral indices
train_y = [class_to_idx[cl] for cl in train_classes]
eval_y = [class_to_idx[cl] for cl in eval_classes]

# encode the class indices into one-hot vectors
# num_classes is given explicitly: without it to_categorical infers the width
# from the largest index present, so a split that lacks the highest-index class
# would get a narrower matrix than the other split.
train_y = to_categorical(train_y, num_classes=len(class_to_idx))
eval_y = to_categorical(eval_y, num_classes=len(class_to_idx))

In [None]:
# read and parse the GloVe file 
def read_glove_vecs(glove_file):
    """Parse a GloVe text file into lookup tables.

    Each non-blank line is expected to hold a token followed by the
    whitespace-separated components of its vector.

    Parameters
    ----------
    glove_file : str
        Path of the GloVe text file.

    Returns
    -------
    words_to_index : dict
        Token -> integer index. Indices start at 1 and follow the sorted
        order of the tokens.
    index_to_words : dict
        Inverse of ``words_to_index``.
    word_to_vec_map : dict
        Token -> ``np.float64`` array with the vector components. If a token
        occurs on several lines, the last occurrence wins.

    Raises
    ------
    ValueError
        If a vector component cannot be converted to a float.
    """
    word_to_vec_map = {}
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # encoding, which fails on non-ASCII tokens on non-UTF-8 locales.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # blank line (e.g. a trailing newline at the end of the file)
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)

    words_to_index = {}
    index_to_words = {}
    # start at 1 so that index 0 is never assigned to a word
    for i, w in enumerate(sorted(word_to_vec_map), start=1):
        words_to_index[w] = i
        index_to_words[i] = w
    return words_to_index, index_to_words, word_to_vec_map

In [None]:
# peek at the first ten rows of the one-hot encoded training labels
train_y[:10]