In [14]:
from gensim.models import Word2Vec, KeyedVectors
from tensorflow.keras.preprocessing import text_dataset_from_directory
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.optimizers import SGD, Adam
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Sequential
import numpy as np
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
import utils

In [2]:
# Load the pre-trained vectors from a plain-text (binary=False) file that has no
# word2vec header line (no_header=True). Despite its name, `model` is the
# KeyedVectors object returned by the loader, not a trainable network.
model = KeyedVectors.load_word2vec_format(
    'data/glove.6B.300d.txt',
    binary=False,
    no_header=True,
)

In [5]:
import os
import sys

TEXT_DATA_DIR = 'data/20_newsgroup'

# Walk TEXT_DATA_DIR: every sub-directory is one class, and every file inside it
# whose name consists only of digits is one text sample.
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if not os.path.isdir(path):
        continue
    # Ids are handed out in sorted directory order: 0, 1, 2, ...
    label_id = len(labels_index)
    labels_index[name] = label_id
    for fname in sorted(os.listdir(path)):
        if not fname.isdigit():
            continue
        fpath = os.path.join(path, fname)
        # latin-1 maps every byte to a code point, so decoding cannot fail.
        # The context manager closes the file even when read() raises.
        with open(fpath, encoding='latin-1') as f:
            t = f.read()
        # The header is dropped only when a blank line is found past position 0;
        # the kept text still starts with that blank line.
        i = t.find('\n\n')  # skip header
        if 0 < i:
            t = t[i:]
        texts.append(t)
        labels.append(label_id)

print('Found %s texts.' % len(texts))

Found 19997 texts.


In [10]:
# Fit the tokenizer vocabulary on the whole corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# word -> integer id mapping learned by fit_on_texts.
word_index = tokenizer.word_index

# Encode every text as a list of integer word ids.
sequences = tokenizer.texts_to_sequences(texts)

print('Found %s unique tokens.' % len(word_index))

Found 174074 unique tokens.


In [12]:
# Length, in tokens, of the longest encoded text. Every sample is later padded to
# this length, so a single very long document sets the width of the whole data
# tensor (39726 columns according to the recorded output of the padding cell).
MAX_SEQUENCE_LENGTH = max(len(seq) for seq in sequences)

In [15]:
VALIDATION_SPLIT = 0.25  # fraction of samples held out for validation
SPLIT_SEED = 42  # fixed seed so the train/validation split is reproducible

# Bring every encoded text to exactly MAX_SEQUENCE_LENGTH entries.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# NOTE: `labels` is rebound here from the list of ids to the one-hot matrix, so
# this cell is meant to run once per kernel; re-run the loading cell before
# running it again.
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
# A private RandomState keeps the shuffle deterministic without touching the
# global NumPy random state.
np.random.RandomState(SPLIT_SEED).shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Slice with a non-negative boundary: with negative slicing a validation count
# of 0 would turn `[:-0]` into an empty training set and `[-0:]` into a
# validation set holding every sample.
split_at = data.shape[0] - nb_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

Shape of data tensor: (19997, 39726)
Shape of label tensor: (19997, 20)


In [17]:
# Create the embedding layer from the loaded vectors, the fitted tokenizer and the
# padded sequence length.
# NOTE(review): make_embedding_layer is neither defined nor imported in the visible
# cells (only `import utils` is) — presumably it lives in utils; confirm, otherwise
# this raises NameError on a fresh kernel.
embedding_layer = make_embedding_layer(model, tokenizer, MAX_SEQUENCE_LENGTH)

In [21]:
# Push only the first two training samples through the embedding layer; the
# following cells use this small batch to inspect layer output shapes.
em_out = embedding_layer(x_train[:2])

In [22]:
em_out.shape

TensorShape([2, 39726, 300])

In [23]:
# First convolution: 128 filters, kernel size 5, ReLU. The recorded shapes show the
# sequence axis shrinking from 39726 to 39722 (kernel_size - 1 steps lost).
conv1_out = Conv1D(128, 5, activation='relu')(em_out)

In [24]:
conv1_out.shape

TensorShape([2, 39722, 128])

In [25]:
# Max-pool with pool size 5: recorded shapes go from 39722 to 7944 steps (39722 // 5).
maxp1_out = MaxPooling1D(5)(conv1_out)

In [26]:
maxp1_out.shape

TensorShape([2, 7944, 128])

In [28]:
# Second convolution, same configuration as the first: 7944 -> 7940 steps per the
# recorded shapes.
conv2_out = Conv1D(128, 5, activation='relu')(maxp1_out)

In [29]:
conv2_out.shape

TensorShape([2, 7940, 128])

In [30]:
# Max-pool with pool size 5: recorded shapes go from 7940 to 1588 steps (7940 // 5).
maxp2_out = MaxPooling1D(5)(conv2_out)

In [31]:
maxp2_out.shape

TensorShape([2, 1588, 128])

In [33]:
# Third convolution, same configuration again: 1588 -> 1584 steps per the recorded
# shapes.
conv3_out = Conv1D(128, 5, activation='relu')(maxp2_out)

In [34]:
conv3_out.shape

TensorShape([2, 1584, 128])

In [35]:
# NOTE(review): a pool size of 35 is NOT global pooling for this input. With 1584
# steps coming in, the recorded output still has 45 steps (1584 // 35). It would
# only be global if the incoming sequence axis had at most 35 steps.
maxp3_out = MaxPooling1D(35)(conv3_out)  # pool size 35; leaves 45 steps here, not global

In [36]:
maxp3_out.shape

TensorShape([2, 45, 128])

In [38]:
# Collapse the (steps, channels) axes into one feature axis: the recorded shapes go
# from (2, 45, 128) to (2, 5760), i.e. 45 * 128 features per sample.
flat_out = Flatten()(maxp3_out)

In [39]:
flat_out.shape

TensorShape([2, 5760])

In [40]:
# Import from tensorflow.keras, the same package every other layer in this notebook
# is imported from, instead of the standalone `keras` distribution.
from tensorflow.keras.layers import GlobalAveragePooling1D

In [41]:
# Reduce the embedded batch over its sequence axis: the recorded shapes go from
# (2, 39726, 300) to (2, 300), one 300-d vector per sample.
# NOTE(review): if the embedding layer does not emit a mask, the padding positions
# are included in the average — confirm against make_embedding_layer.
globav = GlobalAveragePooling1D()(em_out)

In [42]:
globav.shape

TensorShape([2, 300])