In [4]:
import tensorflow as tf
import numpy as np
import os
import re
import csv
from IPython import display as ipythondisplay
from tqdm import tqdm
import nltk
from gensim.models import Word2Vec

nltk.download('stopwords')
from nltk.corpus import stopwords

print(tf.__version__)

2.7.0


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\evely\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
STOPWORDS = set(stopwords.words('english'))

In [6]:
articles = []
labels = []
label_dict = {}
reverse_label_dict = {}

label_ind = 0
with open("bbc-text.csv", 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)
    for row in reader:
        if row[0] not in label_dict:
            label_dict[row[0]] = label_ind
            reverse_label_dict[label_ind] = row[0]
            label_ind += 1

        labels.append(label_dict[row[0]])
        article = row[1]
        for word in STOPWORDS:
            token = ' ' + word + ' '
            article = article.replace(token, ' ')
            article = article.replace(' ', ' ')
        articles.append([word for word in re.split(',|_|-|\!| |\.|\(|\)|\:|\?', article) if word != ''])

a = np.array(labels)
b = np.zeros((a.size, a.max()+1))
b[np.arange(a.size),a] = 1
labels = b

print(len(labels), labels[:10])
print(len(articles), articles[0])

2225 [[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]]
2225 ['tv', 'future', 'hands', 'viewers', 'home', 'theatre', 'systems', 'plasma', 'high', 'definition', 'tvs', 'digital', 'video', 'recorders', 'moving', 'living', 'room', 'way', 'people', 'watch', 'tv', 'radically', 'different', 'five', 'years', 'time', 'according', 'expert', 'panel', 'gathered', 'annual', 'consumer', 'electronics', 'show', 'las', 'vegas', 'discuss', 'new', 'technologies', 'impact', 'one', 'favourite', 'pastimes', 'us', 'leading', 'trend', 'programmes', 'content', 'delivered', 'viewers', 'via', 'home', 'networks', 'cable', 'satellite', 'telecoms', 'companies', 'broadband', 'service', 'providers', 'front', 'rooms', 'portable', 'devices', 'one', 'talked', 'about', 'technologies', 'ces', 'digital', 'personal', 'video', 'recorders', 'dvr', 'pvr', 'set', 'top', 'boxes', 'like', 'us', 'tivo'

In [7]:
model = Word2Vec(articles, min_count=1)
print(model)
words = list(model.wv.key_to_index.keys())
# print(words)
# print(model.wv['mr'])
# save model
print(model.vector_size)
model.save('model.bin')
# load model
# new_model = Word2Vec.load('model.bin')

Word2Vec(vocab=30495, vector_size=100, alpha=0.025)
100


In [8]:
vocab_size = 5000
embedding_dim = 64
max_length = 200
padding_value = np.zeros(model.vector_size)
training_portion = .8

In [10]:
train_size = int(len(articles) * training_portion)

train_articles = articles[0: train_size]
train_sequences = [[model.wv[word] for word in article] for article in train_articles]
train_labels = labels[0: train_size]

validation_articles = articles[train_size:]
validation_sequences = [[model.wv[word] for word in article] for article in validation_articles]
validation_labels = labels[train_size:]

In [44]:
def pad(x, maxlen, value):
    new_x = np.array([np.array([value for w in range(maxlen)]) for a in x])
    for i in range(len(x)):
        if len(x[i]) <= maxlen:
            new_x[i, :len(x[i])] = np.array(x[i])
        else:
            new_x[i] = np.array(x[i][:maxlen])
    return new_x

In [48]:
train_padded = pad(train_sequences, maxlen=max_length, value=padding_value)
validation_padded = pad(validation_sequences, maxlen=max_length, value=padding_value)

(1780, 200, 100)


In [22]:
def get_batch(articles, labels, batch_size):
    idx = np.random.choice(len(articles), batch_size)

    x_batch = np.array([articles[i] for i in idx])
    y_batch = np.array([labels[i] for i in idx])
  
    return x_batch, y_batch

In [16]:
def build_model(rnn_units, dense_units, output_units):
    model = tf.keras.Sequential([
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(rnn_units)),
        tf.keras.layers.Dense(dense_units, activation='relu'),
        tf.keras.layers.Dense(output_units, activation='softmax')
    ])

    return model

In [17]:
def compute_loss(labels, logits):
  loss = tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True) # TODO
  return loss

In [18]:
# Optimization parameters:
num_training_iterations = 2000  # Increase this to train longer
batch_size = 4  # Experiment between 1 and 64
seq_length = 100  # Experiment between 50 and 500
learning_rate = 5e-3  # Experiment between 1e-5 and 1e-1

# Model parameters
rnn_units = 1024
dense_units = 256
embedding_dim = 256
num_categories = 10

# Checkpoint location: 
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "my_ckpt")

In [23]:
model = build_model(rnn_units=rnn_units, dense_units=dense_units, output_units=num_categories)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

@tf.function
def train_step(x, y): 
  with tf.GradientTape() as tape:
    y_hat = model(x)
    loss = compute_loss(y, y_hat)

  grads = tape.gradient(loss, model.trainable_variables)
  
  optimizer.apply_gradients(zip(grads, model.trainable_variables))
  return loss

In [None]:
history = []
plotter = mdl.util.PeriodicPlotter(sec=2, xlabel='Iterations', ylabel='Loss')
if hasattr(tqdm, '_instances'): tqdm._instances.clear() # clear if it exists

for iter in tqdm(range(num_training_iterations)):

  # Grab a batch and propagate it through the network
  x_batch, y_batch = get_batch(sentences, batch_size)
  loss = train_step(x_batch, y_batch)

  # Update the progress bar
  history.append(loss.numpy().mean())
  plotter.plot(history)

  # Update the model with the changed weights!
  if iter % 100 == 0:     
    model.save_weights(checkpoint_prefix)
    
# Save the trained model and the weights
model.save_weights(checkpoint_prefix)