In [147]:
# libraries needed for preprocessing
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import TextVectorization
import pickle
import json

In [165]:
# upload news article data 
with open('../data/nytfox_collate_v2.json','rb') as f:
    data = json.load(f)


# separate content and title data into separate lists 
content_arr = [item['content'] for item in data]
title_arr = [item['title'] for item in data]
num_samples = len(content_arr)

# shuffle data for test and train sets 
np.random.seed(2470)
idx = np.arange(0,num_samples)
np.random.shuffle(idx)

test_split = 0.2
num_test_samples = int(test_split*num_samples)

# create train and test sets for content and titles 
train_content = content_arr[:-num_test_samples]
test_content = content_arr[-num_test_samples:]
train_title = title_arr[:-num_test_samples]
test_title = title_arr[-num_test_samples:]

import pandas as pd
df = pd.DataFrame()
df["corpus"] = test_content
df["label"] = test_title

df.head(2)

Unnamed: 0,corpus,label
0,internet ad agency cites difficult outlook the...,technology briefing
1,when south koreas president yoon suk yeol touc...,japan hosts south koreas leader in tokyo as si...


In [149]:
train_title[0]

'conservative groups rejoice after house passes major energy package'

In [150]:
from tensorflow.keras.layers import TextVectorization

# create vectorizers that map words to unique indexes in the vocab 
content_vectorizer = TextVectorization(max_tokens=100000, split='whitespace', output_mode='int', 
                                       standardize='lower_and_strip_punctuation',
                                       output_sequence_length=256)

title_vectorizer = TextVectorization(max_tokens=15000, split='whitespace', output_mode='int',
                                     standardize='lower_and_strip_punctuation',
                                     output_sequence_length=32)

train_content_ds = tf.data.Dataset.from_tensor_slices(train_content).batch(128)
train_title_ds = tf.data.Dataset.from_tensor_slices(train_title).batch(128)


content_vectorizer.adapt(train_content_ds)
title_vectorizer.adapt(train_title_ds)

# create dictionaries that map unique words to indexes for both content and title data 
content_vocab = content_vectorizer.get_vocabulary()
content_word_index = dict(zip(content_vocab, range(len(content_vocab))))

title_vocab = title_vectorizer.get_vocabulary()
title_word_index = dict(zip(title_vocab, range(len(title_vocab))))
index_to_word = dict(zip(range(len(title_vocab)),title_vocab))


In [151]:
# initalize the glove embedding space 
path_to_glove = 'glove.6B/glove.6B.100d.txt'
embeddings_index = {}

# dictionary with word: glove embedding 
with open(path_to_glove) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))
print('Data format for word ("the"):',embeddings_index.get('the'))


Found 400000 word vectors.
Data format for word ("the"): [-0.038194 -0.24487   0.72812  -0.39961   0.083172  0.043953 -0.39141
  0.3344   -0.57545   0.087459  0.28787  -0.06731   0.30906  -0.26384
 -0.13231  -0.20757   0.33395  -0.33848  -0.31743  -0.48336   0.1464
 -0.37304   0.34577   0.052041  0.44946  -0.46971   0.02628  -0.54155
 -0.15518  -0.14107  -0.039722  0.28277   0.14393   0.23464  -0.31021
  0.086173  0.20397   0.52624   0.17164  -0.082378 -0.71787  -0.41531
  0.20335  -0.12763   0.41367   0.55187   0.57908  -0.33477  -0.36559
 -0.54857  -0.062892  0.26584   0.30205   0.99775  -0.80481  -3.0243
  0.01254  -0.36942   2.2167    0.72201  -0.24978   0.92136   0.034514
  0.46745   1.1079   -0.19358  -0.074575  0.23353  -0.052062 -0.22044
  0.057162 -0.15806  -0.30798  -0.41625   0.37972   0.15006  -0.53212
 -0.2055   -1.2526    0.071624  0.70565   0.49744  -0.42063   0.26148
 -1.538    -0.30223  -0.073438 -0.28312   0.37104  -0.25217   0.016215
 -0.017099 -0.38984   0.87424  -0

In [152]:
# ASK IF HE USES THIS ##################

# number of possible tokens is content_vocab + start and stop 
num_tokens = len(content_vocab) + 2
embedding_dim = 100
hits = 0
misses = 0

# Prepare embedding matrix
content_embedding_matrix = np.zeros((num_tokens, embedding_dim))

for word, i in content_word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        content_embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 67966 words (32034 misses)


In [153]:
# do the same for titles 
num_tokens = len(title_vocab) + 2
embedding_dim = 100
hits = 0
misses = 0

# Prepare embedding matrix
title_embedding_matrix = np.zeros((num_tokens, embedding_dim))

for word, i in title_word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        title_embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 13790 words (1210 misses)


In [154]:
np.random.seed(2470)
embedding_size = title_embedding_matrix.shape[1]

# create randome vectors for the start and stop tokens 
start_embedding = np.random.normal(size=(embedding_size))
stop_embedding = np.random.normal(size=(embedding_size))

In [155]:
# function to add positional_encodings to word_embeddings
def positional_encoding(window_size, embedding_size):
   
    embedding_size = embedding_size/2
    ## Generate a range of positions and depths 
    positions = np.arange(window_size)[:, np.newaxis]    # (seq, 1)
    depths = np.arange(embedding_size)[np.newaxis, :]/embedding_size  # (1, depth)
    ## Compute range of radians to take the sine and cosine of.
    angle_rates = 1 / (10000**depths)               # (1, depth)
    angle_rads = positions * angle_rates            # (pos, depth)
    pos_encoding = np.concatenate([np.sin(angle_rads), np.cos(angle_rads)], axis=-1) 
    ## This serves as offset for the Positional Encoding
    return tf.cast(pos_encoding, dtype=tf.float32)

# function to create embedded matrix for train and test data 
def create_embeddings(data, glove, max_emb_len, emb_size, dataset_name):
    
    # initalize embedding tensor for data 
    embedding_tensor = np.zeros(shape=(len(data),max_emb_len,emb_size))
    
    # embed each article in the data 
    for j, article in enumerate(data):
        # get embedding for each word in data 
        for i, word in enumerate(article.split()):
            # end embedding at maximum length 
            if i==max_emb_len:
                break
            # add start embedding as first row of each article block 
            # ARE WE MISSING THE FIRST WORD?
            if i==0:
                embedding_tensor[j][i] = start_embedding
            # add stop embedding as last row of each article block 
            elif i==train_content_seq-1:
                embedding_tensor[j][i] = stop_embedding
            # grab the embedding of the word from glove 
            else:
                embedding_tensor[j][i] = glove.get(word, np.zeros(emb_size))
    
    emb_with_pos = embedding_tensor + positional_encoding(max_emb_len,emb_size)
    
    print(f"Shape of {dataset_name} embedding:", emb_with_pos.shape)
    
    return emb_with_pos

# function to create token labels for titles 
def create_token_labels(title_data,title_vectorizer, dataset):
    
    title_labels = []

    for title in title_data:
        title_labels.append(title_vectorizer(title).numpy())

#     title_labels = np.array(title_labels).reshape(len(title_data),-1,1)
    
#     print(f'Shape of {dataset} token labels:', title_labels.shape)
    
    return title_labels 

In [156]:
# use embedding function to create embedding matrix of all datasets 
train_content_emb = create_embeddings(train_content,embeddings_index,256,100,'train_content')
test_content_emb = create_embeddings(test_content,embeddings_index,256,100,'test_content')

train_title_emb = create_embeddings(train_title,embeddings_index,32,100,'train_title')
test_title_emb = create_embeddings(test_title,embeddings_index,32,100,'test_title')

Shape of train_content embedding: (17292, 256, 100)
Shape of test_content embedding: (4323, 256, 100)
Shape of train_title embedding: (17292, 32, 100)
Shape of test_title embedding: (4323, 32, 100)


In [161]:
# use token function to create title labels for loss function
print(train_title[0])
train_title_labels = create_token_labels(train_title,title_vectorizer,'train')
print(train_title_labels[0])
# test_title_labels = create_token_labels(test_title,title_vectorizer,'test')

conservative groups rejoice after house passes major energy package



KeyboardInterrupt



In [162]:
train_title_labels[0]

array([1207,  326, 4152,   14,   32,  876,  239,   68, 1443,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0])

In [164]:
this = train_title_labels[0].reshape((32,))

s = ''
for tok in this:
    s += index_to_word[tok]
    s += ' '

s

'conservative groups rejoice after house passes major energy package                        '

In [143]:
train_title[0]

'conservative groups rejoice after house passes major energy package'

In [144]:
this = test_title_labels[0].reshape((32,))
s = ''
for tok in this:
    s += index_to_word[tok]
    s += ' '

s
    

'technology briefing                               '

In [100]:
# save data as pickle file 
with open('train_content_embeddings.pkl','wb+') as f:
    pickle.dump(train_content_emb, f)

with open('train_title_embeddings.pkl','wb+') as f:
    pickle.dump(train_title_emb, f)
    
with open('test_content_embeddings.pkl', 'wb+') as f:
    pickle.dump(test_content_emb, f)

with open('test_title_embeddings.pkl','wb+') as f:
    pickle.dump(test_title_emb, f)
    
with open('train_title_labels.pkl','wb+') as f:
    pickle.dump(train_title_labels, f)
    
with open('test_title_labels.pkl','wb+') as f:
    pickle.dump(test_title_labels, f)
    


In [102]:
with open('index_to_word.pkl','wb+') as f:
    pickle.dump(index_to_word, f)

In [103]:
# CAN DELETE ???

In [104]:
# max length of content for embeddings
train_content_seq = 256
train_content_embedding = np.zeros(shape=(len(train_content),train_content_seq,embedding_size))

# embedd tokens in each article up until 256 tokens 
for j,article in enumerate(train_content):
    for i,word in enumerate(article.split()):
        # end embedding at maximum length 
        if i==train_content_seq:
            break
        # add start embedding as first row of each article block 
        if i==0:
            train_content_embedding[j][i] = start_embedding
        # add stop embedding as last row of each article block 
        elif i==train_content_seq-1:
            train_content_embedding[j][i] = stop_embedding
        # grab the embedding of the word from glove 
        else:
            train_content_embedding[j][i] = embeddings_index.get(word, np.zeros(embedding_size))
print('Shape of train content embeddings:',train_content_embedding.shape)


Shape of train content embeddings: (17292, 256, 100)


In [106]:
# get embeddings for 
train_title_seq = 32
train_title_embedding = np.zeros(shape=(len(train_title),train_title_seq,embedding_size))

for j,title in enumerate(train_title):
    for i,word in enumerate(title.split()):
        if i==train_title_seq:
            break
        if i==0:
            train_title_embedding[j][i] = start_embedding
        elif i==train_content_seq-1:
            train_title_embedding[j][i] = stop_embedding
        else:
            train_title_embedding[j][i] = embeddings_index.get(word, np.zeros(embedding_size))
train_title_embedding.shape


(17292, 32, 100)

In [108]:
test_content_seq = 256
test_content_embedding = np.zeros(shape=(len(test_content),test_content_seq,embedding_size))

for j,article in enumerate(test_content):
    for i,word in enumerate(article.split()):
        if i==test_content_seq:
            break
        if i==0:
            test_content_embedding[j][i] = start_embedding
        elif i==test_content_seq-1:
            test_content_embedding[j][i] = stop_embedding
        else:
            test_content_embedding[j][i] = embeddings_index.get(word, np.zeros(embedding_size))
test_content_embedding.shape


(4323, 256, 100)

In [109]:
test_title_seq = 32
test_title_embedding = np.zeros(shape=(len(test_title),test_title_seq,embedding_size))

for j,title in enumerate(test_title):
    for i,word in enumerate(title.split()):
        if i==test_title_seq:
            break
        if i==0:
            test_title_embedding[j][i] = start_embedding
        elif i==test_content_seq-1:
            test_title_embedding[j][i] = stop_embedding
        else:
            test_title_embedding[j][i] = embeddings_index.get(word, np.zeros(embedding_size))
test_title_embedding.shape


(4323, 32, 100)

In [110]:
test_title_labels = []

for title in test_title:
    test_title_labels.append(title_vectorizer(title).numpy())
test_title_labels = np.array(test_title_labels).reshape(len(test_title),-1,1)