In [1]:
# libraries needed for preprocessing
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import TextVectorization
import pickle
import json

In [2]:
# load the news article data from disk
with open('../data/nytfox_collate_v2.json','rb') as f:
    data = json.load(f)


# separate content and title data into two parallel lists
content_arr = [item['content'] for item in data]
title_arr = [item['title'] for item in data]
num_samples = len(content_arr)

# shuffle the sample indices with a fixed seed so the split is reproducible
np.random.seed(2470)
idx = np.arange(0,num_samples)
np.random.shuffle(idx)

test_split = 0.2
num_test_samples = int(test_split*num_samples)
# Slice with a positive split point: `[:-0]` / `[-0:]` would produce an empty
# train set and put every sample in the test set when num_test_samples is 0.
num_train_samples = num_samples - num_test_samples

# Reorder with plain list indexing. Converting the articles to a NumPy string
# array would pad every element to the length of the longest article.
temp_content_arr = [content_arr[i] for i in idx]
temp_title_arr = [title_arr[i] for i in idx]

# create train and test sets for content and titles
train_content = temp_content_arr[:num_train_samples]
test_content = temp_content_arr[num_train_samples:]

train_title = temp_title_arr[:num_train_samples]
test_title = temp_title_arr[num_train_samples:]

# sanity checks: the split must cover every sample and keep the pairs aligned
assert len(train_content) + len(test_content) == num_samples
assert len(train_content) == len(train_title)
assert len(test_content) == len(test_title)


In [14]:
# training articles whose whitespace-split word count is below 256
check = list(filter(lambda content: len(content.split()) < 256, train_content))
len(check)

1490

In [3]:
# create vectorizers that map words to unique indexes in the vocab
# (TextVectorization comes from the imports cell at the top of the notebook)

# settings shared by the content and the title vectorizer
shared_vectorizer_args = dict(split='whitespace', output_mode='int',
                              standardize='lower_and_strip_punctuation')

content_vectorizer = TextVectorization(max_tokens=100000,
                                       output_sequence_length=256,
                                       **shared_vectorizer_args)

title_vectorizer = TextVectorization(max_tokens=15000,
                                     output_sequence_length=32,
                                     **shared_vectorizer_args)

# the vocabularies are learned from the training split only
train_content_ds = tf.data.Dataset.from_tensor_slices(train_content).batch(128)
train_title_ds = tf.data.Dataset.from_tensor_slices(train_title).batch(128)


content_vectorizer.adapt(train_content_ds)
title_vectorizer.adapt(train_title_ds)

# create dictionaries that map unique words to indexes for both content and title data
content_vocab = content_vectorizer.get_vocabulary()
content_word_index = {word: index for index, word in enumerate(content_vocab)}

title_vocab = title_vectorizer.get_vocabulary()
title_word_index = {word: index for index, word in enumerate(title_vocab)}
# reverse lookup (index -> word) for the title vocabulary
index_to_word = dict(enumerate(title_vocab))


2023-05-01 15:11:50.229755: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


In [4]:
# initialize the glove embedding space
path_to_glove = 'glove.6B/glove.6B.100d.txt'
embeddings_index = {}

# dictionary with word: glove embedding
# Read explicitly as UTF-8: without an encoding, open() uses the platform
# default, which can fail or garble tokens that contain non-ASCII characters.
with open(path_to_glove, encoding='utf-8') as f:
    for line in f:
        # each line is "<word> <coef> <coef> ..."; split off the word only
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [5]:
embedding_size = 100

# Draw the random start and stop token vectors right after seeding so the
# same two vectors are produced on every run (start is drawn first).
np.random.seed(2470)
start_embedding, stop_embedding = (
    np.random.normal(size=embedding_size) for _ in range(2)
)

In [6]:
# function to generate the sinusoidal positional encodings that get added to word embeddings
def positional_encoding(window_size, embedding_size):
    """Build a sinusoidal positional-encoding table.

    The first half of the last axis holds the sines and the second half the
    cosines of ``position / 10000**(k / (embedding_size / 2))``.

    Args:
        window_size: number of positions (rows) to encode.
        embedding_size: width of the encoding; must be even so it splits
            into equal sine and cosine halves.

    Returns:
        A float32 tensor of shape ``(window_size, embedding_size)``.

    Raises:
        ValueError: if ``embedding_size`` is odd. A fractional half-depth
            would otherwise yield a table one column wider than requested.
    """
    if embedding_size % 2 != 0:
        raise ValueError(
            f"embedding_size must be even, got {embedding_size}"
        )

    half_depth = embedding_size // 2
    ## Generate a range of positions and depths
    positions = np.arange(window_size)[:, np.newaxis]               # (seq, 1)
    depths = np.arange(half_depth)[np.newaxis, :] / half_depth      # (1, depth)
    ## Compute range of radians to take the sine and cosine of.
    angle_rates = 1 / (10000**depths)               # (1, depth)
    angle_rads = positions * angle_rates            # (pos, depth)
    pos_encoding = np.concatenate([np.sin(angle_rads), np.cos(angle_rads)], axis=-1)
    ## The caller adds this table to the word embeddings as a per-position offset
    return tf.cast(pos_encoding, dtype=tf.float32)

# function to create embedded matrix for train and test data
def create_embeddings(data, glove, max_emb_len, emb_size, dataset_name,
                      start_emb=None, stop_emb=None):
    """Embed each text as ``[start, word_1, ..., word_k, stop, 0-padding]``.

    Words are obtained with ``str.split()`` and looked up in ``glove``; words
    that are missing from ``glove`` keep an all-zero row. Row 0 always holds
    the start vector, the words follow from row 1, and the stop vector sits
    directly after the last kept word. Texts are truncated to
    ``max_emb_len - 2`` words so that both markers always fit. (Previously the
    start vector overwrote the first word, the stop vector overwrote the word
    in the last row, and texts shorter than the window had no stop vector.)

    Args:
        data: iterable of strings with a ``len()``.
        glove: mapping from word to a vector of length ``emb_size``.
        max_emb_len: number of rows per text, markers included (>= 2).
        emb_size: embedding width.
        dataset_name: label used in the printed shape message.
        start_emb: start-marker vector; defaults to the notebook-level
            ``start_embedding``.
        stop_emb: stop-marker vector; defaults to the notebook-level
            ``stop_embedding``.

    Returns:
        The embeddings plus ``positional_encoding(max_emb_len, emb_size)``,
        with shape ``(len(data), max_emb_len, emb_size)``.

    Raises:
        ValueError: if ``max_emb_len`` is smaller than 2.
    """
    if max_emb_len < 2:
        raise ValueError("max_emb_len must be at least 2 to hold the start and stop rows")
    if start_emb is None:
        start_emb = start_embedding
    if stop_emb is None:
        stop_emb = stop_embedding

    # rows left for real words once start and stop are reserved
    max_words = max_emb_len - 2

    # initialize embedding tensor for data; the positional encoding added
    # below is float32, so float32 storage halves the buffer size
    embedding_tensor = np.zeros(shape=(len(data),max_emb_len,emb_size), dtype=np.float32)

    # embed each article in the data
    for j, article in enumerate(data):
        words = article.split()[:max_words]

        # start embedding is the first row of each article block
        embedding_tensor[j, 0] = start_emb

        # word embeddings start at row 1 so the first word is not lost
        for i, word in enumerate(words, start=1):
            vector = glove.get(word)
            # out-of-vocabulary words keep the pre-filled zero row
            if vector is not None:
                embedding_tensor[j, i] = vector

        # stop embedding directly follows the last kept word
        embedding_tensor[j, len(words) + 1] = stop_emb

    emb_with_pos = embedding_tensor + positional_encoding(max_emb_len,emb_size)

    print(f"Shape of {dataset_name} embedding:", emb_with_pos.shape)

    return emb_with_pos

# function to create token labels for titles
def create_token_labels(title_data,title_vectorizer, dataset):
    """Turn titles into integer token labels with a trailing unit axis.

    Args:
        title_data: sequence of title strings.
        title_vectorizer: callable that maps a batch of strings to an object
            whose ``.numpy()`` gives one row of token ids per title
            (e.g. an adapted ``TextVectorization`` layer).
        dataset: label used in the printed shape message.

    Returns:
        Array of shape ``(len(title_data), tokens_per_title, 1)``.
    """
    # Vectorize all titles in one batched call instead of one call per title.
    token_ids = title_vectorizer(np.array(title_data)).numpy()

    # add the trailing axis expected for the label tensor
    title_labels = np.array(token_ids).reshape(len(title_data),-1,1)

    print(f'Shape of {dataset} token labels:', title_labels.shape)

    return title_labels

In [7]:
# Embed all four splits with the shared GloVe lookup: content blocks use
# 256 rows, title blocks use 32 rows, and every row is 100 wide.
content_len, title_len, glove_dim = 256, 32, 100

train_content_emb = create_embeddings(
    train_content, embeddings_index, content_len, glove_dim, 'train_content')
test_content_emb = create_embeddings(
    test_content, embeddings_index, content_len, glove_dim, 'test_content')

train_title_emb = create_embeddings(
    train_title, embeddings_index, title_len, glove_dim, 'train_title')
test_title_emb = create_embeddings(
    test_title, embeddings_index, title_len, glove_dim, 'test_title')

Shape of train_content embedding: (17292, 256, 100)
Shape of test_content embedding: (4323, 256, 100)
Shape of train_title embedding: (17292, 32, 100)
Shape of test_title embedding: (4323, 32, 100)


In [8]:
# Build the title token labels used by the loss function, train split first.
train_title_labels, test_title_labels = (
    create_token_labels(titles, title_vectorizer, split_name)
    for titles, split_name in ((train_title, 'train'), (test_title, 'test'))
)

Shape of train token labels: (17292, 32, 1)
Shape of test token labels: (4323, 32, 1)


In [9]:
# Write every array to its own pickle file in the working directory,
# in the order listed here.
pickle_targets = (
    ('train_content_embeddings.pkl', train_content_emb),
    ('train_title_embeddings.pkl', train_title_emb),
    ('test_content_embeddings.pkl', test_content_emb),
    ('test_title_embeddings.pkl', test_title_emb),
    ('train_title_labels.pkl', train_title_labels),
    ('test_title_labels.pkl', test_title_labels),
)

for pkl_path, payload in pickle_targets:
    with open(pkl_path, 'wb+') as f:
        pickle.dump(payload, f)
    
