In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import TextVectorization
import pickle
import json

In [None]:
# Load the collated articles. Each record is expected to provide a 'content'
# and a 'title' entry (both are read below).
with open('../data/nytfox_collate_v2.json','rb') as f:
    data = json.load(f)

content_arr = [item['content'] for item in data]
title_arr = [item['title'] for item in data]
num_samples = len(content_arr)

# Seeded permutation of the sample indices so the split is reproducible.
np.random.seed(2470)
idx = np.arange(0,num_samples)
np.random.shuffle(idx)

test_split = 0.2
num_test_samples = int(test_split*num_samples)
# Split on an explicit boundary: slicing with [:-num_test_samples] returns an
# empty list when num_test_samples is 0.
num_train_samples = num_samples - num_test_samples

# Apply the shuffled order. Previously `idx` was computed but never used, so
# the split was just the head/tail of the file in its stored order.
# NOTE: this changes which samples land in train vs. test compared with the
# unshuffled split; regenerate any artefacts derived from the old split.
train_idx = idx[:num_train_samples]
test_idx = idx[num_train_samples:]

# Content and title use the same indices, so the pairs stay aligned.
train_content = [content_arr[i] for i in train_idx]
test_content = [content_arr[i] for i in test_idx]
train_title = [title_arr[i] for i in train_idx]
test_title = [title_arr[i] for i in test_idx]


In [None]:
from tensorflow.keras.layers import TextVectorization

# Article bodies: lower-cased, punctuation stripped, whitespace-tokenised,
# vocabulary capped at 100000 entries, output padded/truncated to 256 ids.
content_vectorizer = TextVectorization(
    max_tokens=100000,
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    output_mode='int',
    output_sequence_length=256,
)

# Headlines: same preprocessing, smaller vocabulary (15000) and 32 ids per title.
title_vectorizer = TextVectorization(
    max_tokens=15000,
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    output_mode='int',
    output_sequence_length=32,
)

# Fit each vocabulary on the training split only, streamed in batches of 128.
train_content_ds = tf.data.Dataset.from_tensor_slices(train_content).batch(128)
content_vectorizer.adapt(train_content_ds)

train_title_ds = tf.data.Dataset.from_tensor_slices(train_title).batch(128)
title_vectorizer.adapt(train_title_ds)

# token -> id lookup for the article vocabulary.
content_vocab = content_vectorizer.get_vocabulary()
content_word_index = {token: token_id for token_id, token in enumerate(content_vocab)}

# token -> id and id -> token lookups for the headline vocabulary.
title_vocab = title_vectorizer.get_vocabulary()
title_word_index = {token: token_id for token_id, token in enumerate(title_vocab)}
title_word_rev_index = dict(enumerate(title_vocab))


In [None]:
# Spot check: display the id sequence the fitted title vectorizer produces
# for a short sample phrase.
title_vectorizer(['biden is here'])

In [None]:
# Spot check: same phrase with a different last word, to compare the
# resulting ids against the previous cell's output.
title_vectorizer(['biden is there'])

In [None]:
# Reverse lookup: show which title-vocabulary token is stored under id 685.
# NOTE(review): the id is presumably taken from a vectorizer output above; it
# depends on the fitted vocabulary, so it may not be stable across data changes.
title_word_rev_index[685]

In [None]:
# Parse the pre-trained GloVe vectors into a {token: float32 vector} dict.
# Each line of the file is a token followed by its space-separated coefficients.
path_to_glove = 'glove.6B/glove.6B.100d.txt'
embeddings_index = {}

# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, which is not UTF-8 everywhere and can fail or mis-decode tokens.
with open(path_to_glove, encoding='utf-8') as f:
    for line in f:
        # Split off the token only; the remainder is the coefficient string.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))


In [None]:
import numpy as np

np.random.seed(2470)
# Take the embedding width from the loaded GloVe vectors. The original read it
# from `title_embedding_matrix`, which is never defined in this notebook and
# therefore fails on a fresh kernel.
embedding_size = next(iter(embeddings_index.values())).shape[0]
# Random vectors standing in for the sequence start / stop markers. They are
# sized from embedding_size (instead of a hard-coded 100) so they always match
# the width of the word vectors they are stored alongside.
start_embedding = np.random.normal(size=embedding_size)
stop_embedding = np.random.normal(size=embedding_size)

In [None]:
# Dense (articles, positions, embedding_size) tensor of word vectors for the
# training articles; positions past the end of an article stay zero.
train_content_seq = 256
train_content_embedding = np.zeros(shape=(len(train_content), train_content_seq, embedding_size))

for row, article in enumerate(train_content):
    # Only the first train_content_seq whitespace tokens are considered.
    tokens = article.split()[:train_content_seq]
    for pos, token in enumerate(tokens):
        if pos == 0:
            # Position 0 holds the start vector in place of the first token.
            vector = start_embedding
        elif pos == train_content_seq - 1:
            # The final slot holds the stop vector (only reached when the
            # article has at least train_content_seq tokens).
            vector = stop_embedding
        else:
            # Tokens missing from the GloVe index map to an all-zero vector.
            vector = embeddings_index.get(token, np.zeros(embedding_size))
        train_content_embedding[row, pos] = vector
train_content_embedding.shape


In [None]:
# Dense (titles, positions, embedding_size) tensor of word vectors for the
# training titles; positions past the end of a title stay zero.
train_title_seq = 32
train_title_embedding = np.zeros(shape=(len(train_title),train_title_seq,embedding_size))

for j,title in enumerate(train_title):
    for i,word in enumerate(title.split()):
        # Only the first train_title_seq whitespace tokens are considered.
        if i==train_title_seq:
            break
        if i==0:
            # Position 0 holds the start vector in place of the first token.
            train_title_embedding[j][i] = start_embedding
        elif i==train_title_seq-1:
            # Fixed: this compared against train_content_seq-1 (255), which can
            # never be reached because the loop stops at train_title_seq (32),
            # so the stop vector was never written for titles.
            train_title_embedding[j][i] = stop_embedding
        else:
            # Lookup uses the raw token as-is; tokens missing from the GloVe
            # index map to an all-zero vector.
            # NOTE(review): no lower-casing/punctuation stripping is applied
            # here, unlike in title_vectorizer -- confirm this is intended.
            train_title_embedding[j][i] = embeddings_index.get(word, np.zeros(embedding_size))
train_title_embedding.shape


In [None]:
# Dense (articles, positions, embedding_size) tensor of word vectors for the
# held-out articles; positions past the end of an article stay zero.
test_content_seq = 256
test_content_embedding = np.zeros(shape=(len(test_content), test_content_seq, embedding_size))

for row, article in enumerate(test_content):
    # Only the first test_content_seq whitespace tokens are considered.
    tokens = article.split()[:test_content_seq]
    for pos, token in enumerate(tokens):
        if pos == 0:
            # Position 0 holds the start vector in place of the first token.
            vector = start_embedding
        elif pos == test_content_seq - 1:
            # The final slot holds the stop vector (only reached when the
            # article has at least test_content_seq tokens).
            vector = stop_embedding
        else:
            # Tokens missing from the GloVe index map to an all-zero vector.
            vector = embeddings_index.get(token, np.zeros(embedding_size))
        test_content_embedding[row, pos] = vector
test_content_embedding.shape


In [None]:
# Dense (titles, positions, embedding_size) tensor of word vectors for the
# held-out titles; positions past the end of a title stay zero.
test_title_seq = 32
test_title_embedding = np.zeros(shape=(len(test_title),test_title_seq,embedding_size))

for j,title in enumerate(test_title):
    for i,word in enumerate(title.split()):
        # Only the first test_title_seq whitespace tokens are considered.
        if i==test_title_seq:
            break
        if i==0:
            # Position 0 holds the start vector in place of the first token.
            test_title_embedding[j][i] = start_embedding
        elif i==test_title_seq-1:
            # Fixed: this compared against test_content_seq-1 (255), which can
            # never be reached because the loop stops at test_title_seq (32),
            # so the stop vector was never written for titles.
            test_title_embedding[j][i] = stop_embedding
        else:
            # Lookup uses the raw token as-is; tokens missing from the GloVe
            # index map to an all-zero vector.
            # NOTE(review): no lower-casing/punctuation stripping is applied
            # here, unlike in title_vectorizer -- confirm this is intended.
            test_title_embedding[j][i] = embeddings_index.get(word, np.zeros(embedding_size))
test_title_embedding.shape


In [None]:
# Integer label ids for every training title, one row per title, with a
# trailing singleton axis added by the reshape.
train_title_labels = []

# Fixed: iterate over the titles themselves. The original looped over
# train_title[0], i.e. the individual characters of the first title, so the
# labels did not correspond to the training titles at all.
for title in train_title:
    train_title_labels.append(title_vectorizer(title).numpy())

train_title_labels = np.array(train_title_labels).reshape(len(train_title),-1,1)

In [None]:
# Integer label ids for every held-out title: vectorize each title on its own,
# stack the rows, then add a trailing singleton axis via the reshape.
test_title_labels = np.array(
    [title_vectorizer(headline).numpy() for headline in test_title]
).reshape(len(test_title), -1, 1)

In [None]:
# Keep the vectorized ids of the first training title as a NumPy array for
# manual inspection.
t = title_vectorizer(train_title[0]).numpy()

In [None]:
# Reverse lookup: show which title-vocabulary token is stored under id 4152.
# NOTE(review): the id is presumably one of the values seen in `t`; it depends
# on the fitted vocabulary, so it may not be stable across data changes.
title_word_rev_index[4152]

In [None]:
import pickle

# Persist every prepared array to its own pickle file in the working
# directory, written in the order listed.
pickle_targets = [
    ('train_content_embeddings.pkl', train_content_embedding),
    ('train_title_embeddings.pkl', train_title_embedding),
    ('test_content_embeddings.pkl', test_content_embedding),
    ('test_title_embeddings.pkl', test_title_embedding),
    ('train_title_labels.pkl', train_title_labels),
    ('test_title_labels.pkl', test_title_labels),
]

for filename, payload in pickle_targets:
    with open(filename, 'wb+') as f:
        pickle.dump(payload, f)

In [None]:
# Display the fitted title vectorizer's configuration. The original cell called
# the layer with no inputs, which raises instead of displaying anything and
# breaks a Restart & Run All.
# NOTE(review): the original intent of this cell is unclear; adjust or delete
# it if a different inspection was meant.
title_vectorizer.get_config()