### Import necessary packages and libraries

In [1]:
import numpy as np
import tensorflow as tf
import pickle
import json
from preprocess import *
from prepare import *
from transformer import TransformerModel
from lossacc import masked_loss, masked_accuracy
from nltk.translate.bleu_score import sentence_bleu
import pandas as pd
import os


### Prepare Fox News and NYT data for modeling

1. Clean text
2. Collate data files
3. Build maps (optional)

In [2]:
input_files = ['../data/foxnews_content.json', '../data/nyt_content.json']
collate_file = '../data/nytfox_collate.json'

clean_text = [remove_char_encoding, remove_special_char, make_lowercase]
collate_data(input_files, save_to=collate_file, clean_text=clean_text)

Processed 16385 articles in file ../data/foxnews_content.json
Processed 10560 articles in file ../data/nyt_content.json
Saved to ../data/nytfox_collate.json


### Preprocess collated data

1. Build train-test split
2. Tokenize and vectorize train and test splits
3. Intialize embeddings based on glove 100d

In [3]:
np.random.seed(2470)
train_content, train_title, test_content, test_title = train_test_split(collate_file)
(content_vocab, content_word_index, content_index_word, 
 title_vocab, title_word_index, title_index_word) = vectorize_data(train_content, train_title)

train_content_vec = CONTENT_VECTORIZER(train_content)
train_title_vec = TITLE_VECTORIZER(train_title)
test_content_vec = CONTENT_VECTORIZER(test_content)
test_title_vec = TITLE_VECTORIZER(test_title)

print(train_content_vec.shape, train_title_vec.shape, test_content_vec.shape, test_title_vec.shape)

glove_index = build_glove_embed_index()
title_embedding_init, title_vocab_size = build_embedding_init(title_word_index, glove_index)
content_embedding_init, content_vocab_size = build_embedding_init(content_word_index, glove_index)


2023-05-11 23:19:59.102114: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


(25184, 256) (25184, 16) (1325, 256) (1325, 16)
Unique words in glove: 400003
Hits: 14315; Misses: 685
Hits: 68712; Misses: 21651


### Define architecture

In [4]:
num_layers = 2
num_heads = 8
ff_dim = 256
embedding_size = GLOVE_EMBED_SZ
content_window_size = CONTENT_SEQ_LEN
title_window_size = TITLE_SEQ_LEN
content_embedding_initializer = tf.keras.initializers.Constant(content_embedding_init)
title_embedding_initializer = tf.keras.initializers.Constant(title_embedding_init)
content_embedding_trainability = True
title_embedding_trainability = True
dropout_rate = 0.1

train_title_labels = train_title_vec[:,:,tf.newaxis]
test_title_labels = test_title_vec[:,:,tf.newaxis]

model = TransformerModel(num_layers, num_heads, ff_dim, embedding_size, content_vocab_size, title_vocab_size,
                         content_window_size, title_window_size, content_embedding_initializer, title_embedding_initializer,
                         content_embedding_trainability, title_embedding_trainability, dropout_rate)

model_name = 'modelv2-2blocks-8heads-256ffdim-trainableemb-30ep'
model.compile(optimizer='Adam', loss=masked_loss, metrics=[masked_accuracy])



'metrics=[masked_accuracy]'

### Train
(optional- use only if training new model; either to change architecture or update model weights)

In [5]:
model.fit(x=(train_content_vec, train_title_vec[:,:-1]), y=train_title_labels[:,1:], 
          batch_size=200, epochs=30)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x30a23bbb0>

#### Save model weights

(optional- use only if new model weights need to be saved)

In [6]:
model_weights_path = f'../weights/{model_name}'

def save_model_weights(filepath):
    if os.path.isfile(f'{filepath}.index'):
        confirmation = input('File exists; hit y to override: ')
        
        if confirmation.lower()=='y':
            model.save_weights(filepath)
        else:
            print('Not saving; try saving with different filename')
    else:
        model.save_weights(filepath)

# save_model_weights(model_weights_path)

#### Load model weights 
(optional- use only if testing custom model with different weights and same architecture)

In [7]:
# model_weights_path = f'../weights/{model_name}'
# model.load_weights(f'{model_weights_path}') 

## e.g. model.load_weights('../models/weights/modelv2-2blocks-5heads-256ffdim-trainableemb')



<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x30a30eec0>

### Inference

#### Setup functions for use in inference

In [8]:
def sentence_from_ind(indexes, index_word_dict=title_index_word):
    """Convenience function with no generalization- converts index to word from user defined dictionary"""
    sentence = ""
    for index in indexes:
        sentence += index_word_dict[index]
        sentence += " "
    return sentence


def reverse_bias(content):
    """Convenience function with no generalization- just a hack to reverse the bias"""
    words = content.split()
    view = words[1]
    
    if view=='liberal':
        words[1] = 'conservative'
    else:
        words[1] = 'liberal'
    reverse_bias_content = ' '.join(words)
    return reverse_bias_content, view, words[1]


def text_to_title(content, model=model, output_len=TITLE_SEQ_LEN, 
                  start_token=START_TOKEN, end_token=END_TOKEN):
    """Converts vectorized text to title
    Arguments:
        content - vectorized text"""
    
    start, end = (tf.constant(title_word_index[start_token], dtype=tf.int64), 
                  tf.constant(title_word_index[end_token], dtype=tf.int64))
    
    start = start[tf.newaxis]
    end = end[tf.newaxis]
    
    output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
    output_array = output_array.write(0, start)

    for i in tf.range(output_len):
        output = tf.transpose(output_array.stack())
        predictions = model([content[tf.newaxis], output], training=False)
        
        # Select the last token from the `seq_len` dimension.
        predictions = predictions[:, -1:, :]  # Shape `(batch_size, 1, vocab_size)`.
        predicted_id = tf.argmax(predictions, axis=2)

        # Concatenate the `predicted_id` to the output which is given to the
        # decoder as its input.
        output_array = output_array.write(i+1, predicted_id[0])

        if predicted_id == end:
            break
        
    output = output_array.stack().numpy().reshape(1,-1)
    predicted_title = sentence_from_ind(output[0].tolist())
    return predicted_title



#### Reverse bias

In [9]:
# reverse bias of test file articles to gauge bias in titles; 
# titles are then compared for each political view for the same set of articles

test_reverse_content = []
test_original_view = []
test_reverse_view = []

for content in test_content:
    reverse_bias_content, original_view, reverse_view = reverse_bias(content)
    
    test_reverse_content.append(reverse_bias_content)
    test_original_view.append(original_view)
    test_reverse_view.append(reverse_view)
    
test_reverse_content_vec = CONTENT_VECTORIZER(test_reverse_content)


#### Run inference 

Content conditioned on original labels (i.e. as per source Fox => 'conservative' vs. NYT => 'liberal'), and reversed labels (i.e. opposite to original source Fox => 'liberal' vs. NYT => 'conservative')

In [10]:
# true_titles = []
# predicted_titles_original_bias = []
# predicted_titles_reverse_bias = []
# bleu_score_original_bias = []
# bleu_score_reverse_bias = []

# test_articles_len = len(test_content)

# for index in range(test_articles_len):
#     content_vec, reverse_content_vec, true_title = test_content_vec[index], test_reverse_content_vec[index], test_title[index]
#     predicted_title_original_bias = text_to_title(content_vec)
#     predicted_title_reverse_bias = text_to_title(reverse_content_vec)
    
#     true_titles.append(true_title)
#     predicted_titles_original_bias.append(predicted_title_original_bias)
#     predicted_titles_reverse_bias.append(predicted_title_reverse_bias)
    
#     bleu_score_original_bias.append(sentence_bleu([true_title.split()], predicted_title_original_bias.split(), 
#                                     weights=(1,0,0,0)))
#     bleu_score_reverse_bias.append(sentence_bleu([true_title.split()], predicted_title_reverse_bias.split(), 
#                                     weights=(1,0,0,0)))
#     if (index+1)%100==0:
#         print(f'Completed inference on {index+1} articles')

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Completed inference on 100 articles
Completed inference on 200 articles
Completed inference on 300 articles
Completed inference on 400 articles
Completed inference on 500 articles
Completed inference on 600 articles
Completed inference on 700 articles
Completed inference on 800 articles
Completed inference on 900 articles
Completed inference on 1000 articles
Completed inference on 1100 articles
Completed inference on 1200 articles
Completed inference on 1300 articles


#### Save results to csv

In [11]:
# results_path = f'../results/{model_name}-results.csv'
# df = pd.DataFrame(data=[true_titles, predicted_titles_original_bias, predicted_titles_reverse_bias,
#                         bleu_score_original_bias, bleu_score_reverse_bias,
#                         test_original_view[:test_articles_len], test_reverse_view[:test_articles_len]]).T
# df.columns = ['true_title','predicted_title_original_bias', 'predicted_title_reverse_bias',
#               'bleu_score_original_bias', 'bleu_score_reverse_bias',
#               'original_view', 'reverse_view']
# df['mean_bleu_score'] = (df['bleu_score_original_bias']+df['bleu_score_reverse_bias'])/2
# df.sort_values(by=['mean_bleu_score'],ascending=[False],inplace=True)
# df.to_csv(results_path, index=False)



In [12]:
test_content_vec, test_title_vec[:,:-1]

(<tf.Tensor: shape=(1325, 256), dtype=int64, numpy=
 array([[  23,   40,   25, ...,  249,    8,   24],
        [  23,   40,   25, ...,    2,  296,   24],
        [  23,   40,   25, ..., 3651, 2442,   24],
        ...,
        [  23,   63,   25, ...,  703,    2,   24],
        [  23,   40,   25, ..., 4662,  115,   24],
        [  23,   40,   25, ...,    2,   65,   24]])>,
 <tf.Tensor: shape=(1325, 15), dtype=int64, numpy=
 array([[    2,    14,   166, ...,    11, 10573,     3],
        [    2,  1842,   314, ...,   289,    44,   109],
        [    2,    16,  2976, ...,   622,     3,     0],
        ...,
        [    2,  2857,    22, ...,     0,     0,     0],
        [    2,    94,  1402, ...,    94,   949,    10],
        [    2,   889,   960, ...,  2681,  1918,     3]])>)

In [14]:
# model.fit(x=(train_content_vec, train_title_vec[:,:-1]), y=train_title_labels[:,1:], 
#           batch_size=20, epochs=15)


model.evaluate(x=[test_content_vec, test_title_vec[:,:-1]],y=test_title_vec[:,1:],verbose=2)


InvalidArgumentError: Graph execution error:

Detected at node 'masked_loss/mul' defined at (most recent call last):
    File "/opt/miniconda3/envs/csci2470-project/lib/python3.10/runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/opt/miniconda3/envs/csci2470-project/lib/python3.10/runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "/opt/miniconda3/envs/csci2470-project/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/opt/miniconda3/envs/csci2470-project/lib/python3.10/site-packages/traitlets/config/application.py", line 1041, in launch_instance
      app.start()
    File "/opt/miniconda3/envs/csci2470-project/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 724, in start
      self.io_loop.start()
    File "/opt/miniconda3/envs/csci2470-project/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "/opt/miniconda3/envs/csci2470-project/lib/python3.10/asyncio/base_events.py", line 603, in run_forever
      self._run_once()
    File "/opt/miniconda3/envs/csci2470-project/lib/python3.10/asyncio/base_events.py", line 1899, in _run_once
      handle._run()
    File "/opt/miniconda3/envs/csci2470-project/lib/python3.10/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/opt/miniconda3/envs/csci2470-project/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 512, in dispatch_queue
      await self.process_one()
    File "/opt/miniconda3/envs/csci2470-project/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 501, in process_one
      await dispatch(*args)
    File "/opt/miniconda3/envs/csci2470-project/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 408, in dispatch_shell
      await result
    File "/opt/miniconda3/envs/csci2470-project/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 731, in execute_request
      reply_content = await reply_content
    File "/opt/miniconda3/envs/csci2470-project/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 417, in do_execute
      res = shell.run_cell(
    File "/opt/miniconda3/envs/csci2470-project/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 540, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/opt/miniconda3/envs/csci2470-project/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 2945, in run_cell
      result = self._run_cell(
    File "/opt/miniconda3/envs/csci2470-project/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3000, in _run_cell
      return runner(coro)
    File "/opt/miniconda3/envs/csci2470-project/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/opt/miniconda3/envs/csci2470-project/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3203, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/opt/miniconda3/envs/csci2470-project/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3382, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/opt/miniconda3/envs/csci2470-project/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3442, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/var/folders/hz/mq1t9krj4kv9cxjz4g5c_p8c0000gn/T/ipykernel_23006/1782201265.py", line 5, in <module>
      model.evaluate(x=[test_content_vec, test_title_vec[:,:-1]],y=test_title_vec[:,1:],verbose=2)
    File "/opt/miniconda3/envs/csci2470-project/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/opt/miniconda3/envs/csci2470-project/lib/python3.10/site-packages/keras/engine/training.py", line 2040, in evaluate
      tmp_logs = self.test_function(iterator)
    File "/opt/miniconda3/envs/csci2470-project/lib/python3.10/site-packages/keras/engine/training.py", line 1820, in test_function
      return step_function(self, iterator)
    File "/opt/miniconda3/envs/csci2470-project/lib/python3.10/site-packages/keras/engine/training.py", line 1804, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/opt/miniconda3/envs/csci2470-project/lib/python3.10/site-packages/keras/engine/training.py", line 1792, in run_step
      outputs = model.test_step(data)
    File "/opt/miniconda3/envs/csci2470-project/lib/python3.10/site-packages/keras/engine/training.py", line 1758, in test_step
      self.compute_loss(x, y, y_pred, sample_weight)
    File "/opt/miniconda3/envs/csci2470-project/lib/python3.10/site-packages/keras/engine/training.py", line 1082, in compute_loss
      return self.compiled_loss(
    File "/opt/miniconda3/envs/csci2470-project/lib/python3.10/site-packages/keras/engine/compile_utils.py", line 265, in __call__
      loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/opt/miniconda3/envs/csci2470-project/lib/python3.10/site-packages/keras/losses.py", line 152, in __call__
      losses = call_fn(y_true, y_pred)
    File "/opt/miniconda3/envs/csci2470-project/lib/python3.10/site-packages/keras/losses.py", line 284, in call
      return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/Users/sagarraichandani/Documents/Brown/Academic/csci2470/project/text-to-title/src/lossacc.py", line 9, in masked_loss
      loss *= mask
Node: 'masked_loss/mul'
Incompatible shapes: [32,15] vs. [32,15,1]
	 [[{{node masked_loss/mul}}]] [Op:__inference_test_function_15914]

In [None]:
train_title_vec.shape[1]==test_title_vec.shape[1]