### Import necessary packages and libraries

In [1]:
import numpy as np
import tensorflow as tf
import pickle
import json
from preprocess import *
from prepare import *
from transformer import TransformerModel
from lossacc import masked_loss, masked_accuracy
from nltk.translate.bleu_score import sentence_bleu
import pandas as pd
import os


### Prepare Fox News and NYT data for modeling

1. Clean text
2. Collate data files
3. Build maps (optional)

In [2]:
input_files = ['../data/foxnews_content.json', '../data/nyt_content.json']
collate_file = '../data/nytfox_collate.json'

clean_text = [remove_char_encoding, remove_special_char, make_lowercase]
collate_data(input_files, save_to=collate_file, clean_text=clean_text)

Processed 16385 articles in file ../data/foxnews_content.json
Processed 10560 articles in file ../data/nyt_content.json
Saved to ../data/nytfox_collate.json


### Preprocess collated data

1. Build train-test split
2. Tokenize and vectorize train and test splits
3. Intialize embeddings based on glove 100d

In [3]:
np.random.seed(2470)
train_content, train_title, test_content, test_title = train_test_split(collate_file)
(content_vocab, content_word_index, content_index_word, 
 title_vocab, title_word_index, title_index_word) = vectorize_data(train_content, train_title)

train_content_vec = CONTENT_VECTORIZER(train_content)
train_title_vec = TITLE_VECTORIZER(train_title)
test_content_vec = CONTENT_VECTORIZER(test_content)
test_title_vec = TITLE_VECTORIZER(test_title)

print(train_content_vec.shape, train_title_vec.shape, test_content_vec.shape, test_title_vec.shape)

glove_index = build_glove_embed_index()
title_embedding_init, title_vocab_size = build_embedding_init(title_word_index, glove_index)
content_embedding_init, content_vocab_size = build_embedding_init(content_word_index, glove_index)


2023-05-11 18:26:00.625956: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


(25184, 256) (25184, 16) (1325, 256) (1325, 16)
Unique words in glove: 400003
Hits: 14315; Misses: 685
Hits: 68712; Misses: 21651


### Define architecture

In [4]:
num_layers = 2
num_heads = 8
ff_dim = 256
embedding_size = GLOVE_EMBED_SZ
content_window_size = CONTENT_SEQ_LEN
title_window_size = TITLE_SEQ_LEN
content_embedding_initializer = tf.keras.initializers.Constant(content_embedding_init)
title_embedding_initializer = tf.keras.initializers.Constant(title_embedding_init)
content_embedding_trainability = True
title_embedding_trainability = True
dropout_rate = 0.1

train_title_labels = train_title_vec[:,:,tf.newaxis]
test_title_labels = test_title_vec[:,:,tf.newaxis]

model = TransformerModel(num_layers, num_heads, ff_dim, embedding_size, content_vocab_size, title_vocab_size,
                         content_window_size, title_window_size, content_embedding_initializer, title_embedding_initializer,
                         content_embedding_trainability, title_embedding_trainability, dropout_rate)

model_name = 'modelv2-2blocks-8heads-256ffdim-trainableemb-15ep'
model.compile(optimizer='Adam', loss=masked_loss, metrics=[masked_accuracy])



### Train
(optional- use only if training new model; either to change architecture or update model weights)

In [5]:
# model.fit(x=(train_content_vec, train_title_vec[:,:-1]), y=train_title_labels[:,1:], 
#           batch_size=20, epochs=15)


#### Save model weights

(optional- use only if new model weights need to be saved)

In [6]:
model_weights_path = f'../weights/{model_name}'

def save_model_weights(filepath):
    if os.path.isfile(f'{filepath}.index'):
        confirmation = input('File exists; hit y to override: ')
        
        if confirmation.lower()=='y':
            model.save_weights(filepath)
        else:
            print('Not saving; try saving with different filename')
    else:
        model.save_weights(filepath)

# save_model_weights(model_weights_path)

#### Load model weights 
(optional- use only if testing custom model with different weights and same architecture)

In [7]:
model_weights_path = f'../weights/{model_name}'
model.load_weights(f'{model_weights_path}') 

## e.g. model.load_weights('../models/weights/modelv2-2blocks-5heads-256ffdim-trainableemb')



<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x2cbe7f010>

### Inference

#### Setup functions for use in inference

In [8]:
def sentence_from_ind(indexes, index_word_dict=title_index_word):
    """Convenience function with no generalization- converts index to word from user defined dictionary"""
    sentence = ""
    for index in indexes:
        sentence += index_word_dict[index]
        sentence += " "
    return sentence


def reverse_bias(content):
    """Convenience function with no generalization- just a hack to reverse the bias"""
    words = content.split()
    view = words[1]
    
    if view=='liberal':
        words[1] = 'conservative'
    else:
        words[1] = 'liberal'
    reverse_bias_content = ' '.join(words)
    return reverse_bias_content, view, words[1]


def text_to_title(content, model=model, output_len=TITLE_SEQ_LEN, 
                  start_token=START_TOKEN, end_token=END_TOKEN):
    """Converts vectorized text to title
    Arguments:
        content - vectorized text"""
    
    start, end = (tf.constant(title_word_index[start_token], dtype=tf.int64), 
                  tf.constant(title_word_index[end_token], dtype=tf.int64))
    
    start = start[tf.newaxis]
    end = end[tf.newaxis]
    
    output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
    output_array = output_array.write(0, start)

    for i in tf.range(output_len):
        output = tf.transpose(output_array.stack())
        predictions = model([content[tf.newaxis], output], training=False)
        
        # Select the last token from the `seq_len` dimension.
        predictions = predictions[:, -1:, :]  # Shape `(batch_size, 1, vocab_size)`.
        predicted_id = tf.argmax(predictions, axis=2)

        # Concatenate the `predicted_id` to the output which is given to the
        # decoder as its input.
        output_array = output_array.write(i+1, predicted_id[0])

        if predicted_id == end:
            break
        
    output = output_array.stack().numpy().reshape(1,-1)
    predicted_title = sentence_from_ind(output[0].tolist())
    return predicted_title



#### Reverse bias

In [9]:
# reverse bias of test file articles to gauge bias in titles; 
# titles are then compared for each political view for the same set of articles

test_reverse_content = []
test_original_view = []
test_reverse_view = []

for content in test_content:
    reverse_bias_content, original_view, reverse_view = reverse_bias(content)
    
    test_reverse_content.append(reverse_bias_content)
    test_original_view.append(original_view)
    test_reverse_view.append(reverse_view)
    
test_reverse_content_vec = CONTENT_VECTORIZER(test_reverse_content)


#### Run inference 

Content conditioned on original labels (i.e. as per source Fox => 'conservative' vs. NYT => 'liberal'), and reversed labels (i.e. opposite to original source Fox => 'liberal' vs. NYT => 'conservative')

In [None]:
true_titles = []
predicted_titles_original_bias = []
predicted_titles_reverse_bias = []
bleu_score_original_bias = []
bleu_score_reverse_bias = []

test_articles_len = len(test_content)

for index in range(test_articles_len):
    content_vec, reverse_content_vec, true_title = test_content_vec[index], test_reverse_content_vec[index], test_title[index]
    predicted_title_original_bias = text_to_title(content_vec)
    predicted_title_reverse_bias = text_to_title(reverse_content_vec)
    
    true_titles.append(true_title)
    predicted_titles_original_bias.append(predicted_title_original_bias)
    predicted_titles_reverse_bias.append(predicted_title_reverse_bias)
    
    bleu_score_original_bias.append(sentence_bleu([true_title.split()], predicted_title_original_bias.split(), 
                                    weights=(1,0,0,0)))
    bleu_score_reverse_bias.append(sentence_bleu([true_title.split()], predicted_title_reverse_bias.split(), 
                                    weights=(1,0,0,0)))
    if (index+1)%100==0:
        print(f'Completed inference on {index+1} articles')

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Completed inference on 100 articles
Completed inference on 200 articles
Completed inference on 300 articles
Completed inference on 400 articles
Completed inference on 500 articles
Completed inference on 600 articles
Completed inference on 700 articles
Completed inference on 800 articles


#### Save results to csv

In [None]:
results_path = f'../results/{model_name}-results.csv'
df = pd.DataFrame(data=[true_titles, predicted_titles_original_bias, predicted_titles_reverse_bias,
                        bleu_score_original_bias, bleu_score_reverse_bias,
                        test_original_view[:test_articles_len], test_reverse_view[:test_articles_len]]).T
df.columns = ['true_title','predicted_title_original_bias', 'predicted_title_reverse_bias',
              'bleu_score_original_bias', 'bleu_score_reverse_bias',
              'original_view', 'reverse_view']
df['mean_bleu_score'] = (df['bleu_score_original_bias']+df['bleu_score_reverse_bias'])/2
df.sort_values(by=['mean_bleu_score'],ascending=[False],inplace=True)
df.to_csv(results_path, index=False)

