This notebook generates text from triples using a trained T5 model.

In [1]:
# ML
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# NLP
import spacy 

# Utils 
import os 
import random
from pprint import pprint
from tqdm import tqdm


# Load Data

In [2]:
##########################################################################################################################################
# HELPER FUNCTIONS
##########################################################################################################################################

def load_articles(PATH):
    """
    Read every '.txt' file directly inside PATH.

    Returns a list of strings, one per article, ordered by sorted file name.
    """
    news_list = []
    for name in sorted(os.listdir(PATH)):
        # anything that is not a plain-text article is ignored
        if not name.endswith('.txt'):
            continue
        with open(os.path.join(PATH, name), 'r') as handle:
            news_list.append(handle.read())
    return news_list

def load_triples(PATH, VERBOSE = True):
    """
    Load the triples stored in every '.txt' file directly inside PATH.

    Each file is expected to hold one triple per line. Lines are stripped of
    surrounding whitespace, and blank lines are skipped so that an empty file
    yields an empty list rather than a list containing an empty string.

    Input:
        PATH: directory containing the triple files
        VERBOSE: when True (default), print the sorted list of file names read
    Returns: nested list [[triple, triple, ...], ...] with one inner list per
    file, ordered by sorted file name.
    """
    triple_files = sorted(f for f in os.listdir(PATH) if f.endswith('.txt'))
    if VERBOSE:
        print(triple_files)

    triple_arr = []
    for file in triple_files:
        with open(os.path.join(PATH, file), 'r') as f:
            lines = f.read().split('\n')
        # drop blank lines: an empty string is not a triple and breaks later
        # code that inspects the start of each triple
        triple_arr.append([line.strip() for line in lines if line.strip()])

    return triple_arr


def seperate_title_and_body(article, VERBOSE = False): 
    """
    Split an article into its title and its body text.

    Input: the whole article as one string, with blocks separated by a blank
    line; the first block is taken as the title.
    Returns: tuple (title, body), where body is all remaining blocks joined
    with single spaces into one string (empty if there is only a title).
    """
    title, *rest = article.split("\n\n")
    body = ' '.join(rest)

    if VERBOSE: 
        print("title:", title)
        print("body:", body)

    return (title, body)


##########################################################################################################################################
# EXECUTION
##########################################################################################################################################

FOLDER_TRIPLES = "data/BBC/Training_strict/business_triples"
FOLDER_RAW = "./data/BBC/News Articles/business"

# Load Triples and Raw Articles
all_triples = load_triples(FOLDER_TRIPLES)
articles = load_articles(FOLDER_RAW)

# Generate list of titles (only the title of each article is kept here)
titles = []
for article in articles:
    title, body = seperate_title_and_body(article, VERBOSE = False)
    titles.append(title)

# Sanity check
print(f"Number of triples: {len(all_triples)}")
print(f"Number of titles: {len(titles)}")
print(f"Number of articles: {len(articles)}")
# Guarded so that an empty triples folder is reported instead of raising IndexError
print(f"Triple Sample: {all_triples[0] if all_triples else None}")

# Triple files and articles are matched purely by their position in the two
# sorted listings, and the generation cell looks up titles[i] for the i-th
# triple file. More triple files than titles would therefore fail there.
if len(all_triples) > len(titles):
    print(f"WARNING: {len(all_triples)} triple files but only {len(titles)} titles; "
          "positional pairing of triples and titles will run out of titles")

['001.txt', '002.txt', '003.txt', '004.txt', '005.txt', '006.txt', '007.txt', '008.txt', '009.txt', '010.txt', '011.txt', '012.txt', '013.txt', '014.txt', '015.txt', '016.txt', '017.txt', '018.txt', '019.txt', '020.txt', '021.txt', '022.txt', '023.txt', '024.txt', '025.txt', '026.txt', '027.txt', '028.txt', '029.txt', '030.txt', '031.txt', '032.txt', '033.txt', '034.txt', '035.txt', '036.txt', '037.txt', '038.txt', '039.txt', '040.txt', '041.txt', '042.txt', '043.txt', '044.txt', '045.txt', '046.txt', '047.txt', '048.txt', '049.txt', '050.txt', '051.txt', '052.txt', '053.txt', '054.txt', '055.txt', '056.txt', '057.txt', '058.txt', '059.txt', '060.txt', '061.txt', '062.txt', '063.txt', '064.txt', '065.txt', '066.txt', '067.txt', '068.txt', '069.txt', '070.txt', '071.txt', '072.txt', '073.txt', '074.txt', '075.txt', '076.txt', '077.txt', '078.txt', '079.txt', '080.txt', '081.txt', '082.txt', '083.txt', '084.txt', '085.txt', '086.txt', '087.txt', '088.txt', '089.txt', '090.txt', '091.txt'

# Generate Text

In [3]:
# Directory from which both the tokenizer and the model weights are loaded.
# NOTE(review): going by the directory name this is presumably the T5
# checkpoint trained for 16 epochs — confirm against the training notebook.
model_path = "./models/T5_fulltrain_16epoch/"
# `tokenizer` and `model` are module-level names used by the generation cells below.
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)


In [4]:
##########################################################################################################################################
# HELPER FUNCTIONS
##########################################################################################################################################

def generate_sentence(input_text, model, tokenizer, device='cpu', max_length=150, prefix="WebNLG: "):
    """
    Generate one piece of text from a triple string with a seq2seq model.

    Input:
        input_text: the triple(s) to verbalise, as a single string
        model: model exposing eval(), to(device) and generate(...)
        tokenizer: tokenizer exposing encode(...) and decode(...)
        device: device the model and the input ids are moved to (default 'cpu')
        max_length: max_length passed to model.generate (default 150)
        prefix: task prefix prepended to input_text before tokenising
                (default "WebNLG: ")
    Returns: the first generated sequence decoded to a string, with special
    tokens skipped.
    """
    # inference only: switch to eval mode and make sure the model is on `device`
    model.eval()
    model.to(device)

    input_ids = tokenizer.encode(prefix + input_text, return_tensors="pt").to(device)

    # no gradients are needed for generation
    with torch.no_grad():
        outputs = model.generate(input_ids, max_length=max_length, num_return_sequences=1)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)


##########################################################################################################################################
# PROCESS
##########################################################################################################################################

# Sample model inputs (pipe-separated triples; " && " joins several triples
# into a single input):
#   dollar | hit | Dollar_level && Dollar | reach | 1.2871 && US_trade_deficit | set | stabilise
#   Greenspan | highlight | willingness && Greenspan_speech | send | Dollar
#   White_House | announce | House_budget && commentator | believe | House_budget
#   Greenspan | highlight | willingness && Greenspan_speech | send | Dollar

# Quick smoke test on a single triple
input_text = "commentator | believe | House_budget"
output_text = generate_sentence(input_text, model, tokenizer)
print(output_text)


The book, called the House Dogget, is a sequel to the comic book character, Charlie.


In [5]:
# Generate a summary 
VERBOSE = False
FOLDER = "./results/Generated_T5_16epoch/"
os.makedirs(FOLDER, exist_ok=True)


def _triple_subject(triple):
    """Return the subject of a triple string, i.e. the text before the first '|'."""
    return triple.split("|", 1)[0].strip()


def _build_model_inputs(doc_triples):
    """
    Turn one document's triples into the list of strings fed to the model.

    Blank entries are dropped and the triples are sorted so that equal
    subjects end up next to each other. A triple is joined with its immediate
    successor using " && " when both have the same subject, so every model
    input holds one or two triples and each triple is used exactly once.
    """
    ordered = sorted(t for t in doc_triples if t.strip())

    model_inputs = []
    idx = 0
    while idx < len(ordered):
        predict = ordered[idx]

        # look ahead only when a next triple really exists; falling back to a
        # value left over from an earlier iteration would re-use a triple
        has_next = idx + 1 < len(ordered)
        if has_next and _triple_subject(ordered[idx + 1]) == _triple_subject(predict):
            predict = predict + " && " + ordered[idx + 1]
            idx += 1  # the successor has been consumed as well
        idx += 1

        model_inputs.append(predict)

    return model_inputs


summaries = []
# tqdm wraps the list itself (not the enumerate object) so the bar knows the total
for doc_idx, doc_triples in enumerate(tqdm(all_triples)):

    # title first, then a newline marker, then the generated sentences
    summary = [titles[doc_idx], "\n"]

    for predict in _build_model_inputs(doc_triples):
        generated_sentence = generate_sentence(predict, model, tokenizer)

        if VERBOSE: 
            print(f"predict: {predict}")
            print(f"generated: {generated_sentence}")

        summary.append(generated_sentence)

    # convert summary to string
    summary = ' '.join(summary)
    summaries.append(summary)

    # save summary as txt file
    # NOTE(review): numbering is positional (001, 002, ...) and assumes the
    # triple files are numbered contiguously from 001 — confirm against the data
    filename = f"{doc_idx + 1:03d}.txt"
    path = os.path.join(FOLDER, filename)

    with open(path, "w") as f:
        f.write(summary)



11it [00:43,  3.98s/it]


KeyboardInterrupt: 

# OLD