In [1]:
import os
import re
from pprint import pprint

import spacy
import torch
from tqdm import tqdm
from transformers import GPT2Tokenizer, GPT2LMHeadModel


# Load Data

In [2]:
##########################################################################################################################################
# HELPER FUNCTIONS
##########################################################################################################################################

def load_articles(PATH):
    """
    Read every ``.txt`` file located directly inside PATH.

    Returns a list of strings, one full article per entry, ordered by
    file name. Files with any other extension are ignored.
    """
    article_texts = []
    for name in sorted(os.listdir(PATH)):
        if not name.endswith('.txt'):
            continue
        with open(os.path.join(PATH, name), 'r') as handle:
            article_texts.append(handle.read())
    return article_texts

def load_triples(PATH):
    """
    Read the triple files (``.txt``) located directly inside PATH.

    Returns a nested list: one inner list per file (ordered by file name),
    holding that file's lines. Surrounding whitespace of the whole file is
    stripped before splitting, interior blank lines are kept as empty
    strings, and an empty file yields ``['']``.
    The sorted list of file names is printed as a side effect.
    """
    file_names = [name for name in sorted(os.listdir(PATH)) if name.endswith('.txt')]
    print(file_names)

    def read_lines(name):
        # One file -> list of its lines
        with open(os.path.join(PATH, name), 'r') as handle:
            return handle.read().strip().split('\n')

    return [read_lines(name) for name in file_names]


def seperate_title_and_body(article, VERBOSE = False):
    """
    Split an article string into its title and its body.

    Input: the article text, whose blocks are separated by a blank line.
    Returns: tuple (title, body) - title is the first block; body is all
    remaining blocks joined by single spaces into one string (empty when
    the article has only one block).
    """
    # str.split always yields at least one element, so the unpacking is safe
    title, *remaining_blocks = article.split("\n\n")
    body = ' '.join(remaining_blocks)

    if VERBOSE:
        print("title:", title)
        print("body:", body)

    return (title, body)


##########################################################################################################################################
# EXECUTION
##########################################################################################################################################

FOLDER_TRIPLES = "./data/BBC/Training/business_triples"
FOLDER_RAW = "./data/BBC/News Articles/business"

# Read the extracted triples and the raw articles from disk
triples = load_triples(FOLDER_TRIPLES)
articles = load_articles(FOLDER_RAW)

# Keep only the title part of every article
titles = [seperate_title_and_body(article, VERBOSE = False)[0] for article in articles]

# Sanity check: report the sizes of the three collections and one sample
print(f"Number of triples: {len(triples)}")
print(f"Number of titles: {len(titles)}")
print(f"Number of articles: {len(articles)}")
print(f"Triple Sample: {triples[1]}")

['001.txt', '002.txt', '003.txt', '004.txt', '005.txt', '006.txt', '007.txt', '008.txt', '009.txt', '010.txt', '011.txt', '012.txt', '013.txt', '014.txt', '015.txt', '016.txt', '017.txt', '018.txt', '019.txt', '020.txt', '021.txt', '022.txt', '023.txt', '024.txt', '025.txt', '026.txt', '027.txt', '028.txt', '029.txt', '030.txt', '031.txt', '032.txt', '033.txt', '034.txt', '035.txt', '036.txt', '037.txt', '038.txt', '039.txt', '040.txt', '041.txt', '042.txt', '043.txt', '044.txt', '045.txt', '046.txt', '047.txt', '048.txt', '049.txt', '050.txt', '051.txt', '052.txt', '053.txt', '054.txt', '055.txt', '056.txt', '057.txt', '058.txt', '059.txt', '060.txt', '061.txt', '062.txt', '063.txt', '064.txt', '065.txt', '066.txt', '067.txt', '068.txt', '069.txt', '070.txt', '071.txt', '072.txt', '073.txt', '074.txt', '075.txt', '076.txt', '077.txt', '078.txt', '079.txt', '080.txt', '081.txt', '082.txt', '083.txt', '084.txt', '085.txt', '086.txt', '087.txt', '088.txt', '089.txt', '090.txt', '091.txt'

# Generate Text

In [3]:
# Load Models 
# MODEL_PATH is a relative directory; both the tokenizer and the model
# weights are loaded from this same location.
# NOTE(review): presumably a fine-tuned GPT-2 medium checkpoint (going by the
# directory name) - confirm.
MODEL_PATH = "models/gpt2_medium_trained_pkg"

# `tokenizer` and `model` are module-level names used by the generation cells.
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_PATH)
model = GPT2LMHeadModel.from_pretrained(MODEL_PATH)


In [4]:
##########################################################################################################################################
# HELPER FUNCTIONS
##########################################################################################################################################

def generate_sentence(model, tokenizer, input_triplet, max_length=100, temperature=0.1, no_repeat_ngram_size=2):
    """
    Sample one sentence from the language model for a single triple.

    The prompt is ``input_triplet + " <==>"``. Only the first sentence of the
    model's continuation is returned.

    Args:
        model: language model exposing ``.to(device)`` and ``.generate(...)``.
            It is moved to CUDA when available, otherwise to CPU.
        tokenizer: tokenizer exposing ``encode``, ``decode`` and ``eos_token_id``.
        input_triplet: the triple as a single string.
        max_length: forwarded to ``generate`` as ``max_length``.
        temperature: sampling temperature forwarded to ``generate``.
        no_repeat_ngram_size: forwarded to ``generate``.

    Returns:
        The continuation up to its first sentence-ending period, always
        terminated by exactly one ".". If the continuation has no such
        period, the whole continuation plus "." is returned.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    input_text = input_triplet + " <==>"
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    attention_mask = torch.ones_like(input_ids)
    model = model.to(device)
    output = model.generate(input_ids,
                            max_length=max_length,
                            num_return_sequences=1,
                            attention_mask=attention_mask,
                            temperature=temperature,
                            no_repeat_ngram_size=no_repeat_ngram_size,
                            do_sample=True,
                            pad_token_id=tokenizer.eos_token_id  # suppresses the missing-pad-token warning
                            )

    # The returned sequence starts with the prompt tokens. Drop them by
    # position instead of string-replacing the decoded prompt: decode() is not
    # guaranteed to reproduce the prompt text character for character, in
    # which case the replace silently left the prompt inside the sentence.
    new_tokens = output[0][input_ids.shape[-1]:]
    # skip_special_tokens keeps markers such as the end-of-text token out of
    # the returned sentence.
    generated_sentence = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    # A sentence ends at a period followed by whitespace or the end of the
    # text; a plain split(".") would cut figures such as "$102.6bn" in half.
    sentence_end = re.search(r"\.(?=\s|$)", generated_sentence)
    if sentence_end:
        generated_sentence = generated_sentence[:sentence_end.start()]

    return generated_sentence + "."


##########################################################################################################################################
# PROCESS
##########################################################################################################################################

# Generate a summary 
FOLDER = "./results/Generated_GPT2/"
VERBOSE = False

# Create the output directory up front so a missing folder does not abort the
# run only after the first (slow) summary has been generated.
os.makedirs(FOLDER, exist_ok=True)

summaries = []
# tqdm wraps the list itself (not the enumerate object) so it knows the total
# and can show a real progress bar / ETA.
for idx, triplets in enumerate(tqdm(triples)):
    if VERBOSE:
        print("\n======Title======\n", titles[idx])
        print(idx, triplets)

    # The summary starts with the article title followed by a line break.
    # NOTE(review): titles[idx] assumes the idx-th triple file belongs to the
    # idx-th article - confirm both folders hold matching, gap-free files.
    summary = [titles[idx], "\n"]

    # generate one sentence for each non-blank triple
    for triplet in triplets:
        if not triplet.strip():
            continue
        generated_sentence = generate_sentence(model, tokenizer, triplet, max_length=100, temperature=0.1, no_repeat_ngram_size=2)
        summary.append(generated_sentence)

        if VERBOSE:
            print(f"Trip: {triplet}")
            print(f"Sent: {generated_sentence}")

    # convert summary to string and keep it available in memory as well
    summary = ' '.join(summary)
    summaries.append(summary)

    # save summary as txt file, numbered by position (001.txt, 002.txt, ...)
    filename = f"{idx + 1:03d}.txt"
    path = os.path.join(FOLDER, filename)

    # Explicit encoding: generated text may contain characters the platform's
    # default encoding cannot represent.
    with open(path, "w", encoding="utf-8") as f:
        f.write(summary)


8it [19:18, 144.82s/it]


KeyboardInterrupt: 

# Experiments

In [None]:
# NOTE(review): `stop` is not defined anywhere in the visible notebook, so
# evaluating it raises NameError and the rest of this cell does not run -
# presumably a deliberate guard against "Run All"; confirm before removing.
stop
# TEST - to generate multiple options and choose the one most similar to the semantic reference
# Generate 5 and pick the one with most similar vector to the input


##########################################################################################################################################
# HELPER FUNCTIONS
##########################################################################################################################################
import spacy
from pprint import pprint

def generate_multiple_sentences(model, tokenizer, input_triplet, device="cpu", max_length=100, temperature=0.3, no_repeat_ngram_size=2, num_sentences=3):
    """
    Sample several candidate sentences from the language model for one triple.

    The prompt is ``input_triplet + " <==>"``. For every sampled sequence only
    the first sentence of the continuation is kept.

    Args:
        model: language model exposing ``.to(device)`` and ``.generate(...)``.
        tokenizer: tokenizer exposing ``encode``, ``decode`` and ``eos_token_id``.
        input_triplet: the triple as a single string.
        device: device on which generation runs; both the inputs and the
            model are moved there.
        max_length: forwarded to ``generate`` as ``max_length``.
        temperature: sampling temperature forwarded to ``generate``.
        no_repeat_ngram_size: forwarded to ``generate``.
        num_sentences: number of sequences requested from ``generate``.

    Returns:
        A list with one string per returned sequence, each terminated by
        exactly one ".".
    """
    input_text = input_triplet + " <==>"
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    attention_mask = torch.ones_like(input_ids)
    # Keep model and inputs on the same device (mirrors generate_sentence);
    # without this a model previously moved to CUDA fails on CPU inputs.
    model = model.to(device)
    output = model.generate(input_ids,
                            max_length=max_length,
                            num_return_sequences=num_sentences,
                            attention_mask=attention_mask,
                            temperature=temperature,
                            no_repeat_ngram_size=no_repeat_ngram_size,
                            do_sample=True,
                            pad_token_id=tokenizer.eos_token_id,  # suppresses the missing-pad-token warning
                            )

    # Every returned sequence starts with the prompt tokens. Drop them by
    # position rather than string-replacing the decoded prompt, which fails
    # whenever decode() does not reproduce the prompt text exactly.
    prompt_length = input_ids.shape[-1]

    generated_sentences = []
    for sequence in output:
        # skip_special_tokens removes end-of-text / padding markers.
        continuation = tokenizer.decode(sequence[prompt_length:], skip_special_tokens=True).strip()
        # A sentence ends at a period followed by whitespace or end of text,
        # so decimals such as "102.6bn" are not cut in half.
        sentence_end = re.search(r"\.(?=\s|$)", continuation)
        if sentence_end:
            continuation = continuation[:sentence_end.start()]
        generated_sentences.append(continuation + ".")

    return generated_sentences

def find_most_similar_sentence(semantic_ref, generated_sentences):
    """
    Pick the candidate sentence closest to the reference text.

    The reference and every candidate are passed through the module-level
    ``nlp`` callable, and each candidate is scored with the reference
    object's ``.similarity`` method. The highest-scoring candidate is
    returned; on ties the earliest candidate wins. An empty candidate list
    raises ValueError.
    """
    reference_doc = nlp(semantic_ref)
    return max(
        generated_sentences,
        key=lambda candidate: reference_doc.similarity(nlp(candidate)),
    )


##########################################################################################################################################
# PROCESS
##########################################################################################################################################

# Index of the article used for this experiment.
# NOTE(review): IDX selects from both `triples` and `titles`, which assumes
# the two lists are aligned and each hold more than IDX entries. The sample
# output pasted at the end of this cell pairs Argentina debt triples with the
# title "Aids and climate top Davos agenda" - confirm the alignment.
IDX = 503 
input_triplets = triples[IDX]
semantic_ref = titles[IDX]
nlp = spacy.load("en_core_web_md")

summary = []
for input_triplet in input_triplets:
    # Skip blank lines in the triple file (the previous check referenced an
    # undefined name `t` instead of the loop variable).
    if not input_triplet.strip():
        continue
    # Generate on whatever device the model currently lives on; the helper's
    # default of "cpu" breaks once an earlier cell has moved the model to CUDA.
    generated_sentences = generate_multiple_sentences(model, tokenizer, input_triplet, device=model.device)
    most_similar_sentence = find_most_similar_sentence(semantic_ref, generated_sentences)
    summary.append(most_similar_sentence)

    print()
    print("in: ", input_triplet)
    print("title: ", semantic_ref)
    print("options: ")
    pprint(generated_sentences)
    print("selected: ", most_similar_sentence)

print("\n ===SUMMARY====")
print(' '.join(summary))


'''

in:  majority | are expected | to accept its $ 102.6bn ( £ 53.51bn ) debt restructuring offer for bondholders
title:  Aids and climate top Davos agenda
options: 
['The offer, made up of cash, shares and dividends, is expected to be accepted '
 'by at least 95% of the voting bondholder groups, according to the filing.',
 'The offer, made up of cash, shares and dividends, is expected to be accepted '
 'by at least 95% of the voting bondholder groups on the 15-member group of '
 'creditors.',
 'The offer, made up of cash, shares and dividends, is expected to be accepted '
 'by at least 95% of the voting bondholder groups, according to the filing.']
selected:  The offer, made up of cash, shares and dividends, is expected to be accepted by at least 95% of the voting bondholder groups, according to the filing.

in:  argentina | started | swap
title:  Aids and climate top Davos agenda
options: 
['Argentina started the swap with its neighbour on 1 December, clearing $27bn '
 'in foreign exchange.',
 'Argentina started the swap with Brazil last year and expects to see interest '
 'repayments start next month.',
 'Argentina started the swap last year and expects to see interest repayments '
 'start next month.']
selected:  Argentina started the swap with Brazil last year and expects to see interest repayments start next month.


in:  sorting out its debt | would enhance | argentina credibility
title:  Aids and climate top Davos agenda
options: 
["Sorting Out Its Debt would Enhance Argentina's Economic and Social "
 'Credibility.',
 "Sorting Out Its Debt would Enhance Argentina's Economic and Social "
 'Credibility.',
 "Sorting Out Its Debt would Enhance Argentina's Economic and Social "
 'Co-operation, which is at its highest since 1945.']
selected:  Sorting Out Its Debt would Enhance Argentina's Economic and Social Credibility.

in:  sorting out its debt | enable | argentina
title:  Aids and climate top Davos agenda
options: 
['"The restructuring of Argentina\'s debt enables Argentina to move in the '
 'direction of a more competitive economy," Mr Barrioneuvo said.',
 "Argentina's government can sort out the debt problem by allowing private "
 'companies to sell bonds, a policy that enables Argentina to reduce its huge '
 'current account deficit.',
 "Argentina's debt restructuring enables it to sort out the country's massive "
 'public debt, enabling it, in turn, to reduce its reliance on foreign '
 'lenders.']
selected:  Argentina's government can sort out the debt problem by allowing private companies to sell bonds, a policy that enables Argentina to reduce its huge current account deficit.

 ===SUMMARY====
The offer, made up of cash, shares and dividends, is expected to be accepted by at least 95% of the voting bondholder groups, 
according to the filing. Argentina started the swap with Brazil last year and expects to see interest repayments start next month. 
Sorting Out Its Debt would Enhance Argentina's Economic and Social Credibility. Argentina's government can sort out the debt problem 
by allowing private companies to sell bonds, a policy that enables Argentina to reduce its huge current account deficit.
'''

In [None]:
# Debugging for empty triples
# Prints the triple list stored at position 501 of `triples` for inspection.
# NOTE(review): raises IndexError when fewer than 502 triple files were
# loaded - confirm the index against the data currently on disk.

print(triples[501])