In [1]:
import torch
import os 
import spacy 

from transformers import GPT2Tokenizer, GPT2LMHeadModel
from pprint import pprint
from tqdm import tqdm


# Load Data

In [2]:
##########################################################################################################################################
# HELPER FUNCTIONS
##########################################################################################################################################

def load_articles(PATH):
    """
    Returns a list of strings, where each string is an article.
    Order is sorted by article name
    """

    news_files = sorted([f for f in os.listdir(PATH) if f.endswith('.txt')])
    

    news_list = []
    for file in news_files:
        with open(os.path.join(PATH, file), 'r') as f:
            text = f.read()
            news_list.append(text)
    
    return news_list

def load_triples(PATH):
    """
    Returns a nested list of triples by filename and triple
    Order is sorted by file name.
    """
    triple_files = sorted([f for f in os.listdir(PATH) if f.endswith('.txt')])
    print(triple_files)

    triple_arr = []
    for file in triple_files:
        with open(os.path.join(PATH, file), 'r') as f:
            t = []
            lines = f.read().strip().split('\n')
            t.extend(lines)
        triple_arr.append(t)

    return triple_arr


def seperate_title_and_body(article, VERBOSE = False): 
    """
    Input: a string file
    Returns: list of format (title, [para1, para2, para3, ...])
    """

    paragraphs = article.split("\n\n")

    title = paragraphs[0]
    body = ' '.join(paragraphs[1:])
    if VERBOSE: 
        print("title:", title)
        print("body:", body)
    
    return (title, body)


##########################################################################################################################################
# EXECUTION
##########################################################################################################################################

FOLDER_TRIPLES = "./data/BBC/Training/business_triples"
FOLDER_RAW = "./data/BBC/News Articles/business"

# Load Triples and Raw Articles
triples = load_triples(FOLDER_TRIPLES)
articles = load_articles(FOLDER_RAW)

# Generate list of titles
titles = []
for article in articles:
    title, body = seperate_title_and_body(article, VERBOSE = False)
    titles.append(title)

# sanity Check
print(f"Number of triples: {len(triples)}")
print(f"Number of titles: {len(titles)}")
print(f"Number of articles: {len(articles)}")
print(f"Triple Sample: {triples[0]}")

['001.txt', '002.txt', '003.txt', '004.txt', '005.txt', '006.txt', '007.txt', '008.txt', '009.txt', '010.txt', '011.txt', '012.txt', '013.txt', '014.txt', '015.txt', '016.txt', '017.txt', '018.txt', '019.txt', '020.txt', '021.txt', '022.txt', '023.txt', '024.txt', '025.txt', '026.txt', '027.txt', '028.txt', '029.txt', '030.txt', '031.txt', '032.txt', '033.txt', '034.txt', '035.txt', '036.txt', '037.txt', '038.txt', '039.txt', '040.txt', '041.txt', '042.txt', '043.txt', '044.txt', '045.txt', '046.txt', '047.txt', '048.txt', '049.txt', '050.txt', '051.txt', '052.txt', '053.txt', '054.txt', '055.txt', '056.txt', '057.txt', '058.txt', '059.txt', '060.txt', '061.txt', '062.txt', '063.txt', '064.txt', '065.txt', '066.txt', '067.txt', '068.txt', '069.txt', '070.txt', '071.txt', '072.txt', '073.txt', '074.txt', '075.txt', '076.txt', '077.txt', '078.txt', '079.txt', '080.txt', '081.txt', '082.txt', '083.txt', '084.txt', '085.txt', '086.txt', '087.txt', '088.txt', '089.txt', '090.txt', '091.txt'

# Generate Text

In [3]:
# Load Models 
MODEL_PATH = "./models/distilgpt2_trained_pkg"

tokenizer = GPT2Tokenizer.from_pretrained(MODEL_PATH)
model = GPT2LMHeadModel.from_pretrained(MODEL_PATH)

In [4]:
##########################################################################################################################################
# HELPER FUNCTIONS
##########################################################################################################################################


def generate_sentence(model, tokenizer, input_triplet, max_length=100, temperature=0.1, no_repeat_ngram_size=2):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    input_text = input_triplet + " <==>"
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    attention_mask = torch.ones_like(input_ids)
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    model = model.to(device)
    output = model.generate(input_ids, 
                            max_length=max_length, 
                            num_return_sequences=1, 
                            attention_mask=attention_mask,
                            temperature=temperature,
                            no_repeat_ngram_size=no_repeat_ngram_size,
                            do_sample=True,
                            pad_token_id=tokenizer.eos_token_id  # Add this line to suppress the warning
                            )
    generated_text = tokenizer.decode(output[0])
    generated_sentence = generated_text.replace(input_text, "").strip()
    
    first_sentence = generated_sentence.split(".")[0] + "."

    return first_sentence


##########################################################################################################################################
# PROCESS
##########################################################################################################################################
'''
profit quarterly profits | were buoyed | gains users
'''

triplet = 'profit quarterly profits | were buoyed | gains users'
generated_sentence = generate_sentence(model, tokenizer, triplet, max_length=100, temperature=0.7, no_repeat_ngram_size=2)
print(generated_sentence)

Sales of mobile phones and computer software were boosted by the high number of users who signed up to the free trial.


In [5]:
# Generate N and pick the one with most similar vector to the input

##########################################################################################################################################
# HELPER FUNCTIONS
##########################################################################################################################################

def generate_multiple_sentences(model, tokenizer, input_triplet, device="cpu", max_length=50, temperature=0.3, no_repeat_ngram_size=2, num_sentences=5):
    input_text = input_triplet + " <==>"
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    attention_mask = torch.ones_like(input_ids)
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    output = model.generate(input_ids, 
                            max_length=max_length, 
                            num_return_sequences=num_sentences,
                            attention_mask=attention_mask,
                            temperature=temperature,
                            no_repeat_ngram_size=no_repeat_ngram_size,
                            do_sample=True,
                            pad_token_id=tokenizer.eos_token_id  # Add this line to suppress the warning
                            )

    generated_sentences = []
    for idx in range(num_sentences):
        generated_text = tokenizer.decode(output[idx])
        generated_sentence = generated_text.replace(input_text, "").strip()
        generated_sentences.append(generated_sentence.split(".")[0]  + ".")

    return generated_sentences

def find_most_similar_sentence(semantic_ref, generated_sentences):
    input_vector = nlp(semantic_ref)
    
    similarities = []
    for sentence in generated_sentences:
        sentence_vector = nlp(sentence)
        similarities.append(input_vector.similarity(sentence_vector))
    
    most_similar_idx = similarities.index(max(similarities))
    
    return generated_sentences[most_similar_idx]


##########################################################################################################################################
# PROCESS
##########################################################################################################################################

VERBOSE = False 
FOLDER = "./results/Generated_DistilGPT_filtered"
if not os.path.exists(FOLDER):
    os.makedirs(FOLDER)

# load NLP model
nlp = spacy.load("en_core_web_md")

for idx, input_triplets in tqdm(enumerate(triples)):
    
    # Init 
    summary = []
    semantic_ref = titles[idx]

    # add title to summary 
    summary.append(titles[idx])
    summary.append("\n")

    for input_triplet in input_triplets: 
        
        # Skip empty triplets 
        if input_triplet == "":
            continue
        
        # Generate and filter
        generated_sentences = generate_multiple_sentences(model, tokenizer, input_triplet, device="cpu", max_length=50, temperature=0.3, no_repeat_ngram_size=2, num_sentences=3)
        most_similar_sentence = find_most_similar_sentence(semantic_ref, generated_sentences)
        summary.append(most_similar_sentence)

        if VERBOSE: 
            print()
            print("in: ", input_triplet)
            print("title: ", semantic_ref)
            print("options: ")
            pprint(generated_sentences)
            print("selected: ", most_similar_sentence)

    # convert to string 
    summary = ' '.join(summary)
    
    # save summary as txt file
    filename = f"{idx + 1:03d}.txt"
    path = os.path.join(FOLDER, filename)
    with open(path, "w") as f:
        f.write(summary)
    


'''
in:  climate change fight | are leading | list
title:  Aids and climate top Davos agenda
options: 
['The Paris climate change campaign is led by the Paris Club of wealthy '
 'countries, with rich countries such as Indonesia, Sri Lanka, and Sri Lank '
 'suffering the worst effects.',
 '"The overwhelming evidence suggests that the global climate change battle '
 'are continuing and that we need to take action to combat this vicious cycle '
 'of extreme poverty and extreme inequality," said Tim Crawford, a senior '
 'fellow.',
 "The US and China are among the world's leading economies to embrace global "
 'change, the Paris climate change battle is leading the list.',
 'The Paris-based group warned that the Paris climate change battle are '
 '"leading to a global food crisis".',
 '"The Paris climate change battle are the driving forces behind the Paris '
 'Agreement," said Jean-Francois Veron, a Paris economist who has been '
 'studying the negotiations.']
selected:  The US and China are among the world's leading economies to embrace global change, the Paris climate change battle is leading the list.

'''

0it [00:00, ?it/s]Input length of input_ids is 52, but `max_length` is set to 50. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
1it [00:35, 35.49s/it]


KeyboardInterrupt: 

In [None]:
# Debugging for empty triples

print(triples[501])