In [1]:
# ML
import torch
import openai

# NLP
import spacy

# Utils
import os
import random
import time
from pprint import pprint
from tqdm import tqdm


# Load Data

In [2]:
##########################################################################################################################################
# HELPER FUNCTIONS
##########################################################################################################################################

def load_articles(PATH):
    """
    Read every ``.txt`` file located directly inside ``PATH``.

    Returns a list with one string per article (the complete file contents),
    ordered by sorted file name. Files with any other extension are ignored.
    """
    article_texts = []
    for name in sorted(os.listdir(PATH)):
        # Only plain-text article files are of interest.
        if not name.endswith('.txt'):
            continue
        with open(os.path.join(PATH, name), 'r') as handle:
            article_texts.append(handle.read())

    return article_texts

def load_triples(PATH):
    """
    Load the triples stored in every ``.txt`` file located directly inside ``PATH``.

    Every non-blank line of a file is treated as one triple and kept verbatim
    (only the whitespace surrounding the file content as a whole is stripped).

    Returns a nested list: one inner list per file, ordered by sorted file
    name, each holding that file's triples in file order. A file that is
    empty or contains only whitespace yields an empty list.
    """
    triple_files = sorted(f for f in os.listdir(PATH) if f.endswith('.txt'))

    triple_arr = []
    for file in triple_files:
        with open(os.path.join(PATH, file), 'r') as f:
            content = f.read().strip()
        # Drop blank lines: ''.split('\n') is [''], so without this filter an
        # empty file would contribute a phantom empty triple, and blank lines
        # inside a file would end up as empty triples too.
        triple_arr.append([line for line in content.split('\n') if line.strip()])

    return triple_arr


def seperate_title_and_body(article, VERBOSE = False):
    """
    Split an article string into its title and its body.

    Input: the full article text, whose blocks are separated by a blank line
           (two consecutive newlines).
    Returns: tuple (title, body) where title is the first block and body is
             a single string made of all remaining blocks joined by one space.
    When VERBOSE is true, both parts are printed as well.
    """
    title, *remaining_blocks = article.split("\n\n")
    body = ' '.join(remaining_blocks)

    if VERBOSE:
        print("title:", title)
        print("body:", body)

    return (title, body)


##########################################################################################################################################
# EXECUTION
##########################################################################################################################################

FOLDER_TRIPLES = "data/BBC/Training_strict/business_triples"
FOLDER_RAW = "./data/BBC/News Articles/business"

# Load Triples and Raw Articles
# Both loaders return their results ordered by sorted file name, so the i-th
# entry of `all_triples` is paired with the i-th article purely by position.
# NOTE(review): this assumes both folders use matching file names - confirm.
all_triples = load_triples(FOLDER_TRIPLES)
articles = load_articles(FOLDER_RAW)

# Generate list of titles (the body half of each split is not needed here)
titles = [seperate_title_and_body(article, VERBOSE = False)[0] for article in articles]

# Sanity check: only the counts are verified, not the pairing itself
assert (len(all_triples) == len(titles) == len(articles)), "Counts do not match. (triples vs titles vs articles)"

# Show one random document; named `sample_idx` so the builtin `id` is not shadowed
sample_idx = random.randrange(0, len(titles))
print(f"Doc File: {sample_idx + 1}")
print(f"Title Sample: {titles[sample_idx]}")
print(f"Triple Sample: {all_triples[sample_idx]}")

Doc File: 382
Title Sample: Ban on forced retirement under 65
Triple Sample: ['employer | justify | force', 'employer | have | right', 'people | forced | work_long_than_people_want', 'business_leader | oppose | age_discrimination_proposal', 'British_Chambers | welcome | proposal', 'employer | have | ability', 'people | mount | challenge', 'worker | collect | worker_state_pension']


In [3]:
##########################################################################################################################################
# Inputs
##########################################################################################################################################

# System prompt passed as the "system" message of every chat request.
# It describes the input format (title prefixed with '#', followed by
# "subject | verb | object" triples, all wrapped in []) and gives one worked
# example of the expected title + paragraph output.
# NOTE(review): the text contains typos ("veb", "verb, object triples") and the
# example input never closes its '[' although the generated user prompts do.
# The string is left untouched here because any edit changes what is sent to
# the model - fix deliberately, and only if regenerating every summary.
SYS_PROMPT = """You are a triplet-to-paragraph generator.

#Brief: 
Anything between [] is the task inputs.
# indicates the article title 
"sub | veb | obj" describes a series of verb, object triples related to title. 

#Task:
The task is to generate a short summary paragraph with the title at top. 
It should be factually based only on the triples. 
Inferences should only be made between the triples and the title. 
Do not add embellishments. 
Do organize the paragraph so it has a logical flow.
Keep it as simple and direct as possible. 

#Example Input:
[#China now top trader with Japan

china | overtook | us
change | highlights | chinagrowing importance
trade | was hurt | factors
analysts | see | spurs
Japan trade surplus | grew | trade
Japan trade surplus | accounted | trade

#Example Output:
China now top trader with Japan

China has overtaken the US as Japan's top trading partner. 
This change highlights China's growing importance in the region. 
Trade was hurt by various factors, but analysts see this as a spur to 
further growth. Japan's trade surplus grew as a result, with the surplus 
accounting for a significant portion of the trade."""

# Build one user prompt per document in the bracketed layout the system
# prompt describes: "[#<title>", a blank line, one triple per line, then "]".
doc_prompts = []
for idx, doc_triples in enumerate(all_triples):
    # Triples are put in alphabetical order before being joined.
    triple_lines = '\n'.join(sorted(doc_triples))
    doc_prompts.append(f"[#{titles[idx]}\n\n{triple_lines}]")

# Show the system prompt plus one randomly picked user prompt for inspection.
print(f"\n=========Input Sys Prompt============\n{SYS_PROMPT}")
sample_prompt = doc_prompts[random.randrange(0, len(doc_prompts))]
print(f"\n=========Input User Prompt============\n{sample_prompt}")



You are a triplet-to-paragraph generator.

#Brief: 
Anything between [] is the task inputs.
# indicates the article title 
"sub | veb | obj" describes a series of verb, object triples related to title. 

#Task:
The task is to generate a short summary paragraph with the title at top. 
It should be factually based only on the triples. 
Inferences should only be made between the triples and the title. 
Do not add embellishments. 
Do organize the paragraph so it has a logical flow.
Keep it as simple and direct as possible. 

#Example Input:
[#China now top trader with Japan

china | overtook | us
change | highlights | chinagrowing importance
trade | was hurt | factors
analysts | see | spurs
Japan trade surplus | grew | trade
Japan trade surplus | accounted | trade

#Example Output:
China now top trader with Japan

China has overtaken the US as Japan's top trading partner. 
This change highlights China's growing importance in the region. 
Trade was hurt by various factors, but analysts see

# Generate Text

In [4]:
##########################################################################################################################################
# HELPER FUNCTIONS
##########################################################################################################################################

def query_gpt3_chat(sys_prompt, user_prompt, model_engine="gpt-3.5-turbo"):
    """
    Send a single system + user message pair to the chat completion endpoint.

    Input:
        sys_prompt: text sent with the "system" role.
        user_prompt: text sent with the "user" role.
        model_engine: model name forwarded as `model`.
    Returns: the content of the first returned choice, stripped of
             surrounding whitespace.

    Errors raised by the `openai` call (e.g. the AuthenticationError raised
    when no API key is configured) are not caught here.
    NOTE(review): `openai.ChatCompletion` looks like the pre-1.0 SDK
    interface - confirm against the installed `openai` version.
    """
    # Take the key from the environment only when it is actually set, so a key
    # configured elsewhere (e.g. `openai.api_key = ...`) is not overwritten
    # with None.
    env_key = os.environ.get("OPENAI_API_KEY")
    if env_key:
        openai.api_key = env_key

    conversation = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": user_prompt},
    ]

    response = openai.ChatCompletion.create(
        model=model_engine,
        messages=conversation
    )

    return response.choices[0].message['content'].strip()

##########################################################################################################################################
# TESTING
##########################################################################################################################################

# Smoke test: send one randomly chosen document prompt and print the reply.
test_idx = random.randrange(0, len(doc_prompts))
prompt = doc_prompts[test_idx]
generated_text = query_gpt3_chat(SYS_PROMPT, prompt, model_engine="gpt-3.5-turbo")
print(generated_text)

AuthenticationError: No API key provided. You can set your API key in code using 'openai.api_key = <API-KEY>', or you can set the environment variable OPENAI_API_KEY=<API-KEY>). If your API key is stored in a file, you can point the openai module at it with 'openai.api_key_path = <PATH>'. You can generate API keys in the OpenAI web interface. See https://onboard.openai.com for details, or email support@openai.com if you have any questions.

In [None]:
##########################################################################################################################################
# Generate Summaries
##########################################################################################################################################
PAUSE = 10            # seconds to wait after every request (rate limiting)
PAUSE_ON_ERROR = 60   # seconds to wait before the single retry of a failed request
START_IDX = 262       # first 0-based document index to process; set to 0 for a full run
FOLDER = "./results/Generated_ChatGPTPrompt/"
os.makedirs(FOLDER, exist_ok=True)

# tqdm wraps the list itself (not the enumerate object, which has no length)
# so the progress bar can show a total.
for idx, doc_prompt in enumerate(tqdm(doc_prompts)):

    # For error recovery: skip documents below the resume point.
    # problems: 057, 067, 154
    # NOTE(review): presumably file numbers that failed in an earlier run - confirm.
    if idx < START_IDX: continue

    # Generate text
    try:
        generated_text = query_gpt3_chat(sys_prompt= SYS_PROMPT,
                                    user_prompt = doc_prompt,
                                    model_engine="gpt-3.5-turbo")
    except Exception as err:
        # `Exception` rather than a bare except, so interrupting the kernel
        # (KeyboardInterrupt) stops the run instead of triggering a retry.
        print(f"Request failed on idx: {idx} ({type(err).__name__}: {err})")
        time.sleep(PAUSE_ON_ERROR)
        # Single retry; a second failure propagates and stops the loop.
        generated_text = query_gpt3_chat(sys_prompt= SYS_PROMPT,
                                    user_prompt = doc_prompt,
                                    model_engine="gpt-3.5-turbo")
        print(f"Paused and resolved on idx: {idx}")

    # Save summary as txt file, numbered from 1 to match "Doc File" numbering
    filename = f"{idx + 1:03d}.txt"
    path = os.path.join(FOLDER, filename)
    with open(path, "w") as f:
        f.write(generated_text)

    # Rate limiting
    time.sleep(PAUSE)


# OLD