In [1]:
import os
import pickle

import numpy as np
import openai
import pandas as pd
from transformers import GPT2TokenizerFast

# GPT-2 tokenizer, used further down to count tokens in prompts and completions.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# API KEY: export OPENAI_API_KEY before starting the kernel instead of pasting
# the secret into the notebook, so it never ends up in a saved or shared .ipynb.
# Falls back to an empty key (the previous placeholder value) when unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")


  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
# We have hosted the processed dataset, so you can download it directly without having to recreate it.
# This dataset has already been split into sections, one row for each section of the Wikipedia page.

df = pd.read_csv('https://cdn.openai.com/API/examples/data/olympics_sections_text.csv')
df = df.set_index(["title", "heading"])

# Slice the dataset for faster and cheaper training.
# The explicit copy makes the one-row subset independent of the full frame, so
# the columns added to it in later cells are written to this frame only.
df = df[:1].copy()

print(f"{len(df)} rows in the data.")
print(f"{sum(df.tokens)} tokens in the data")

1 rows in the data.
726 tokens in the data


In [3]:
# Build one "title\nheading\n\n" header per row from that row's own
# (title, heading) index entry. The previous loop read df.iloc[0] on every
# iteration, so every row would have received the first row's header.
header = [title + "\n" + heading + "\n\n" for title, heading in df.index]

df['header'] = header
# The context handed to the model is the header followed by the section text.
df['context'] = df['header'] + df['content']

In [4]:
def get_questions(context):
    """
    Ask the completion model to write five questions about a passage.

    Parameters
    ----------
    context: str
        The text the questions should be based on; it is inserted into the prompt.

    Returns
    -------
    str
        The completion text prefixed with "1." (the prompt already ends with
        "1.", so the prefix restores the first item's number), or "" if the
        request raised an exception.
    """
    try:
        response = openai.Completion.create(
#             engine='curie-search-query',
            engine = 'text-davinci-002',
            prompt=f"Write five questions based on the text below\n\nText: {context}\n\nQuestions:\n1.",
            temperature=0.5,
            max_tokens=257,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=["\n\n"]
        )
        return "1." + response['choices'][0]['text']
    except Exception as e:
        # Same best-effort handling as get_answers: report the failure and fall
        # back to an empty string. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit still stop a long generation run.
        print(e)
        return ""


def get_answers(row):
    """
    Ask the completion model to answer a row's generated questions.

    Parameters
    ----------
    row:
        An object exposing `context` (the source text) and `new_questions`
        (the numbered questions to answer), e.g. a dataframe row.

    Returns
    -------
    str
        The completion text prefixed with "1." (the prompt already ends with
        "1."), or "" if anything raised an exception (the error is printed).
    """
    try:
        prompt = f"""Answer the questions based on the text below.\n\nText: {row.context}\n\nQuestions:\n{row.new_questions}\n\nAnswers:\n1."""
        request = {
            "engine": 'text-davinci-002',
            "prompt": prompt,
            "temperature": 0,
            "max_tokens": 257,
            "top_p": 1,
            "frequency_penalty": 0,
            "presence_penalty": 0,
            "stop": ["\n\n"],
        }
        completion = openai.Completion.create(**request)
        first_choice = completion['choices'][0]
        return "1." + first_choice['text']
    except Exception as err:
        # Best effort: report the problem and let the caller carry on.
        print(err)
        return ""


In [5]:
print("Contents of the dataset used for fine-tuning:\n\n")
# df is indexed by (title, heading), so select the first row by position
# explicitly instead of relying on the integer-key fallback of `[0]`.
print(df['context'].iloc[0])

Contents of the dataset used for fine-tuning:


2020 Summer Olympics
Summary

The 2020 Summer Olympics (Japanese: 2020年夏季オリンピック, Hepburn: Nisen Nijū-nen Kaki Orinpikku), officially the Games of the XXXII Olympiad (第三十二回オリンピック競技大会, Dai Sanjūni-kai Orinpikku Kyōgi Taikai) and also known as Tokyo 2020 (東京2020, Tōkyō Nii Zero Nii Zero), was an international multi-sport event held from 23 July to 8 August 2021 in Tokyo, Japan, with some preliminary events that began on 21 July.
Tokyo was selected as the host city during the 125th IOC Session in Buenos Aires, Argentina, on 7 September 2013. The Games were originally scheduled to take place from 24 July to 9 August 2020, but due to the global COVID-19 pandemic, on 24 March 2020, the event was postponed to 2021, the first such instance in the history of the Olympic Games (previous games had been cancelled but not rescheduled). However, the event retained the Tokyo 2020 branding for marketing purpose. It was largely held behind closed doors wit

In [6]:
# Avoid re-training by default: while this is False, the `if train:` block in
# this cell (which calls the OpenAI API to generate questions/answers and writes
# the CSV and JSONL files) is skipped. Set to True to regenerate them.
train = False


def simple_dataset(df):
    """
    Turn generated question/answer text into prompt/completion training pairs.

    Each row's `questions` and `answers` strings are split on newlines and
    paired up line by line. A pair is kept only when both lines are longer
    than 10 characters; the first two characters of each line (the item
    number such as "1.") are dropped before formatting.

    Parameters
    ----------
    df: pd.DataFrame
        Frame with newline-separated `questions` and `answers` columns.

    Returns
    -------
    pd.DataFrame
        One row per kept pair, with columns `prompt`
        (" <question>\\n\\n###\\n\\n") and `completion` ("<answer>###").
    """
    examples = []
    for _, record in df.iterrows():
        question_lines = record.questions.split('\n')
        answer_lines = record.answers.split('\n')
        for question, answer in zip(question_lines, answer_lines):
            # Skip blank or truncated lines on either side of the pair.
            if len(question) <= 10 or len(answer) <= 10:
                continue
            examples.append({
                "prompt": " " + question[2:].strip() + "\n\n###\n\n",
                "completion": answer[2:].strip() + "###",
            })

    return pd.DataFrame(examples)


if train:
    # Run 20 generation rounds (1 initial + 19 more). Each round asks for a new
    # batch of questions per context, answers them, and accumulates both in the
    # newline-separated `questions` / `answers` columns.
    for round_no in range(20):
        df['new_questions'] = df['context'].apply(get_questions)
        if round_no == 0:
            df['questions'] = df['new_questions']
        else:
            df['questions'] = df['questions'] + "\n" + df['new_questions']

        # get_answers reads `context` and `new_questions` from each row.
        df['new_answers'] = df.apply(get_answers, axis=1)
        if round_no == 0:
            df['answers'] = df['new_answers']
        else:
            df['answers'] = df['answers'] + "\n" + df['new_answers']

    # Persist the accumulated Q&A (without the index) for later reuse.
    df.to_csv('custom_olympics_qa_firstpara.csv', index=False)

    # create the dataset in the appropriate format
    ft = simple_dataset(df)
    ft.to_json('qa.jsonl', orient='records', lines=True)

In [7]:
# Read the saved questions and answers (the CSV written by the `if train:` block).
# Note: this rebinds `df`; the CSV was written with index=False, so the
# reloaded frame has a default integer index.
df = pd.read_csv('custom_olympics_qa_firstpara.csv')

In [8]:
# if interested, can print and examine the questions and answers that were generated

# print(df['questions'][0])
# print('------------')
# print(df['answers'][0])

In [9]:
# easier to directly run this from the terminal for monitoring purposes

# running from the notebook
#!openai api fine_tunes.create -t "qa.jsonl" --batch_size 16 

In [10]:
def count_tokens(text: str) -> int:
    """Return how many tokens the module-level `tokenizer` produces for `text`."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)

# Multiplied by a token count to produce the dollar amount in use_model's
# cost printout, i.e. it is used as a per-token rate.
COST_PER_COMPLETION = 0.02*1e-3

In [11]:
# Use the model

# Enter the fine-tuned model id
FINE_TUNED_MODEL = ""


def use_model(query, show_cost=False):
    """
    Send a question to the fine-tuned model and return its answer.

    The prompt uses the same layout as the training prompts built by
    simple_dataset (" <question>\\n\\n###\\n\\n"), and generation stops at the
    "###" marker that terminates the training completions.

    Parameters
    ----------
    query: str
        The question to ask.
    show_cost: bool, optional
        When True, count the prompt and answer tokens with count_tokens and
        print the amount derived from COST_PER_COMPLETION. Off by default so
        the function does not depend on the tokenizer being loaded.

    Returns
    -------
    str
        The raw completion text, or "" if anything raised an exception
        (the error is printed).
    """
    try:
        p = f""" {query}\n\n###\n\n"""

        response = openai.Completion.create(
            engine=FINE_TUNED_MODEL,
            prompt=p,
            temperature=0,
            max_tokens=257,
            top_p=1,
            frequency_penalty=0.9,
            presence_penalty=0,
            stop=["###"]
        )
        answer = response['choices'][0]['text']

        if show_cost:
            # Only tokenize when the caller asks for the cost; previously this
            # was computed on every call and then discarded.
            total_tokens = count_tokens(p) + count_tokens(answer.strip(" \n"))
            print(f"${total_tokens*COST_PER_COMPLETION:.5f} for answering prompt")

        return answer
    except Exception as e:
        print(e)
        return ""



In [12]:
# Demo query against the fine-tuned model: medal table of the 2020 Games.
use_model("Which country won the most medals in the 2020 olympics?")

'The United States won the most medals in the 2020 olympics.'

In [13]:
# Demo query against the fine-tuned model: location of the most expensive Games.
use_model("Where was the most expensive olympic games held?")

'The most expensive olympic games were held in Tokyo in 2020.'

In [14]:
# Demo query against the fine-tuned model: repeat Olympic host city in Asia.
use_model("Which is the only city in asia to have hosted the olympics more than once?")

'Tokyo is the only city in Asia to have hosted the Olympics more than once.'

In [15]:
# Demo query against the fine-tuned model: events new to the 2020 Summer Olympics.
use_model("Which new events were introduced for the 2020 Summer Olympics?")

'New events introduced for the 2020 Summer Olympics include 3x3 basketball, freestyle BMX and mixed gender team events in a number of existing sports, as well as the return of madison cycling for men and an introduction of the same event for women.'