In [54]:
import glob
import json
import os
import re
from typing import List

import backoff
import openai
import pandas as pd
from tqdm import tqdm

In [39]:
# Read the key from the environment so it is never committed with the notebook.
# Before launching Jupyter, run `export OPENAI_API_KEY=<key>` in the terminal.
# A missing variable raises KeyError here instead of failing later mid-run.
openai.api_key = os.environ['OPENAI_API_KEY']

In [40]:
@backoff.on_exception(backoff.expo, openai.error.RateLimitError)
def completions_with_backoff(**kwargs):
    """Request a completion from OpenAI, retrying when rate-limited.

    All keyword arguments are forwarded unchanged to
    ``openai.Completion.create``. The ``backoff`` decorator re-invokes the
    call with exponentially growing waits whenever
    ``openai.error.RateLimitError`` is raised; other exceptions propagate.

    Returns:
        Whatever ``openai.Completion.create`` returns for the request.
    """
    response = openai.Completion.create(**kwargs)
    return response

In [41]:
# Map each topic name (the notes file's base name) to that file's stripped text.
topic2notes = {}
notes_files = glob.glob('../notes/*.txt')
for file in notes_files:
    # Raw string with escaped dots: the pattern now matches the literal
    # '../notes/<topic>.txt' path and no longer relies on invalid string
    # escapes. Topic names are limited to word characters and hyphens; a file
    # name with any other character yields no match and raises IndexError.
    notes_topic = re.findall(r'\.\./notes/([\w\-]+)\.txt', file)[0]
    # Context manager so every notes file is closed as soon as it is read.
    with open(file, 'r') as notes_fh:
        topic2notes[notes_topic] = notes_fh.read().strip()

In [42]:
# Load the tab-separated topic table and build a topic -> description lookup.
df = pd.read_csv('../notes/notes2subject.csv', sep='\t')
topic2subject = {
    topic_name: description
    for topic_name, description in zip(df['topic'], df['description'])
}

In [43]:
# Prompt template; later formatted with `subject` and `notes` fields.
TEMPLATE_FILE = '../prompts/make_flashcards_v0.txt'
# Context manager so the template file handle is closed after reading.
with open(TEMPLATE_FILE, 'r') as template_fh:
    template = template_fh.read()

In [44]:
def split_notes(notes: str, word_limit: int=300) -> List[str]:
    """Split a set of notes into chunks of roughly ``word_limit`` words.

    The notes are split on newlines and lines are accumulated into a chunk
    until adding the next line would push the chunk over ``word_limit`` words;
    the chunk is then emitted and a new one started. Lines are never broken,
    so a single line longer than ``word_limit`` becomes its own oversized
    chunk.

    Args:
        notes: Full text of the notes.
        word_limit: Maximum number of whitespace-separated words per chunk
            (exceeded only by a single over-long line).

    Returns:
        The chunks in order. Every line in a chunk is preceded by a newline,
        so each chunk starts with ``'\\n'``. Chunks that contain no words are
        not emitted, so empty or whitespace-only notes give ``[]``.
    """
    note_pieces = []
    num_words = 0
    note = ""
    for line in notes.split('\n'):
        words = len(line.split())
        # Only flush a chunk that actually holds words; this avoids emitting
        # an empty chunk when the very first line already exceeds the limit.
        if num_words and num_words + words > word_limit:
            note_pieces.append(note)
            note, num_words = "", 0
        note += f'\n{line}'
        num_words += words
    # Flush the trailing chunk: without this the tail of the notes (or the
    # whole text, when it is shorter than word_limit) would be dropped.
    if num_words:
        note_pieces.append(note)
    return note_pieces

In [45]:
# Take the first loaded set of notes and chunk it, for manual inspection.
example_notes = [*topic2notes.values()][0]
example_split_notes = split_notes(example_notes)
# Second chunk of the example; requires at least two chunks to exist.
test_notes = example_split_notes[1]

In [55]:
# Generate flashcards for every topic and write one JSON file per topic.
TOKEN_LIMIT = 1000  # passed as max_tokens on each completion request
MODEL = 'text-davinci-003'
for topic, notes in tqdm(topic2notes.items()):
    # Raises KeyError if a notes file has no row in the topic/subject table.
    subject = topic2subject[topic]
    flashcards = {}
    for note_split in split_notes(notes):
        prompt = template.format(
            subject=subject,
            notes=note_split,
        )
        completion = completions_with_backoff(
            prompt=prompt,
            temperature=0,
            model=MODEL,
            max_tokens=TOKEN_LIMIT,
        )
        flashcard_text = completion['choices'][0]['text']
        # Each non-blank line is parsed as "<term> - <definition>".
        for line in flashcard_text.split('\n'):
            # Skip empty and whitespace-only lines so they do not become
            # bogus flashcard terms.
            if not line.strip():
                continue
            # Split on the first ' - ' only, so a definition may itself
            # contain ' - '. A line without the separator is stored as a term
            # with an empty definition. A repeated term overwrites the
            # earlier definition.
            term, _sep, definition = line.partition(' - ')
            flashcards[term] = definition
    filename = f'../outputs/flashcards_v0_{topic}.json'
    # Context manager guarantees the JSON is flushed and the file closed
    # before moving on to the next topic.
    with open(filename, 'w') as out_file:
        json.dump(flashcards, out_file)

 20%|██        | 2/10 [06:11<24:44, 185.59s/it]


KeyboardInterrupt: 