**OpenAI**

In [221]:
import os
import openai
import logging

openai.organization = 'org-ZSjLwLfwPKSKiv1oL1veHDLb'
openai.api_key = os.getenv("OPENAI_API_KEY")

logging.basicConfig(filename='responses.log', 
                    filemode='w', 
                    level=logging.INFO, 
                    format='%(levelname)s:%(asctime)s %(message)s', 
                    datefmt='%m-%d-%Y %H:%M:%S')

# GPT prompt-response wrapper
def GPT(messages: list) -> str:
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo", 
        temperature=0.3,
        messages=messages
    )

    reply = response.choices[0].message.content
    logging.info(messages)
    logging.info(reply)

    return reply

**Gathering Wiki Pages**

Extract text 

In [222]:
import wikipediaapi as wiki_api

wiki = wiki_api.Wikipedia('en')

def extract_wiki_text(sections: wiki_api.WikipediaPageSection, blacklist: set={'See also', 'References', 'Notes', 'External links', 'Bibliography', 'Further reading'}, sec_num=None) -> dict:
    """Recursively extracts text from sections and subsections."""
    # omit acknowledgements, further reading, etc.
    valid_sections = {s.title for s in sections}^blacklist
    passage = {}

    for s in sections:
        num = sections.index(s)+1
        if s.title in valid_sections:
            if sec_num:
                uid = f'{sec_num}.{num}'
            else:
                uid = f'{num}'
            passage.update({uid: {"title": s.title, "text":s.text}})

        p = extract_wiki_text(s.sections, blacklist, sec_num=uid)
        passage.update(p)
    
    return passage

**Knowledge graph construction**

Prompt GPT-3.5 to extract keywords and triplets from passage sequentially. The format of the output varies, so there are a few exception catches:

1. The output is a markdown-formatted JSON
2. The output is JSON as-is
3. The output is an unformatted list

In [223]:
import json
import re

def extract_info(passage: str) -> tuple:
    """Extract keywords and triplets from passage."""
    context = (f'This is an information extraction task; only perform the assigned tasks'
               f' and adhere to the given formatting. Suppress all other outputs.'
               f' Use the following passage as context: {passage}')
    
    task1 = (f'Extract the named entities from the passage. The output should be'
             f' a JSON-formatted list of key terms, ordered such that the most important'
             f' term is first, and the least important term is last. Example: ["atom", "proton", ...]')
    
    task2 = (f'Using these terms and the passage, generate knowledge graph triplets.'
             f' Output should be a JSON-formatted list of strings of the form:' 
             f' ["subject|relationship|object", "subject|relation|object", ...].')

    messages = [
        {"role": "system", "content": context},
        {"role": "user", "content": task1}
    ]

    response = GPT(messages)
    
    try:
        # remove markdown formatting
        json_str = re.sub(r'^```[a-z]+\n(.+?)\n```$', r'\1', response, flags=re.DOTALL)
        keywords = json.loads(json_str)
    except (json.JSONDecodeError, TypeError):
        try:
            keywords = json.loads(response)
        except:
            pass

    messages = [
        {"role": "system", "content": context},
        {"role": "user", "content": task1},
        {"role": "assistant", "content": response},
        {"role": "user", "content": task2}
    ]

    response = GPT(messages)
    
    try:
        # remove markdown formatting
        json_str = re.sub(r'^```[a-z]+\n(.+?)\n```$', r'\1', response, flags=re.DOTALL)
        triplets = json.loads(json_str)
    except (json.JSONDecodeError, TypeError):
        try:
            triplets = json.loads(response)
        except:
            try:
                triplets = [item.strip(',') for item in response.splitlines() if item]
            except:
                pass

    return keywords, triplets

**End-to-End Process**

Input:
- list of Wikipedia pages

Output:
- JSON file containing the following hierarchy: `corpus_index/page_data/sections/information`

In [224]:
from tqdm import tqdm

#CORPUS = [wiki.page(title) for title in ['Atom', 'Proton', 'Neutron', 'Electron', 'Atomic_Nucleus']]
CORPUS = [wiki.page('Atom')]

data = [{"page_title": page.title, "content": extract_wiki_text(page.sections)} for page in CORPUS]

filename = 'corpus.json'
filepath = os.path.join('data/original', filename)
    
for page in data:
    for section in tqdm(page["content"]):
        if page["content"][section]["text"]:
            try:
                keywords, triplets = extract_info(page["content"][section]["text"])
            except:
                print(f"Encountered an issue in section {section} of page {page['page_title']}")
                
            page["content"][section].update({"keywords": keywords, "triplets": triplets})

with open(filepath, "w") as f:
    json.dump(data, f, indent=4)

100%|██████████| 32/32 [07:35<00:00, 14.24s/it]


In [228]:
import csv

filename = 'data.csv'
filepath = os.path.join('data/original',filename) 

def write_triplets_to_csv(section: dict, page_id: str, sec_id: str) -> list:
     errors = []
     
     if "triplets" in section:
          for triplet in section["triplets"]:
               try:
                    triplet = triplet.strip('][\"')
                    if len(triplet.split('|')) == 3:
                         row = [page_id, sec_id] + triplet.split('|')
               except Exception as e:
                    errors.append(e)
               else:
                    with open(filepath, 'a') as f:
                         writer = csv.writer(f)
                         writer.writerow(row)
     if errors:
          message = (f"The triplets for (page: {page_id}, section: {sec_id})"
                     f" could not be unpacked. Check the formatting.")    
          
          print(message)


with open(filepath, 'w') as f:
    writer = csv.writer(f)
    writer.writerow(["page title", "section number", "subject", "relation", "object"])

for page in data:
     for section in page["content"]:
          write_triplets_to_csv(section=page["content"][section], page_id=page["page_title"], sec_id=section)

#t.append([page["page_title"], section] + list(triplet.values()))

In [226]:
import pandas as pd 

data_pd = pd.read_csv(filepath)

In [229]:
data_pd

Unnamed: 0,page title,section number,subject,relation,object
0,Atom,1.1,matter,is made up of,tiny indivisible particles
1,Atom,1.1,idea,is derived from,Greek word atomos
2,Atom,1.1,idea,appeared in,ancient cultures
3,Atom,1.1,idea,was based in,philosophical reasoning
4,Atom,1.1,atomic theory,is not based on,old concepts
...,...,...,...,...,...
486,Atom,5.3.2,"[""theories of baryogenesis",may offer,"an explanation""]"
487,Atom,5.3.2,"[""no antimatter atoms",have been discovered,"in nature""]"
488,Atom,5.3.2,"[""antimatter counterpart of hydrogen atom",was synthesized at,"CERN laboratory in Geneva""]"
489,Atom,5.3.2,"[""electron",can be replaced by,"muon""]"
