# Config: `gpt-3.5-turbo`

**OpenAI**

In [48]:
import os
import openai
import logging

# OpenAI client configuration. The API key is read from the OPENAI_API_KEY
# environment variable (os.getenv returns None when it is not set).
# NOTE(review): the organization ID is hard-coded here — consider reading it
# from the environment like the API key.
openai.organization = 'org-ZSjLwLfwPKSKiv1oL1veHDLb'
openai.api_key = os.getenv("OPENAI_API_KEY")

# Route root-logger records at INFO level and above to responses.log.
# filemode='w' opens the log file in write mode, so earlier contents are
# overwritten when this configuration takes effect.
logging.basicConfig(filename='responses.log', 
                    filemode='w', 
                    level=logging.INFO, 
                    format='%(levelname)s:%(asctime)s %(message)s', 
                    datefmt='%m-%d-%Y %H:%M:%S')

# GPT prompt-response wrapper
def GPT(messages: list) -> str:
    """Send a chat transcript to ``gpt-3.5-turbo`` and return the reply text.

    Args:
        messages: Chat messages passed straight through to
            ``openai.ChatCompletion.create``.

    Returns:
        The ``content`` of the first choice in the completion.

    Both the outgoing messages and the reply are logged at INFO level.
    """
    request = {
        "model": "gpt-3.5-turbo",
        "temperature": 0.3,
        "messages": messages,
    }
    completion = openai.ChatCompletion.create(**request)

    first_choice = completion.choices[0]
    answer = first_choice.message.content

    # Log the prompt first, then the answer, as two separate records.
    for record in (messages, answer):
        logging.info(record)

    return answer

**Knowledge graph construction**

Prompt GPT-3.5 to extract keywords and then triplets from the passage, one request after the other. The format of the output varies, so the parsing code handles a few cases:

1. The output is a markdown-formatted JSON
2. The output is JSON as-is
3. The output is an unformatted list

In [49]:
import json
import re

# Matches a reply that is wrapped entirely in a markdown code fence. The
# language tag after the opening backticks is optional, so both
# "```json ... ```" and a bare "``` ... ```" are unwrapped.
_FENCED_REPLY = re.compile(r'^```[a-zA-Z]*[ \t]*\n(.+?)\n[ \t]*```$', flags=re.DOTALL)


def _parse_model_list(response):
    """Decode a model reply that is expected to hold a list; never raises.

    Tries, in order:
      1. JSON with any surrounding markdown code fence removed
         (a reply that is plain JSON passes through this step unchanged);
      2. an unformatted list with one item per line, trailing commas dropped.

    A non-text reply (e.g. ``None``) yields an empty list.
    """
    if not isinstance(response, str):
        logging.warning("Expected a text reply from the model, got %r", response)
        return []

    unfenced = _FENCED_REPLY.sub(r'\1', response.strip())
    try:
        return json.loads(unfenced)
    except json.JSONDecodeError:
        # Not JSON: fall through to the line-per-item interpretation.
        pass

    stripped_lines = (line.strip().rstrip(',').strip() for line in unfenced.splitlines())
    return [item for item in stripped_lines if item]


def extract_knowledge(passage: str) -> tuple:
    """Extract keywords and knowledge-graph triplets from a passage.

    Two chat requests are made through ``GPT``: the first asks for the named
    entities, the second (with the first exchange kept in the transcript)
    asks for ``"subject|relation|object"`` triplets built from those terms.

    Args:
        passage: Text used as the extraction context.

    Returns:
        A ``(keywords, triplets)`` tuple. Each element is the decoded reply:
        normally a list of strings, or an empty list when the reply carried
        no usable text. Both names are always bound, so a badly formatted
        reply no longer surfaces as ``UnboundLocalError``.
    """
    context = ('This is an information extraction task; only perform the assigned tasks'
               ' and adhere to the given formatting. Suppress all other outputs.'
               f' Use the following passage as context: {passage}')

    task1 = ('Extract the named entities from the passage. The output should be'
             ' a JSON-formatted list of key terms, ordered such that the most important'
             ' term is first, and the least important term is last. Example: ["atom", "proton", ...]')

    task2 = ('Using these terms and the passage, generate knowledge graph triplets.'
             ' Output should be a JSON-formatted list of strings of the form:'
             ' ["subject|relation|object", "subject|relation|object", ...].')

    messages = [
        {"role": "system", "content": context},
        {"role": "user", "content": task1},
    ]
    keyword_reply = GPT(messages)
    keywords = _parse_model_list(keyword_reply)

    # Replay the first exchange verbatim so the model can refer to its own
    # keyword list when it builds the triplets.
    messages = messages + [
        {"role": "assistant", "content": keyword_reply},
        {"role": "user", "content": task2},
    ]
    triplet_reply = GPT(messages)
    triplets = _parse_model_list(triplet_reply)

    return keywords, triplets

**End-to-End Process**

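Input:
- list of Wikipedia pages

Output:
- JSON file containing the following hierarchy: `corpus_index/page_data/sections/information`

Note: the code in this section currently reads sentences from a textbook CSV (`data/original/sentences_Biology_2e_parsed.csv`) and writes per-section keyword and triplet CSV files, rather than reading Wikipedia pages and producing a single JSON file.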

In [50]:
from tqdm import tqdm 
import csv
import os.path as osp
import pandas as pd

class OS_textbook:
    """Base class that loads a CSV file into a DataFrame kept on ``self.data``."""

    def __init__(self, filepath: str):
        # Read the whole CSV up front; the table is then available as self.data.
        table = pd.read_csv(filepath)
        self.data = table

class OS_bio_2e(OS_textbook):
    """One chapter/section slice of the Biology 2e sentence CSV.

    On construction the CSV at ``filepath`` is loaded and the sentences of the
    requested chapter and section are stored on ``self.text``.
    """

    # Source CSV; the code below reads its "chapter", "section" and
    # "sentence" columns.
    filepath = "data/original/sentences_Biology_2e_parsed.csv"

    def __init__(self, chapter: int, section: int):
        super().__init__(self.filepath)

        self.chapter_num = chapter
        self.section_num = section

        self.text = self.extract_section_text()

    def __repr__(self):
        return f"OS_bio_2e(chapter={self.chapter_num}, section={self.section_num})"

    def extract_section_text(self):
        """Return the sentences of this chapter and section as a list."""
        in_chapter = self.data["chapter"] == self.chapter_num
        in_section = self.data["section"] == self.section_num

        section_data = self.data[in_chapter & in_section]
        return list(section_data["sentence"].values)

    def generate_knowledge_graph(self, filepath):
        """Extract keywords and triplets per sentence and write them to CSV.

        Two files are created (overwriting existing ones) inside ``filepath``:
        ``bio_ch<C>_s<S>_keywords.csv`` with columns ``keywords, sentence`` and
        ``bio_ch<C>_s<S>_triplets.csv`` with columns
        ``subject, relation, object, sentence``.

        Processing is best-effort: a failure on one sentence is logged with
        its traceback and the loop moves on; a triplet that does not split
        into exactly three ``|``-separated parts is skipped with a warning
        instead of discarding the rest of that sentence's triplets.

        Args:
            filepath: Directory that receives the two CSV files.
        """
        kw_filename = f"bio_ch{self.chapter_num}_s{self.section_num}_keywords.csv"
        trip_filename = f"bio_ch{self.chapter_num}_s{self.section_num}_triplets.csv"

        kw_filepath = osp.join(filepath, kw_filename)
        trip_filepath = osp.join(filepath, trip_filename)

        # newline='' is what the csv module requires of files it writes to
        # (otherwise extra blank rows appear on some platforms). UTF-8 is
        # pinned so the output does not depend on the platform default.
        with open(kw_filepath, 'w', newline='', encoding='utf-8') as kw_file, \
                open(trip_filepath, 'w', newline='', encoding='utf-8') as trip_file:
            kw_writer = csv.writer(kw_file)
            trip_writer = csv.writer(trip_file)

            kw_writer.writerow(["keywords", "sentence"])
            trip_writer.writerow(["subject", "relation", "object", "sentence"])

            for sentence in tqdm(self.text):
                try:
                    keywords, triplets = extract_knowledge(sentence)
                    kw_writer.writerow([keywords, sentence])

                    for triplet in triplets:
                        parts = triplet.split('|') if isinstance(triplet, str) else []
                        if len(parts) != 3:
                            logging.warning(
                                "Skipping malformed triplet %r. Sentence: %s",
                                triplet, sentence,
                            )
                            continue
                        trip_writer.writerow([*parts, sentence])
                except Exception:
                    # Exception (not a bare except) lets Ctrl-C stop the run;
                    # logging.exception records the traceback for diagnosis.
                    logging.exception("Error encountered. Sentence: %s", sentence)
                else:
                    logging.info("Extracted knowledge. Sentence: %s", sentence)
                finally:
                    # Keep progress on disk after every sentence, as the
                    # original per-row appends did.
                    kw_file.flush()
                    trip_file.flush()

In [51]:
def extract_ch4():
    """Build knowledge-graph CSVs for chapter 4, sections 2 through 7.

    All section objects are constructed first; afterwards each one writes its
    output into ``data/pre-processed``.
    """
    sections = []
    for section_num in range(2, 8):
        sections.append(OS_bio_2e(chapter=4, section=section_num))

    for current in sections:
        current.generate_knowledge_graph("data/pre-processed")

In [59]:
# Read the triplet CSVs for chapter 4, sections 2-7, and stack them row-wise
# into a single frame with a fresh 0..n-1 index.
data = pd.concat(
    [
        pd.read_csv(f"data/pre-processed/bio_ch4_s{section}_triplets.csv")
        for section in range(2, 8)
    ],
    axis=0,
    ignore_index=True,
)

In [92]:
# For each row's sentence, gather every row sharing that sentence and turn
# the remaining columns into plain tuples. Each pass overwrites the previous
# result, so only the last sentence's tuples remain after the loop.
for sentence in data["sentence"].values:
    quads = data.loc[data["sentence"] == sentence]
    triplets = quads.drop(columns=["sentence"])
    triplets_list = [*triplets.itertuples(index=False, name=None)]
    

array(['A cell is the smallest unit of a living thing.',
       'Whether comprised of one cell (like bacteria) or many cells (like a human), we call it an organism.',
       'Whether comprised of one cell (like bacteria) or many cells (like a human), we call it an organism.',
       ...,
       'To conduct a virtual microscopy lab and review the parts of a cell, work through the steps of this interactive assignment.',
       'To conduct a virtual microscopy lab and review the parts of a cell, work through the steps of this interactive assignment.',
       'To conduct a virtual microscopy lab and review the parts of a cell, work through the steps of this interactive assignment.'],
      dtype=object)

In [107]:
import numpy as np

# Show the rows labelled 0 and 1 as plain tuples (no index, no namedtuple).
# NOTE(review): `d` is not defined in any cell shown here — presumably the
# `data` frame loaded earlier; confirm before re-running this cell.
list(d.loc[[0, 1]].itertuples(index=False, name=None))

[('cell',
  'is smallest unit of',
  'living thing',
  'A cell is the smallest unit of a living thing.'),
 ('organism',
  'is called',
  'bacteria',
  'Whether comprised of one cell (like bacteria) or many cells (like a human), we call it an organism.')]

In [None]:
def fact_verification(triplets: list[tuple], context: str):
    """Format triplets as newline-separated ``subject|relation|object`` lines.

    Args:
        triplets: Triplets to format. Tuples have their parts joined with
            ``|``; items that are already strings are used unchanged.
        context: Passage the triplets are meant to be checked against.
            NOTE(review): the verification step is not implemented yet, so
            this argument is currently unused.

    Returns:
        One line per triplet, joined with newlines (empty string for no
        triplets).
    """
    lines = []
    for triplet in triplets:
        if isinstance(triplet, str):
            lines.append(triplet)
        else:
            # str.join only accepts strings, so convert every part first.
            lines.append('|'.join(str(part) for part in triplet))

    return '\n'.join(lines)
