In [None]:
!pip install openai
!pip install llama-index
!pip install PyPDF2
!pip install guardrails-ai
!pip install langchain
!pip install kor

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.27.4-py3-none-any.whl (70 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.3/70.3 KB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
Collecting yarl<2.0,>=1.0
  Downloading yarl-1.8.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (264 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m264.6/264.6 KB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
Collecting async-timeout<5.0,>=4.0.0a3
  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Collecting aiosignal>=1.1.2
  Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)
Collecting multidict<7.0,>=4.5
  Downloading multidict-6.

## LLM Creates Rail Spec

In [None]:
import getpass
import html
import json
import os

from llama_index import GPTSimpleVectorIndex, SimpleDirectoryReader
from llama_index.llm_predictor import StructuredLLMPredictor
from llama_index.output_parsers import GuardrailsOutputParser
from llama_index.prompts.default_prompts import DEFAULT_TEXT_QA_PROMPT_TMPL, DEFAULT_REFINE_PROMPT_TMPL
from llama_index.prompts.prompts import QuestionAnswerPrompt, RefinePrompt

# Set your OpenAI API key.
# A key already exported in the environment is reused as-is; otherwise the
# user is prompted, so the secret never has to be typed into (and saved with)
# the notebook source.
if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

I first use an LLM to extract the values of the column 'Pay Scale Area (Verifiable Years of Service)'. The idea is to use these values as an index that our LLM can use to extract information from each row. To ensure the LLM outputs a list object, I use Guardrails.

In [None]:
# Build a vector index over every document found in /content and ask the LLM
# for the row labels of the 'Pay Scale Area' column.
# NOTE(review): `llm_predictor` is only used to hand its `.llm` to the output
# parser; the index itself is built without it. Confirm whether the index
# should also receive it so the parser is applied to responses.
llm_predictor = StructuredLLMPredictor()
documents = SimpleDirectoryReader(input_dir='/content').load_data()
index = GPTSimpleVectorIndex.from_documents(documents)

# The output is declared as a <list> of <string> items so that the schema
# shown to the LLM matches what the code below expects: a JSON object whose
# "step_names" key holds a list of labels.
rail_spec = ("""
<rail version="0.1">
  <output>
    <list name="step_names" description="the values in column 'Pay Scale Area'">
      <string description="one value from column 'Pay Scale Area'" />
    </list>
  </output>

  <prompt>
    Return every value in the list as a string.

    @xml_prefix_prompt

    {output_schema}

    @json_suffix_prompt_v2_wo_none
  </prompt>
</rail>
""")

output_parser = GuardrailsOutputParser.from_rail_string(rail_spec, llm=llm_predictor.llm)

# Append the Guardrails format instructions to the default QA/refine prompts.
fmt_qa_tmpl = output_parser.format(DEFAULT_TEXT_QA_PROMPT_TMPL)
fmt_refine_tmpl = output_parser.format(DEFAULT_REFINE_PROMPT_TMPL)

qa_prompt = QuestionAnswerPrompt(fmt_qa_tmpl, output_parser=output_parser)
refine_prompt = RefinePrompt(fmt_refine_tmpl, output_parser=output_parser)

response = index.query(
      "What are the values in column 'Pay Scale Area'?",
      text_qa_template=qa_prompt,
      refine_template=refine_prompt,
  )

# Every later cell depends on this list, so fail loudly (and show the raw
# text) if the LLM did not return the expected JSON object.
try:
  step_names = json.loads(response.response)['step_names']
except (json.JSONDecodeError, KeyError, TypeError) as err:
  raise ValueError(
      f"Could not read 'step_names' from the LLM response: {response.response!r}"
  ) from err



The output is a list that we will iterate through to generate variations of `rail_spec`: one spec for each row.

In [None]:
# Show the extracted pay-scale-area labels; each one drives a per-row query below.
print(step_names)

['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', 'Career Increment A', 'Career Increment B', 'Career Increment C', 'Career Increment D', 'Career Increment E']


## LLM Outputs JSON

Next, we create a rail specification that makes the LLM output JSON objects.

In [None]:
# Rail template for a single row of the salary schedule.
# `{area}` is filled in per row with str.format; `{output_schema}` is passed
# through as a literal placeholder when the template is formatted.
# `pay_scale_area` is a plain string with no length cap: the labels include
# multi-character values such as '10' and 'Career Increment A'.
rail_spec = ("""
<rail version="0.1">
  <output>
    <object name="salary_schedule" description="salary schedule for pay scale area {area}" >
      <string name="pay_scale_area" description="pay scale area {area} returned as a string, exactly as written in the table" />
      <float name="base_salary"  format="float" on-fail-float="reask" description="base salary for Pay Scale Area {area} returned as float" />
      <float name="qtea_addon" format="float" on-fail-float="reask" description="qtea add-on for Pay Scale Area {area} returned as float" />
      <float name="fwea_addon" format="float" on-fail-float="reask" description="fwea add-on for Pay Scale Area {area} returned as float" />
      <float name="total_annual_salary" format="float" on-fail-float="reask" description="total annual salary for Pay Scale Area {area} returned as float" />
      <float name="per_diem" format="float" on-fail-float="reask" description="per diem for Pay Scale Area {area} returned as float" />
    </object>
  </output>

  <prompt>
    Return all numbers in your response in the Float format. Remove the commas from the numeric values since commas are not allowed in JSON numbers.
    Each row should be returned only once.

    @xml_prefix_prompt

    {output_schema}

    @json_suffix_prompt_v2_wo_none
  </prompt>
</rail>
""")

In [None]:
# Query the index once per pay scale area and collect one parsed row per area.
# NOTE(review): `output_dict` is not used in this cell; kept in case a later
# cell references it.
output_dict = {}
output_list = []

for area in step_names:
  # The label comes from LLM output and is interpolated into XML attribute
  # values, so escape &, <, > and quotes to keep the rail spec well-formed.
  safe_area = html.escape(str(area), quote=True)
  formatted_spec = rail_spec.format(area=safe_area, output_schema="{output_schema}")
  output_parser = GuardrailsOutputParser.from_rail_string(formatted_spec, llm=llm_predictor.llm)

  fmt_qa_tmpl = output_parser.format(DEFAULT_TEXT_QA_PROMPT_TMPL)
  fmt_refine_tmpl = output_parser.format(DEFAULT_REFINE_PROMPT_TMPL)

  qa_prompt = QuestionAnswerPrompt(fmt_qa_tmpl, output_parser=output_parser)
  refine_prompt = RefinePrompt(fmt_refine_tmpl, output_parser=output_parser)

  response = index.query(
        "What is the salary schedule in this document?",
        text_qa_template=qa_prompt,
        refine_template=refine_prompt,
    )

  try:
    output_list.append(
        json.loads(response.response)['salary_schedule']
    )
    print(response)

  except (json.JSONDecodeError, KeyError, TypeError) as e:
    # Best effort: one malformed answer (invalid JSON, missing
    # 'salary_schedule' key, or an empty response) must not abort the run
    # and discard the rows collected so far. Keep the raw response instead.
    print(f"{type(e).__name__}: {e}")
    output_list.append(response.response)

# Persist all rows (parsed dicts, or raw text for rows that failed to parse).
with open('example.json', 'w') as file:
  json.dump(output_list, file)




{
    "salary_schedule": {
        "pay_scale_area": 1.0,
        "base_salary": 61159.88,
        "qtea_addon": 4885.0,
        "fwea_addon": 3884.0,
        "total_annual_salary": 69928.88,
        "per_diem": 380.05
    }
}

{
    "salary_schedule": {
        "pay_scale_area": 2.0,
        "base_salary": 64764.94,
        "qtea_addon": 3724.0,
        "fwea_addon": 4112.0,
        "total_annual_salary": 72600.94,
        "per_diem": 394.57
    }
}

{
    "salary_schedule": {
        "pay_scale_area": 3.0,
        "base_salary": 68373.18,
        "qtea_addon": 3962.0,
        "fwea_addon": 4342.0,
        "total_annual_salary": 76677.18,
        "per_diem": 416.72
    }
}

{
    "salary_schedule": {
        "pay_scale_area": 4.0,
        "base_salary": 71980.36,
        "qtea_addon": 4022.0,
        "fwea_addon": 4571.0,
        "total_annual_salary": 80573.36,
        "per_diem": 437.9
    }
}

{
    "salary_schedule": {
        "pay_scale_area": 5.0,
        "base_salary": 75585.4