In [1]:
from typing import List
import pandas as pd
from operator import itemgetter
from langchain.chains.openai_tools import create_extraction_chain_pydantic
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
import os
from dotenv import load_dotenv
load_dotenv()
os.getenv("API_KEY")
import openai

In [2]:
llm = ChatOpenAI(model="gpt-3.5-turbo-1106", temperature=0)

In [3]:
from langchain_community.vectorstores import Chroma
from langchain_core.example_selectors import SemanticSimilarityExampleSelector
from langchain_openai import OpenAIEmbeddings

In [4]:
# Implementing dynamic few-shot prompt
examples = [
    {
        "input": "List the names of colleges in Massachusetts:",
        "query": "SELECT INSTNM FROM public.\"HD2022\" WHERE STABBR = 'MA';"
    },
    {
        "input": "List the colleges in Massachusetts that offer graduate programs:",
        "query": "SELECT INSTNM FROM public.\"HD2022\" WHERE STABBR = 'MA' AND GROFFER = 1;"
    },
    {
        "input": "List the colleges in Massachusetts with their longitude and latitude coordinates:",
        "query": "SELECT INSTNM, LONGITUD, LATITUDE FROM public.\"HD2022\" WHERE STABBR = 'MA';"
    },
    {
        "input": "List the colleges in Massachusetts that are located in urban areas (LOCALE = 11):",
        "query": "SELECT INSTNM FROM public.\"HD2022\" WHERE STABBR = 'MA' AND LOCALE = 11;"
    },
    {
        "input": "List the colleges in Massachusetts along with their control type (e.g., Public, Private Nonprofit, Private For-Profit):",
        "query": "SELECT INSTNM, CASE CONTROL WHEN 1 THEN 'Public' WHEN 2 THEN 'Private Nonprofit' WHEN 3 THEN 'Private For-Profit' END AS CONTROL_TYPE FROM public.\"HD2022\" WHERE STABBR = 'MA';"
    },
    {
        "input": "List the colleges in Massachusetts along with their median ACT score:",
        "query": "SELECT INSTNM, ACT FROM public.\"HD2022\" WHERE STABBR = 'MA';"
    },
    {
        "input": "List the colleges in Massachusetts along with their degree levels offered:",
        "query": "SELECT INSTNM, CASE WHEN HLOFFER = 1 THEN 'Certificate' WHEN UGOFFER = 1 THEN 'Associate' WHEN GROFFER = 1 THEN 'Bachelor' WHEN HDEGOFR1 = 1 THEN 'Graduate' ELSE 'Unknown' END AS DEGREE_LEVELS_OFFERED FROM public.\"HD2022\" WHERE STABBR = 'MA';"
    },
    {
        "input": "Count the number of colleges in Massachusetts:",
        "query": "SELECT COUNT(*) FROM public.\"HD2022\" WHERE STABBR = 'MA';"
    },
    {
        "input": "List the colleges in Massachusetts along with their city and address:",
        "query": "SELECT INSTNM, CITY, ADDR FROM public.\"HD2022\" WHERE STABBR = 'MA';"
    },
    {
        "input": "List the colleges in Massachusetts along with their websites:",
        "query": "SELECT INSTNM, WEBADDR FROM public.\"HD2022\" WHERE STABBR = 'MA';"
    },
    {
        "input": "List the colleges in Massachusetts offering undergraduate programs (ICLEVEL = 1):",
        "query": "SELECT INSTNM FROM public.\"HD2022\" WHERE STABBR = 'MA' AND ICLEVEL = 1;"
    },
    {
        "input": "List the colleges in Massachusetts in rural areas (LOCALE = 41 for rural areas according to the IPEDS locale codes):",
        "query": "SELECT INSTNM FROM public.\"HD2022\" WHERE STABBR = 'MA' AND LOCALE = 41;"
    },
]

In [5]:
def get_example_selector():
    example_selector = SemanticSimilarityExampleSelector.from_examples(
        examples,
        OpenAIEmbeddings(),
        Chroma,
        k=5,
        input_keys=["input"],
    )
    return example_selector

In [6]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder, FewShotChatMessagePromptTemplate, PromptTemplate

In [8]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain.memory import ChatMessageHistory
from langchain_community.tools.sql_database.tool import QuerySQLDataBaseTool
from langchain.chains import create_sql_query_chain
from langchain_community.utilities.sql_database import SQLDatabase
# from dotenv import load_dotenv

In [9]:
from langchain.prompts import ChatPromptTemplate

from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import JSONLoader
from langchain_community.embeddings import OpenAIEmbeddings

from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
# import chromadb

# import os
# from dotenv import load_dotenv
# load_dotenv()
openai_api_key = os.getenv("API_KEY")

In [10]:
def metadata_func(record: dict, metadata: dict) -> dict:
    def column_retriever(ls):
        cname = []
        dtype = []
        cdesc = []
        for i in range(len(ls)):
            cname.append(record.get("Columns")[i].get("Column_Name"))
            dtype.append(record.get("Columns")[i].get("Data_Type"))
            cdesc.append(record.get("Columns")[i].get("Column_Description"))
        return cname, dtype, cdesc
    cname, dtype, cdesc = column_retriever(record.get("Columns"))

    metadata["Table_Name"] = record.get("Table_Name")
    metadata["Table_Description"] = record.get("Table_Description")
    metadata["Column_Names"] = str(cname)
    metadata["Data_Type"] = str(dtype)
    metadata["Column_Description"] = str(cdesc)
    # metadata["share"] = record.get("share")
    return metadata


embedding_function = OpenAIEmbeddings(
    openai_api_key=openai_api_key, model="text-embedding-ada-002")


def get_retriever():
    loader = JSONLoader(
        file_path="/Users/omkarsadekar/Documents/NEU Study Material/NEU Study Material/Capstone/ipedsllm/Data/tableinfo.json",
        jq_schema=".[].Table_Info[]",
        content_key="Table_Name",
        metadata_func=metadata_func,
    )
    documents = loader.load()

    db = Chroma.from_documents(documents, embedding_function)
    retriever = db.as_retriever()
    return retriever


model = ChatOpenAI()


retriever = get_retriever()

In [38]:
def get_table_info(query: str):

    template = """Answer the question based only on the following context:
    {context}
    Search for the table descriptions in the context and accordingly search for column names and associated column description. Include all relevant tables and columns which can be used by the downstream Text-to-SQL Agent to create SQL Queries containing Join, Filtering and Sub Query. Remember that unitid acts as the Primary Key while using Join operations.  
    Search for any information performing the following tasks:
    1. Table Names
    2. Table Descriptions
    3. Column Names
    4. Column Descriptions

    Question: {question}
    """
    prompt = ChatPromptTemplate.from_template(template)

    model = ChatOpenAI()

    table_chain = (
        {"context": retriever, "question": RunnablePassthrough()}
        | prompt
        | model
        | StrOutputParser()
    )
    return table_chain.invoke(query)
query = "List of Institutes accepting secondary school GPA for getting admission in Undergrad program."




In [39]:
class Table(BaseModel):
    """Table in SQL database."""

    name: str = Field(description="Name of table in SQL database.")


def get_tables(tables: List[Table]) -> List[str]:
    tables = [table.name for table in tables]
    return tables


table_details = get_table_info(query)


In [40]:
print(table_details)

Based on the provided context, the relevant information for the downstream Text-to-SQL Agent to create SQL Queries containing Join, Filtering, and Sub Query is as follows:

1. Table Names:
- public."ADM2022"
- public."IC2022_CAMPUSES"

2. Table Descriptions:
- public."ADM2022": Contains information related to admissions for the year 2022.
- public."IC2022_CAMPUSES": Contains information about the campuses associated with the institutes.

3. Column Names:
- ADM.unitid
- IC.index
- IC.campusid
- IC.pcaddr
- IC.pccity
- ADM.admcon1

4. Column Descriptions:
- ADM.unitid: Primary Key in the "ADM2022" table, likely used for joining operations.
- IC.index: Likely used for joining operations with the unitid.
- IC.campusid: Identifier for the campus associated with the institute.
- IC.pcaddr: Physical address of the campus.
- IC.pccity: City where the campus is located.
- ADM.admcon1: Possibly a column indicating admission conditions, with a value of 1 indicating institutes accepting secondary 

In [41]:
table_info = table_details

In [42]:
example_prompt = ChatPromptTemplate.from_messages(
    [
        ("human", "{input}\nSQLQuery:"),
        ("ai", "{query}"),
    ]
)
few_shot_prompt = FewShotChatMessagePromptTemplate(
    example_prompt=example_prompt,
    example_selector=get_example_selector(),
    input_variables=["input", "top_k"],
)

final_prompt = ChatPromptTemplate.from_messages(
    [
        ("system",
         "You are a PostgreSQL expert. Given an input question, create a syntactically correct PostgreSQL query to run.\n\nHere is the relevant table info: {table_info}\n\nBelow are a number of examples of questions and their corresponding SQL queries.\n\nDO NOT MAKE ANY DML QUERIES SUCH AS INSERT, UPDATE, DELETE, OR ANY OTHER DDL QUERIES SUCH AS CREATE, ALTER, DROP"),
        few_shot_prompt,
        ("human", "{input}"),
    ]
)

answer_prompt = PromptTemplate.from_template(
    """Given the following user question, corresponding SQL query, and SQL result, answer the user question.

Question: {question}
SQL Query: {query}
SQL Result: {result}
Answer: """
)

In [43]:
# OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# LANGCHAIN_TRACING_V2 = os.getenv("LANGCHAIN_TRACING_V2")
# LANGCHAIN_API_KEY = os.getenv("LANGCHAIN_API_KEY")
db_url = os.getenv("DB_URL")



db = SQLDatabase.from_uri(db_url)
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
generate_query = create_sql_query_chain(llm, db, final_prompt)
execute_query = QuerySQLDataBaseTool(db=db)
rephrase_answer = answer_prompt | llm | StrOutputParser()
    # chain = generate_query | execute_query
chain = (
        RunnablePassthrough.assign(query=generate_query).assign(
            result=itemgetter("query") | execute_query
        )
        | rephrase_answer
    )

  



response = chain.invoke(
        {"question": query })
 

BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 16385 tokens. However, your messages resulted in 39676 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}

In [31]:
print(response)

The list of institutes in Massachusetts and their respective addresses can be found in the SQL result provided. Some examples include:
- Lowell Academy Hairstyling Institute at 136 Central St
- Hult International Business School at 1 Education Street
- American International College at 1000 State Street
- Amherst College at Boltwood Avenue
- Anna Maria College at 50 Sunset Lane

This list includes many more institutes and their addresses in Massachusetts.
