<a href="https://colab.research.google.com/github/jermwatt/langchain-scraper/blob/main/scraping_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
from dotenv import load_dotenv
from typing import Dict
from supabase import create_client, Client
# import transformers

# # import logging
# logging.getLogger("transformers").setLevel(logging.ERROR)
# logging.getLogger('sentence_transformers').setLevel(logging.ERROR)


def connect_to_supabase():
    """Create a Supabase client from credentials held in the environment.

    Entries of a local ``.env`` file are merged into ``os.environ`` first;
    ``SUPABASE_URL`` and ``SUPABASE_KEY`` are then looked up and handed to
    ``create_client``.  A variable that is not set is passed on as ``None``.

    Returns:
        The client object produced by ``create_client``.
    """
    load_dotenv()

    env = os.environ
    return create_client(env.get("SUPABASE_URL"), env.get("SUPABASE_KEY"))

def setup_database():
    """Return a Supabase client for the rest of the notebook.

    Thin wrapper around ``connect_to_supabase``: the client is returned
    directly (nothing is yielded).
    """
    return connect_to_supabase()
    
def get_investigator_data(supabase: Client) -> list:
    """Fetch all rows of the ``upitt_investigators`` table.

    The query selects every column and applies no filter (in particular,
    no ``org_name`` filter).

    Args:
        supabase: Connected Supabase client.

    Returns:
        The row collection carried by the query response (``response.data``);
        each row is presumably a dict keyed by column name.
    """
    response = supabase.table('upitt_investigators').select("*").execute()

    # Read the payload by attribute rather than unpacking the response and
    # indexing into the first unpacked item, which depends on the order in
    # which the response object yields its fields.
    return response.data

def insert_investigator_metadata_datapoint(supabase: Client, datapoint: Dict):
    """Insert one record into the ``investigator_metadata`` table.

    Args:
        supabase: Connected Supabase client.
        datapoint: Column/value mapping for the row to insert.

    Returns:
        The first of the two items obtained by unpacking the executed
        query's response (the second item is discarded).
    """
    insert_query = supabase.table('investigator_metadata').insert(datapoint)
    first_item, _second_item = insert_query.execute()
    return first_item

In [2]:
import os
import pandas as pd
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI, AI21
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from dotenv import load_dotenv

def load_api_keys():
    """Load API keys from the ``.env`` file and verify they are present.

    ``load_dotenv`` merges the ``.env`` entries into ``os.environ``; the
    function then checks that ``OPENAI_API_KEY`` and ``SERPAPI_API_KEY`` are
    set.  Nothing is returned -- presumably the libraries that need the keys
    read them from the environment themselves.

    Raises:
        KeyError: If a required variable is not set; the message lists every
            missing name.
    """
    load_dotenv()

    # AI21_API_KEY is not required here; it was only referenced by the
    # disabled AI21 alternative.
    required = ("OPENAI_API_KEY", "SERPAPI_API_KEY")
    missing = [name for name in required if name not in os.environ]
    if missing:
        raise KeyError(
            "missing required environment variable(s): " + ", ".join(missing)
        )

def set_llm():
    """Build the language model used by the chain.

    Returns:
        An ``OpenAI`` LLM constructed with ``temperature=0`` (the
        deterministic setting).
    """
    # alternative backend, currently disabled: AI21(temperature=0)
    return OpenAI(temperature=0)

# fields the LLM is asked to return, as (name, description) pairs
response_schemas = [
    ResponseSchema(name=field_name, description=field_description)
    for field_name, field_description in (
        ("first_name", "first name of investigator"),
        ("last_name", "last name of investigator"),
        ("org_name", "name of organization"),
        ("email", "email address of investigator"),
        ("position", "position of investigator"),
        ("department", "department of investigator"),
    )
]

# parser for the structured reply, plus the formatting instructions that
# get tacked onto the prompt
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()

# prompt: the formatting instructions are pre-filled, the investigator's
# name and organization are supplied on each call
prompt_one = PromptTemplate(
    template=(
        "answer the users question as best as possible.\n"
        "{format_instructions}\n"
        "what is the email address and academic department of "
        "{first_name} {last_name} from {org_name}?"
        "  are they an assistant professor, associate professor, "
        "full professor, or none of these?"
    ),
    input_variables=['first_name', 'last_name', 'org_name'],
    partial_variables={'format_instructions': format_instructions},
)

# API keys are loaded first, then the LLM is built
load_api_keys()
llm = set_llm()

# chain that feeds the prompt to the LLM
parser_chain = LLMChain(prompt=prompt_one, llm=llm)

def run_parser_chain(parser_chain,first_name,last_name,org_name):
    """Ask the chain about one investigator and parse its reply.

    Args:
        parser_chain: Chain whose ``run`` accepts ``first_name``,
            ``last_name`` and ``org_name`` keyword arguments.
        first_name: Investigator's first name.
        last_name: Investigator's last name.
        org_name: Investigator's organization.

    Returns:
        The chain's raw answer after parsing by the module-level
        ``output_parser``.
    """
    prompt_inputs = {
        "first_name": first_name,
        "last_name": last_name,
        "org_name": org_name,
    }
    raw_answer = parser_chain.run(**prompt_inputs)
    return output_parser.parse(raw_answer)

In [3]:
# connect to the database, then pull the investigator rows to process
supabase = setup_database()
data = get_investigator_data(supabase=supabase)

In [6]:
# run chain - gather email, department, and position for each investigator
final_datapoints = []
max_count = 5


def _normalize_parsed_answer(parsed_answer, row):
    """Reshape one parsed LLM answer, in place, into an ``investigator_metadata`` row.

    Args:
        parsed_answer: Dict produced by ``run_parser_chain``; modified in place.
        row: Source investigator row supplying the names, id and org_name.

    Returns:
        The same ``parsed_answer`` dict, for convenience.
    """
    # The stored names and id always come from the source row; the LLM's own
    # echo of the names is dropped (and tolerated if absent).
    parsed_answer.pop("first_name", None)
    parsed_answer.pop("last_name", None)
    parsed_answer['investigator_first_name'] = row['investigator_first_name']
    parsed_answer['investigator_last_name'] = row['investigator_last_name']
    parsed_answer['investigator_id'] = row['investigator_id']

    # upper org_name; fall back to the source row's value when the answer
    # carries none, instead of failing on a missing/None value
    answered_org = parsed_answer.get('org_name') or row['org_name']
    if isinstance(answered_org, str):
        answered_org = answered_org.upper()
    parsed_answer['org_name'] = answered_org

    # The prompt offers "none of these" in lower case, so the sentinel is
    # matched case-insensitively before being stored as NULL.
    position = parsed_answer.get('position')
    if isinstance(position, str) and position.strip().lower() == "none of these":
        parsed_answer['position'] = None

    return parsed_answer


for ind,row in enumerate(data):
    first_name= row['investigator_first_name']
    last_name= row['investigator_last_name']
    investigator_id = row['investigator_id']
    org_name= row['org_name']

    try:
        # run chain
        parsed_answer = run_parser_chain(parser_chain,first_name, last_name, org_name)

        # attach names/id, normalize org_name and position
        _normalize_parsed_answer(parsed_answer, row)

        # save to supabase
        d = insert_investigator_metadata_datapoint(supabase, parsed_answer)
    except Exception as e:
        # Best effort: one failing investigator must not stop the batch.
        # NOTE(review): an insert for an investigator_id already present in
        # investigator_metadata fails with a unique-constraint error
        # (code 23505) and ends up here, after the LLM call was already made.
        print(e)

    # print progress
    print(f"Finished {ind+1} of {len(data)}")

    # break if max count reached
    # if ind >= max_count:
    #     print("Max count reached. Breaking loop.")
    #     break

{'code': '23505', 'details': 'Key (investigator_id)=(1857725) already exists.', 'hint': None, 'message': 'duplicate key value violates unique constraint "investigator_metadata_pkey"'}
Finished 1 of 1000
{'code': '23505', 'details': 'Key (investigator_id)=(1857848) already exists.', 'hint': None, 'message': 'duplicate key value violates unique constraint "investigator_metadata_pkey"'}
Finished 2 of 1000
{'code': '23505', 'details': 'Key (investigator_id)=(1857985) already exists.', 'hint': None, 'message': 'duplicate key value violates unique constraint "investigator_metadata_pkey"'}
Finished 3 of 1000
{'code': '23505', 'details': 'Key (investigator_id)=(1858194) already exists.', 'hint': None, 'message': 'duplicate key value violates unique constraint "investigator_metadata_pkey"'}
Finished 4 of 1000
{'code': '23505', 'details': 'Key (investigator_id)=(1858669) already exists.', 'hint': None, 'message': 'duplicate key value violates unique constraint "investigator_metadata_pkey"'}
Fini