In [2]:
import os
import pandas as pd
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI, AI21
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from dotenv import load_dotenv

# Load API keys from .env file
def load_api_keys():
    # Load variables from .env file into os.environ
    load_dotenv()

    # Get variables from os.environ
    key = os.environ["OPENAI_API_KEY"]
    key = os.environ["AI21_API_KEY"]
    key = os.environ["SERPAPI_API_KEY"]

# Set LLM
def set_llm():
    # load llm with deterministic setting
    llm = OpenAI(temperature=0)
    # llm = AI21(temperature=0)

    return llm

# response schema for the output parser
response_schemas = [
    ResponseSchema(name="first_name", description="first name of investigator"),
    ResponseSchema(name="last_name", description="last name of investigator"),
    ResponseSchema(name="org_name", description="name of organization"),
    ResponseSchema(name="email", description="email address of investigator"),
    ResponseSchema(name="position", description="position of investigator"),
]

# generate formatting instructions to tack onto the prompt
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()

# construct prompt
prompt_one = PromptTemplate(
    input_variables=['first_name','last_name', 'org_name'],
    partial_variables={'format_instructions': format_instructions},
    template="answer the users question as best as possible.\n{format_instructions}\nwhat is the email address and academic department of {first_name} {last_name} from {org_name}?  are they an assistant professor, associate professor, full professor, or none of these?",
)

# load in API keys
load_api_keys()

# construct llm
llm = set_llm()

# construct chain
parser_chain = LLMChain(llm=llm, prompt=prompt_one)

def run_parser_chain(parser_chain,first_name, last_name, org_name):
    # run chain
    answer = parser_chain.run(first_name=first_name, last_name=last_name, org_name=org_name)

    # parse output
    parsed_answer = output_parser.parse(answer)
    return parsed_answer

In [3]:
# load load in sample
data = pd.read_csv("datasets/upitt_data.csv",nrows=10).to_dict('records')

# run chain - gather email, department, and position for each investigator
final_datapoints = []
max_count = 2
for ind,row in enumerate(data):
    first_name= row['investigator_first_name']
    last_name= row['investigator_last_name']
    org_name= row['org_name']

    try:
        parsed_answer = run_parser_chain(parser_chain,first_name, last_name, org_name)
        final_datapoints.append(parsed_answer)
    except Exception as e:
        print(e)
        pass

    # print progress
    print(f"Finished {ind+1} of {len(data)}")

    # break if max count reached
    if ind >= max_count:
        print("Max count reached. Breaking loop.")
        break

Finished 1 of 10
Finished 2 of 10
Finished 3 of 10
Max count reached. Breaking loop.


In [4]:
final_datapoints

[{'first_name': 'Paula',
  'last_name': 'Monaghan-Nichols',
  'org_name': 'University of Pittsburgh at Pittsburgh',
  'email': 'paulamn@pitt.edu',
  'position': 'Assistant Professor'},
 {'first_name': 'Aaron',
  'last_name': 'Barchowsky',
  'org_name': 'University of Pittsburgh at Pittsburgh',
  'email': 'abarchow@pitt.edu',
  'position': 'Assistant Professor'},
 {'first_name': 'Aaron',
  'last_name': 'Batista',
  'org_name': 'University of Pittsburgh at Pittsburgh',
  'email': 'abatista@pitt.edu',
  'position': 'Assistant Professor'}]