In [None]:
# Notebook metadata: author name and a date-style version tag.
__author__ = "Jon Ball"
__version__ = "October 2023"

In [1]:
from gpt_utils import (start_chat, user_turn, system_turn)
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from tqdm import tqdm
import chromadb
import jinja2
import torch
import random
import json

In [None]:
# Seed Python's and PyTorch's random number generators with the same value
# so that any stochastic steps in this notebook are repeatable.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

In [2]:
# Sentence-embedding model used to embed queries for the Chroma collections.
model_name = "Muennighoff/SGPT-125M-weightedmean-nli-bitfit"
# Run the encoder on CPU.
model_kwargs = dict(device="cpu")
# Ask the encoder to normalize the embeddings it returns.
encode_kwargs = dict(normalize_embeddings=True)
embedding = SentenceTransformerEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

In [3]:
# One on-disk Chroma client shared by both collections.
persistent_client = chromadb.PersistentClient(path="../chroma")
# Open the "manual" and "gpt4" collections (in that order) with the same
# embedding function.
manualDB, gpt4DB = (
    Chroma(
        client=persistent_client,
        collection_name=collection,
        embedding_function=embedding,
    )
    for collection in ("manual", "gpt4")
)

In [4]:
# Load the examples that GPT-4 will label (a mapping keyed by example id;
# each value is indexed by "input" further down the notebook).
with open("../prompts/examples/gpt4D.json") as infile:
    gpt4D = json.loads(infile.read())
print(f"{len(gpt4D)} examples loaded. GPT-4 will generate output labels for each example, to be used in few-shot prompting with CodeLLaMA.")

90 examples loaded. GPT-4 will generate output labels for each example, to be used in few-shot prompting with CodeLLaMA.


In [5]:
# Sanity check: show the first example and its two nearest neighbours from
# the "manual" collection.
# NOTE(review): `input` shadows the Python builtin; the name is kept here in
# case other cells rely on it.
input = list(gpt4D.values())[0]["input"]
print("Input:", input)
nearest = manualDB.similarity_search(input, 2)
for doc in nearest:
    meta = doc.metadata
    print(meta["input"], "\n", meta["output"].strip())

Input: {'id': 'EJ344485', 'title': 'The Impact of Financial and Cultural Resources on Educational Attainment in the Netherlands.', 'author': ['De Graaf, Paul M.'], 'description': "Using data from a 1977 survey conducted in the Netherlands, where education is almost completely free, this study shows that the degree to which parents' socioeconomic status influences educational attainment has disappeared since 1950. During the same time period, however, influence on educational attainment of access to cultural resources has increased. (Author/JDH)", 'subject': ['Cultural Opportunities', 'Educational Attainment', 'Foreign Countries', 'Higher Education', 'Social Class', 'Socioeconomic Influences'], 'publicationtype': ['Journal Articles', 'Reports - Research'], 'publicationdateyear': 1986, 'language': ['English'], 'peerreviewed': 'T'}
{"id": "EJ502246", "title": "Social Capital and the Reproduction of Inequality: Information Networks among Mexican-Origin High School Students.", "author": ["S

In [6]:
class jinjaLoader:
    """Load a single Jinja2 template from a directory and render it on demand."""

    def __init__(self, template_dir, template_jinja):
        """Build a file-system-backed Jinja2 environment and fetch one template.

        Args:
            template_dir: Directory searched for template files.
            template_jinja: Name of the template file to load from that directory.
        """
        loader = jinja2.FileSystemLoader(searchpath=template_dir)
        environment = jinja2.Environment(loader=loader)
        self.templateLoader = loader
        self.templateEnv = environment
        self.template = environment.get_template(template_jinja)

    def render(self, templateVars):
        """Return the loaded template rendered with the mapping ``templateVars``."""
        rendered = self.template.render(templateVars)
        return rendered

In [7]:
# Template renderer for the few-shot prompt stored in ../prompts/fewshot.prompt.
jinjitsu = jinjaLoader(template_dir="../prompts", template_jinja="fewshot.prompt")

In [8]:
# Label every example with GPT-4. For each one, the two most similar entries
# of the "manual" collection are rendered into the few-shot prompt as worked
# examples, followed by the example to be labelled.
outputs = []
for k, v in tqdm(gpt4D.items()):
    # Isolate the input to be labelled
    input3 = v["input"]
    # Pull the 2 docs most similar to THIS example. The query must be the
    # loop's own `input3`, not the notebook-level `input` preview variable,
    # otherwise every prompt would reuse the first example's neighbours.
    similar_docs = manualDB.similarity_search(input3, 2)
    first_shot = similar_docs[0].metadata
    second_shot = similar_docs[1].metadata
    # Load the template vars; "output3" is left empty for the model to fill in
    templateVars = {
        "input1": first_shot["input"],
        "output1": first_shot["output"],
        "input2": second_shot["input"],
        "output2": second_shot["output"],
        "input3": input3,
        "output3": "",
    }
    PROMPT = jinjitsu.render(templateVars)
    # Start the chat
    chat = start_chat("Research assistant tasked with labeling articles published in Sociology of Education")
    chat = user_turn(chat, PROMPT)
    # NOTE(review): system_turn presumably appends the model's reply as the
    # last message of the chat -- confirm in gpt_utils.
    chat = system_turn(chat)
    label = chat[-1]["content"]
    # Write the output. Re-assigning an existing key while iterating is safe
    # because the dict's size does not change.
    gpt4D[k] = {"input": input3, "output": label}
    outputs.append(label)

100%|██████████| 90/90 [12:07<00:00,  8.08s/it]


In [9]:
# List the ids of any examples whose generated output is empty/falsy.
unlabeled = [key for key, record in gpt4D.items() if not record["output"]]
for key in unlabeled:
    print(key)

In [10]:
# Save the labelled examples back to the file they were loaded from.
# Serialize BEFORE opening the file: mode "w" truncates it immediately, so a
# serialization error raised after opening would wipe the existing examples.
serialized = json.dumps(gpt4D)
with open("../prompts/examples/gpt4D.json", "w") as outfile:
    outfile.write(serialized)