In [1]:
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
import chromadb
import torch 
import random
import json
import time

In [None]:
# Seed Python's and PyTorch's random number generators with one shared value.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

In [2]:
# Embedding model configuration; the three settings stay as module-level names
# and are handed to the embedding wrapper unchanged.
model_name = "Muennighoff/SGPT-125M-weightedmean-nli-bitfit"
model_kwargs = dict(device="cpu")  # device setting: CPU
encode_kwargs = dict(normalize_embeddings=True)  # request normalized embeddings
embedding = SentenceTransformerEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

# Manually labeled examples for RAG

Load the manually labeled examples from `../prompts/examples/manualD.json`:

In [3]:
# Load the manually labeled examples. The file is decoded as UTF-8 explicitly so
# the result does not depend on the platform's default text encoding.
with open("../prompts/examples/manualD.json", "r", encoding="utf-8") as infile:
    manualD = json.load(infile)

In [4]:
# Walk the loaded examples in dict order: the "input" field (as a string) becomes
# the text to index, and the whole example record is kept as its metadata.
texts = [str(record["input"]) for record in manualD.values()]
metadatas = list(manualD.values())

In [5]:
# Sanity check: show the first text collected from the manually labeled examples.
print(texts[0])

{"id": "EJ353123", "title": "Social Class Differences in Family-School Relationships: The Importance of Cultural Capital.", "author": ["Lareau, Annette"], "description": "Summarizes a qualitative study of family/school relationships in White working class and middle class areas. Concludes that schools have standardized views of the proper role of parents in schooling. Suggests that the concept of cultural capital is useful to understand social class differences in children's school experiences. (Author/RKM)", "subject": ["Cultural Differences", "Educational Research", "Elementary Education", "Family (Sociological Unit)", "Family School Relationship", "Middle Class", "Parent School Relationship", "Social Attitudes", "Social Class", "Social Differences", "Social Science Research", "Social Sciences", "Sociology"], "publicationtype": ["Journal Articles", "Reports - Research"], "publicationdateyear": 1987, "language": ["English"], "peerreviewed": "T"}


In [6]:
%%time
# Index the manually labeled examples in the "manual" collection stored under
# ../chroma. Stable ids (the JSON keys, in the same order `texts` was built from)
# are passed instead of relying on auto-generated ones, so re-running this cell
# does not append a second copy of every document to the persisted collection.
# NOTE(review): assumes the installed langchain/chromadb versions upsert (or at
# least refuse to duplicate) entries whose id already exists - confirm. A store
# created before this change still holds the old auto-id entries; clear it once.
vectordb = Chroma.from_texts(
    texts=texts,
    collection_name="manual",
    embedding=embedding,
    metadatas=metadatas,
    ids=list(manualD),
    persist_directory="../chroma")

CPU times: user 607 ms, sys: 306 ms, total: 913 ms
Wall time: 434 ms


# GPT-4 labeled examples for RAG

In [7]:
# Load the GPT-4 labeled examples. The file is decoded as UTF-8 explicitly so
# the result does not depend on the platform's default text encoding.
with open("../prompts/examples/gpt4D.json", "r", encoding="utf-8") as infile:
    gpt4D = json.load(infile)

In [8]:
# Walk the loaded examples in dict order: the "input" field (as a string) becomes
# the text to index, and the whole example record is kept as its metadata.
texts = [str(record["input"]) for record in gpt4D.values()]
metadatas = list(gpt4D.values())

In [9]:
# Sanity check: show the first collected text and the metadata record paired with it.
print(texts[0])
print(metadatas[0])

{"id": "EJ344485", "title": "The Impact of Financial and Cultural Resources on Educational Attainment in the Netherlands.", "author": ["De Graaf, Paul M."], "description": "Using data from a 1977 survey conducted in the Netherlands, where education is almost completely free, this study shows that the degree to which parents' socioeconomic status influences educational attainment has disappeared since 1950. During the same time period, however, influence on educational attainment of access to cultural resources has increased. (Author/JDH)", "subject": ["Cultural Opportunities", "Educational Attainment", "Foreign Countries", "Higher Education", "Social Class", "Socioeconomic Influences"], "publicationtype": ["Journal Articles", "Reports - Research"], "publicationdateyear": 1986, "language": ["English"], "peerreviewed": "T"}
{'input': '{"id": "EJ344485", "title": "The Impact of Financial and Cultural Resources on Educational Attainment in the Netherlands.", "author": ["De Graaf, Paul M."]

In [10]:
%%time
# Index the GPT-4 labeled examples in the "gpt4" collection stored under
# ../chroma. Stable ids (the JSON keys, in the same order `texts` was built from)
# are passed instead of relying on auto-generated ones, so re-running this cell
# does not append a second copy of every document to the persisted collection.
# NOTE(review): assumes the installed langchain/chromadb versions upsert (or at
# least refuse to duplicate) entries whose id already exists - confirm. A store
# created before this change still holds the old auto-id entries; clear it once.
vectordb = Chroma.from_texts(
    texts=texts,
    collection_name="gpt4",
    embedding=embedding,
    metadatas=metadatas,
    ids=list(gpt4D),
    persist_directory="../chroma")

CPU times: user 4.99 s, sys: 1.68 s, total: 6.67 s
Wall time: 1.89 s
