In [1]:
import os
import pandas as pd
import numpy as np

import chromadb

from langchain.text_splitter import RecursiveCharacterTextSplitter


# Imports needed for vector store with huggingface
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.document_loaders import TextLoader

# Imports needed for vector store/model predictions with distilbert
from transformers import DistilBertTokenizer, DistilBertModel
import torch

In [2]:
# Embedding function for Distilbert
tokenizer_distilbert = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model_distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')

def create_embedding_distilbert(text):
    # Tokenize input text
    inputs = tokenizer_distilbert(text, return_tensors='pt', padding=True, truncation=True)
    # Get model outputs
    with torch.no_grad():
        outputs = model_distilbert(**inputs)
    # Take the mean of the last hidden state to get the sentence embedding
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

In [3]:
client = chromadb.PersistentClient(path=".")
collection = client.get_or_create_collection("astro_hf")

In [17]:
# Make vector store statbel_huggingface with unnormalized huggingface embedding
# Load the text document
from langchain.vectorstores import Chroma

loader = TextLoader("input_data/astronomy.txt")
documents = loader.load()

# Split the text into chunks with LangChain's text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, 
                                               chunk_overlap=40,
                                               separators=["\n\n", "\n", ". ", "! ", "? "],
                                               keep_separator=True)
docs = text_splitter.split_documents(documents)

# Filter chunks to ensure they meet the minimum size requirement
min_chunk_size = 200
filtered_chunks = [chunk for chunk in docs if len(chunk.page_content) >= min_chunk_size]



hf_embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

recreate_vectorstore = 1
if recreate_vectorstore:
    vector_store = Chroma.from_documents(docs, hf_embedding_function)

# Check out some chunks created by RecursiveCharacterTextSplitter
for i, chunk in enumerate(docs):
    if i < 5:
        print(f"Chunk {i + 1}:")
        print(f"Length: {len(chunk.page_content)}")
        print(chunk.page_content)  # or chunk['page_content'] if it's in dictionary form
        print("-" * 50)

Chunk 1:
Length: 614
Astronomy is a natural science that studies celestial objects and the phenomena that occur in the cosmos. It uses mathematics, physics, and chemistry in order to explain their origin and their overall evolution. Objects of interest include planets, moons, stars, nebulae, galaxies, meteoroids, asteroids, and comets. Relevant phenomena include supernova explosions, gamma ray bursts, quasars, blazars, pulsars, and cosmic microwave background radiation. More generally, astronomy studies everything that originates beyond Earth's atmosphere. Cosmology is a branch of astronomy that studies the universe as a whole.
--------------------------------------------------
Chunk 2:
Length: 945
Astronomy is one of the oldest natural sciences. The early civilizations in recorded history made methodical observations of the night sky. These include the Egyptians, Babylonians, Greeks, Indians, Chinese, Maya, and many ancient indigenous peoples of the Americas. In the past, astronomy in

In [19]:
query = "What kind of phenomena can be explained with the Big Bang theory?"
results = vector_store.similarity_search(query, k=3)  # Retrieve top 3 matches

# Combine the retrieved documents to use as context
context = "\n\n".join(result.page_content for result in results)

for result in results:
    print(result.page_content)
    print("-" * 50)


The existence of the Earth's galaxy, the Milky Way, as its own group of stars was only proved in the 20th century, along with the existence of "external" galaxies. The observed recession of those galaxies led to the discovery of the expansion of the Universe.[42] Theoretical astronomy led to speculations on the existence of objects such as black holes and neutron stars, which have been used to explain such observed phenomena as quasars, pulsars, blazars, and radio galaxies. Physical cosmology made huge advances during the 20th century. In the early 1900s the model of the Big Bang theory was formulated, heavily evidenced by cosmic microwave background radiation, Hubble's law, and the cosmological abundances of elements
--------------------------------------------------
The existence of the Earth's galaxy, the Milky Way, as its own group of stars was only proved in the 20th century, along with the existence of "external" galaxies. The observed recession of those galaxies led to the disco

In [20]:
from transformers import pipeline
# Use some text-generation models and compare
#generator = pipeline("text-generation", model="distilgpt2")

query_w_context = f"Given the context: {context}, answer the question: {query}"


t5_generator = pipeline("text2text-generation", model="t5-small", device=0 if torch.cuda.is_available() else -1, truncation=True)
t5_output = t5_generator(query_w_context, max_length=500)

distilbert_generator = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad", max_length=500, num_return_sequences=1)
#gpt2_output = distilbert_generator(query, max_length=500, num_return_sequences=1)
result = distilbert_generator(question=query, context=context)


distilgpt2_generator = pipeline("text-generation", model="distilgpt2", device=0 if torch.cuda.is_available() else -1, truncation=True, pad_token_id=50256)
distilgpt2_output = distilgpt2_generator(query_w_context, max_length=500, num_return_sequences=1)



# Print only the generated text responses
print("\nT5 Output:\n")
print(t5_output[0]['generated_text'])
print("-" * 50)
print("\nDistilbert Output:\n")
print(result['answer'])
print("-" * 50)
print("\nDistilGPT-2 Output:\n")
print(distilgpt2_output[0]['generated_text'])


T5 Output:

quasars, pulsars, blazars, and radio galaxies
--------------------------------------------------

Distilbert Output:

observed phenomena
--------------------------------------------------

DistilGPT-2 Output:

Given the context: The existence of the Earth's galaxy, the Milky Way, as its own group of stars was only proved in the 20th century, along with the existence of "external" galaxies. The observed recession of those galaxies led to the discovery of the expansion of the Universe.[42] Theoretical astronomy led to speculations on the existence of objects such as black holes and neutron stars, which have been used to explain such observed phenomena as quasars, pulsars, blazars, and radio galaxies. Physical cosmology made huge advances during the 20th century. In the early 1900s the model of the Big Bang theory was formulated, heavily evidenced by cosmic microwave background radiation, Hubble's law, and the cosmological abundances of elements

The existence of the Earth's 