In [None]:
# Load the MongoDB connection settings (and the Groq API key) from .env, then open the review-embeddings collection
from langchain_mongodb import MongoDBAtlasVectorSearch
from pymongo import MongoClient
from pymongo.server_api import ServerApi
from dotenv import load_dotenv
import os

from urllib.parse import quote_plus

# Read key/value pairs from the local .env file into the process environment.
load_dotenv()

GROQ_API_KEY = os.getenv("GROQ_API_KEY")
MONGO_USER = os.getenv("MONGO_USER")
MONGO_PASSWORD = os.getenv("MONGO_PASSWORD")
MONGO_CLUSTER = os.getenv("MONGO_CLUSTER")
MONGO_DATABASE = os.getenv("MONGO_DATABASE")

# Fail fast with a readable message: an unset variable would otherwise be
# interpolated into the URI as the literal string "None".
_missing_mongo_vars = [
    name
    for name, value in (
        ("MONGO_USER", MONGO_USER),
        ("MONGO_PASSWORD", MONGO_PASSWORD),
        ("MONGO_CLUSTER", MONGO_CLUSTER),
        ("MONGO_DATABASE", MONGO_DATABASE),
    )
    if not value
]
if _missing_mongo_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_mongo_vars)
    )

# Credentials must be percent-escaped before being placed in a MongoDB URI;
# otherwise characters such as '@', ':', '/' or '%' corrupt the URI.
# NOTE: the .env file is expected to hold the raw (unescaped) values.
uri = (
    f"mongodb+srv://{quote_plus(MONGO_USER)}:{quote_plus(MONGO_PASSWORD)}"
    f"@{MONGO_CLUSTER}.igv4jfg.mongodb.net/"
    f"?retryWrites=true&w=majority&appName={MONGO_CLUSTER}"
)

client = MongoClient(uri, server_api=ServerApi('1'))
db = client.get_database(MONGO_DATABASE)

# Collection used for the review embeddings.
rev_embeddings_collection = db["review_embeddings"]

In [None]:
# Split the internship markdown by header and collect each company's sections into a clean dictionary for later embedding
from langchain.chat_models import init_chat_model
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.document_loaders import TextLoader
from langchain_text_splitters import MarkdownHeaderTextSplitter

# Load the raw markdown file; the loader result is indexed as a list below.
loader = TextLoader("docs/fake_internships_info.md")
docs = loader.load()

# Header markers paired with the name given to each level.
header_levels = [
    ("####", "company"),
    ("###", "overview"),
    ("##", "comments"),
    ("#", "sources"),
]
splitter = MarkdownHeaderTextSplitter(headers_to_split_on=header_levels)

# Only the first loaded document is split into sections.
raw_markdown = docs[0].page_content
split_docs = splitter.split_text(raw_markdown)

# Chat model requested from the "groq" provider, plus the embedding model.
llm = init_chat_model(model="llama3-8b-8192", model_provider="groq")
embeddings = HuggingFaceEmbeddings(model="sentence-transformers/all-MiniLM-L6-v2")

def return_embeddings(string: str) -> list[float]:
    """Embed ``string`` with the module-level ``embeddings`` model.

    Args:
        string: Text to embed.

    Returns:
        The result of ``embeddings.embed_query(string)``.
    """
    return embeddings.embed_query(string)

# Build one dictionary per company from the header-split sections.
# `data` collects the finished records; `info` is the record being filled.
SOURCES_PREFIX = "Sources from Reddit:"

data = []
info = {}

for content in split_docs:
    text = content.page_content.strip()

    if text.startswith("Company:"):
        # A new company begins: flush the previous record (if any) first.
        # NOTE: sections seen before the first "Company:" are written into the
        # still-empty dict, so such a record would lack the "company" key.
        if info:
            data.append(info)
        info = {"company": text, "overview": "", "comments": "", "sources": []}

    elif text.startswith("Overview:"):
        info["overview"] = text

    elif text.startswith("Comments:"):
        info["comments"] = text

    elif text.startswith(SOURCES_PREFIX):
        # Drop only the leading label (whatever whitespace follows it), then
        # trim every entry and discard empty ones. The previous
        # replace("Sources from Reddit: ", "") kept the label whenever it was
        # not followed by exactly one space and left padded entries behind.
        raw_sources = text.removeprefix(SOURCES_PREFIX)
        info["sources"] = [
            source.strip() for source in raw_sources.split(",") if source.strip()
        ]

# Append the last collected company info
if info:
    data.append(info)
    

