In [2]:
import os
from dotenv import load_dotenv

load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")

In [11]:
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_openai.chat_models import ChatOpenAI
from langchain_chroma import Chroma
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import (
    RunnableSequence,
    RunnableParallel,
    RunnablePassthrough,
)
from langchain_core.output_parsers import StrOutputParser

In [4]:
embeddings = OpenAIEmbeddings(openai_api_key=api_key, model="text-embedding-ada-002")

vectorstore = Chroma(
    persist_directory="./chroma_data_science_course",
    embedding_function=embeddings,
    collection_name="data_science_course",
)

In [5]:
len(vectorstore.get()["documents"])

21

In [6]:
retriever = vectorstore.as_retriever(
    search_type="mmr", search_kwargs={"k": 3, "lambda_mult": 0.7}
)

In [7]:
TEMPLATE = """"
Answer the following question:
{question}

To answer the question, use the following context:
{context}

At the end of the response, specify the name of the lecture this context is taken from in the format:
Resources: *Lecture title*
where *Lecture title* should be substituted with the actual lecture title.
"""

In [8]:
promp_template = PromptTemplate(
    template=TEMPLATE,
    input_variables=["question", "context"],
)

In [9]:
chat = ChatOpenAI(
    openai_api_key=api_key,
    model="gpt-4.1-mini",
    temperature=0,
    seed=42,
    max_completion_tokens=250,
)

In [10]:
question = "What software do data scientists use?"

In [29]:
chain = (
    {
        "context": retriever,
        "question": RunnablePassthrough(),
    }
    | promp_template
    | chat
    | StrOutputParser()
)

In [30]:
res = chain.invoke(question)

In [31]:
res

'Data scientists commonly use R and Python as their primary software tools. These programming languages are highly popular because they can manipulate data effectively and are integrated within multiple data and data science software platforms. They are versatile, capable of handling not only mathematical and statistical computations but also a wide variety of business and data-related problems from start to finish.\n\nIn addition to R and Python, data scientists also use software frameworks and tools designed to handle big data and business intelligence. For example, Hadoop is a software framework that distributes computational tasks across multiple computers to manage the complexity and computational intensity of big data. For business intelligence and data visualization, tools like Power BI, SAS, Qlik, and especially Tableau are widely used.\n\nResources: Programming Languages & Software Employed in Data Science - All the Tools You Need'

In [28]:
print(res.content)

Data scientists commonly use software and programming languages such as R and Python, which are highly popular due to their ability to manipulate data and integrate with multiple data science platforms. These tools are versatile and can handle a wide range of business and data-related problems from start to finish. Additionally, for handling big data, frameworks like Hadoop are used to distribute computational tasks across multiple computers. For business intelligence and data visualization, software such as Power BI, SAS, Qlik, and especially Tableau are widely employed.

Resources: Programming Languages & Software Employed in Data Science - All the Tools You Need


In [42]:
response = chain.stream(question)

In [39]:
response

<generator object RunnableSequence.stream at 0x12d7123e0>

In [43]:
for i in response:
    print(i, end="")

Data scientists commonly use R and Python as their primary software tools. These programming languages are highly popular because they can manipulate data effectively and are integrated within multiple data and data science software platforms. They are versatile, capable of handling not only mathematical and statistical computations but also a wide variety of business and data-related problems from start to finish.

In addition to R and Python, data scientists also use software frameworks and tools designed for handling big data and business intelligence. For example, Hadoop is a framework that distributes computational tasks across multiple computers to manage big data efficiently. For business intelligence and data visualization, tools like Power BI, SAS, Qlik, and especially Tableau are widely used.

Resources: Programming Languages & Software Employed in Data Science - All the Tools You Need