In [2]:
from dotenv import load_dotenv
import os
# Load settings from the .env file one directory up; the printed value is
# whatever load_dotenv returns.
print(load_dotenv('../.env'))
# Indexing (not .get) so a missing LANGSMITH_PROJECT fails loudly here.
print(os.environ['LANGSMITH_PROJECT'])
# Set both process-level variables in one call.
os.environ.update({
    'LANGSMITH_TRACING': "true",
    'USER_AGENT': 'myagent',
})

True
agentic-ops


# Creating vector store for RAG with HuggingFace questions

In [3]:
from langchain_openai import OpenAIEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

def preprocess_dataset(docs_list, chunk_size=700, chunk_overlap=50):
    """Split documents into chunks with a tiktoken-based recursive splitter.

    Args:
        docs_list: Documents to split; handed unchanged to
            ``split_documents``.
        chunk_size: ``chunk_size`` forwarded to the splitter. Defaults to 700,
            the value this notebook was originally written with.
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter.
            Defaults to 50.

    Returns:
        Whatever ``split_documents`` returns for ``docs_list``.
    """
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # NOTE(review): empty tuple presumably stops the encoder from rejecting
        # text that contains special-token strings -- confirm against tiktoken.
        disallowed_special=(),
    )
    return text_splitter.split_documents(docs_list)
# Dataset card: https://huggingface.co/datasets/m-ric/transformers_documentation_en
transformers_doc = HuggingFaceDatasetLoader(
    path="m-ric/transformers_documentation_en",
    page_content_column="text",
)
# Only the first 50 loaded documents are chunked and indexed.
docs = preprocess_dataset(transformers_doc.load()[:50])

# Embed the chunks and index them in an in-memory Qdrant collection; the
# embedding model name comes from the EMBEDDING_MODEL environment variable.
vectorstore = QdrantVectorStore.from_documents(
    documents=docs,
    embedding=OpenAIEmbeddings(model=os.environ["EMBEDDING_MODEL"]),
    location=":memory:",
    collection_name="documentations",
)
retriever = vectorstore.as_retriever()

In [4]:
from langsmith import Client

# (question, reference answer) pairs; they are uploaded below as dataset examples.
example_inputs = [
    ("How do I upload a model to the Hugging Face Hub?", "To upload a model to the Hugging Face Hub, first install the huggingface_hub library with `pip install huggingface_hub`. Then, use the `login` function with your auth token: `huggingface_hub.login()`. Next, create a model repository with `create_repo()` and use `push_to_hub()` from the appropriate model class. For Transformers models, most model classes have the `push_to_hub()` method built-in. For custom models, you can use `huggingface_hub.upload_file()` or `huggingface_hub.upload_folder()` to upload your model files."),

    ("What is the Hugging Face Hub?", "The Hugging Face Hub is a platform for sharing and discovering machine learning models, datasets, and demo applications. It serves as a central repository where users can freely access thousands of pre-trained models across various domains like NLP, computer vision, and audio processing. The Hub enables version control for machine learning assets, provides model documentation through model cards, offers a collaborative environment for the AI community, and integrates seamlessly with popular ML frameworks like PyTorch and TensorFlow."),

    ("How do I fine-tune a pre-trained model from Hugging Face?", "To fine-tune a pre-trained model from Hugging Face, first install the transformers library with `pip install transformers`. Next, load the pre-trained model and tokenizer using `from_pretrained()`. Prepare your dataset and process it with the tokenizer. Then, initialize a `Trainer` object with your model, training arguments, and datasets. Finally, call `trainer.train()` to start fine-tuning. You can also use the `SFTTrainer` from TRL library for more specialized supervised fine-tuning, especially for language models. After training, save your model with `save_pretrained()` or push it to the Hub with `push_to_hub()`."),

    # The Accelerate library's entry-point class is `Accelerator`, not `Accelerate`.
    ("How do I use Accelerate for distributed training with Hugging Face?", "To use Accelerate for distributed training with Hugging Face, start by installing it with `pip install accelerate`. Initialize your model, optimizer, and dataloaders normally. Wrap them with Accelerate by creating an accelerator object (`accelerator = Accelerator()`) and then using `model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(model, optimizer, train_dataloader, eval_dataloader)`. Write your training loop as usual, but use `accelerator.backward(loss)` instead of `loss.backward()`. Accelerate automatically handles device placement, gradient synchronization, and mixed precision training across multiple GPUs or TPUs. You can launch distributed training with the `accelerate launch` command."),

    ("What is the difference between Tokenizers and Transformers libraries?", "The Tokenizers library is specialized for fast text tokenization, focusing on performance and customization of the tokenization process. It provides implementations of popular tokenization algorithms with a Rust backend for speed. The Transformers library, on the other hand, is a comprehensive framework for using pre-trained transformer models, including loading, fine-tuning, and inference with models like BERT, GPT, T5, etc. While Transformers includes tokenization capabilities, the dedicated Tokenizers library offers more advanced features like parallel processing, memory mapping, and customizable components (normalizers, pre-tokenizers, etc.) for building specialized tokenization pipelines."),

    ("How do I create a custom dataset with the Datasets library?", "To create a custom dataset with the Hugging Face Datasets library, you have several options. For local files, use `load_dataset('csv', data_files='path/to/file.csv')` or similar functions based on your file format. For custom data structures, create a dictionary of lists with `Dataset.from_dict({'text': [...], 'label': [...]})`. For more complex scenarios, define a generator function that yields examples and use `Dataset.from_generator(generator_function)`. You can also create a dataset script by defining `_info()`, `_split_generators()`, and `_generate_examples()` functions, and then placing it in the Datasets library's scripts directory or passing it directly to `load_dataset()`. After creation, you can save your dataset locally with `dataset.save_to_disk()` or share it on the Hub with `dataset.push_to_hub()`."),

    ("How do I enable mixed precision training with Hugging Face?", "To enable mixed precision training with Hugging Face, you have multiple options. With the Trainer API, simply set `fp16=True` in your TrainingArguments: `training_args = TrainingArguments(fp16=True, ...)`. If using Accelerate, initialize it with `accelerator = Accelerator(mixed_precision='fp16')` or set the configuration via `accelerate config`. For manual training loops with PyTorch, you can use `torch.cuda.amp.autocast()` context manager around your forward pass and scale gradients with `torch.cuda.amp.GradScaler()`. Alternatively, for even easier setup with any training loop, use the `deepspeed` integration by specifying a DeepSpeed config file in your TrainingArguments or Accelerate setup."),

    ("What is a model card and how do I create one?", "A model card is a documentation file providing essential information about a machine learning model, including its intended use, limitations, training data, evaluation results, and ethical considerations. To create a model card on Hugging Face Hub, create a README.md file in your model repository with structured information following the model card template. Include sections like Model Description, Intended Uses, Training Data, Evaluation Results, Limitations, and Bias and Fairness considerations. You can use the `modelcards` library with `pip install modelcards` to programmatically create model cards using Python. The card should help users understand when and how to use your model appropriately, along with its performance characteristics and potential risks."),

    ("How do I use PEFT for efficient fine-tuning?", "To use PEFT (Parameter-Efficient Fine-Tuning) with Hugging Face, first install the library with `pip install peft`. Load your pre-trained model with `model = AutoModelForCausalLM.from_pretrained(\"model_name\")`. Configure your PEFT method by creating a configuration, such as `config = LoraConfig(...)` for LoRA fine-tuning. Wrap your model with the PEFT wrapper using `model = get_peft_model(model, config)`. Then proceed with training as usual, either with the Trainer API or your custom training loop. PEFT supports multiple efficient fine-tuning methods including LoRA, QLoRA, Prefix Tuning, P-Tuning, and Prompt Tuning, allowing you to fine-tune large models with significantly reduced memory requirements by training only a small subset of parameters."),

    ("How do I quantize a model with bitsandbytes in Hugging Face?", "To quantize a model with bitsandbytes in Hugging Face, first install the bitsandbytes library with `pip install bitsandbytes`. Then, when loading your model with the `from_pretrained()` method, specify the quantization parameters by adding `load_in_8bit=True` or `load_in_4bit=True` arguments, along with other options like `llm_int8_threshold` or `bnb_4bit_compute_dtype`. For example: `model = AutoModelForCausalLM.from_pretrained(\"model_name\", load_in_4bit=True, bnb_4bit_quant_type=\"nf4\")`. This allows loading large models with significantly reduced memory usage. For more advanced quantization configurations, you can use `BitsAndBytesConfig` to specify detailed parameters like: `model = AutoModelForCausalLM.from_pretrained(\"model_name\", quantization_config=BitsAndBytesConfig(load_in_4bit=True))`."),

    ("How do I create a Gradio demo for my Hugging Face model?", "To create a Gradio demo for your Hugging Face model, first install Gradio with `pip install gradio`. Load your model and tokenizer with the Transformers library. Define a prediction function that takes user inputs, processes them with your model, and returns the results. Then use Gradio's interface creation functions to build your UI, connecting your prediction function to appropriate input and output components. For example, `demo = gr.Interface(fn=predict, inputs=\"text\", outputs=\"text\")`. Finally, launch your demo with `demo.launch()`. You can customize the interface with various input types (text, image, audio), output visualizations, examples, and styling options. To deploy on Hugging Face Spaces, create a Space on the Hub, add your demo code in an app.py file, and include requirements.txt with necessary dependencies."),
]


## TODO: create the correct keys and get the dataset ID from the UI

In [5]:
client = Client()
dataset_id = "e022554a-73eb-45cb-a357-c9ebc90c5901"

# Split each (question, reference answer) pair into the two parallel lists
# of dicts that the bulk-creation call below receives.
inputs = []
outputs = []
for input_prompt, output_answer in example_inputs:
    inputs.append({"question": input_prompt})
    outputs.append({"output": output_answer})

# Kept as the cell's final expression so the creation result is displayed.
client.create_examples(
  inputs=inputs,
  outputs=outputs,
  dataset_id=dataset_id,
)

{'example_ids': ['a954e542-2241-441a-b1b6-1279be49f35d',
  '4d63d801-e8ef-49d4-9dd6-bf2a0ec71620',
  'd8ee81da-d760-4c62-a409-ea3eb0de6712',
  '9d61716c-75f3-4d90-b5ae-aebfa7963e73',
  '3f5d1f82-4f06-41ca-be0e-878d8b4c8cd1',
  '89fbafec-0742-4097-95ce-c5cd17dd4e3e',
  '6b4ba4b8-014e-4ba7-ba6e-edf750d5d4b8',
  '5a231bb5-f6aa-4683-b686-24875b31ec76',
  '270c84ad-5f0f-47b3-9cd5-a21b51eeddc5',
  '3b391265-d8d3-415e-af16-601415094470',
  'cca52208-505a-41bf-8e9e-fc1546e1ed00'],
 'count': 11}

# How to add a Live trace to our new dataset

In [6]:
from typing import List

from langsmith import traceable
from openai import OpenAI
from openai.types.chat import ChatCompletion

# Module-level OpenAI client used by call_openai; built with no explicit
# arguments (presumably picks up its credentials from the environment -- confirm).
openai_client = OpenAI()

@traceable(run_type="chain")
def retrieve_documents(question: str):
    """Return what the module-level ``retriever`` yields for ``question``.

    The call is recorded as a "chain" run via ``traceable``.
    """
    matched_docs = retriever.invoke(question)
    return matched_docs

@traceable(run_type="chain")
def generate_response(question: str, documents):
    """Build the RAG chat messages and pass them to ``call_openai``.

    Args:
        question: The user's question, placed after the context in the user
            message.
        documents: Iterable of objects exposing ``page_content``; their
            contents are joined with blank lines to form the context.

    Returns:
        Whatever ``call_openai`` returns for the two-message conversation.
    """
    rag_system_prompt = """You are an assistant for question-answering tasks. 
    Use the following pieces of retrieved context to answer the latest question in the conversation. 
    If you don't know the answer, just say that you don't know. 
    Use three sentences maximum and keep the answer concise.
    """
    context_text = "\n\n".join(doc.page_content for doc in documents)
    system_message = {"role": "system", "content": rag_system_prompt}
    user_message = {
        "role": "user",
        "content": f"Context: {context_text} \n\n Question: {question}",
    }
    return call_openai([system_message, user_message])

@traceable(run_type="llm")
def call_openai(
    messages: List[dict], model: str = os.environ["OPENAI_MODEL"], temperature: float = 0.0
) -> ChatCompletion:
    """Send ``messages`` to the chat completions endpoint of ``openai_client``.

    Args:
        messages: Chat messages, each a dict such as
            ``{"role": ..., "content": ...}``.
        model: Model name. The default is read from the ``OPENAI_MODEL``
            environment variable once, when this function is defined, so the
            variable must be set before this cell runs.
        temperature: Sampling temperature forwarded to the API. Defaults to 0.0.

    Returns:
        The completion object returned by ``chat.completions.create`` -- not a
        plain string. Callers read the text from
        ``.choices[0].message.content``, as ``langsmith_rag`` does.
    """
    return openai_client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
    )

@traceable(run_type="chain")
def langsmith_rag(question: str):
    """Answer ``question`` by retrieving documents and generating a response.

    Returns:
        The ``message.content`` of the first choice in the generated
        completion.
    """
    completion = generate_response(question, retrieve_documents(question))
    first_choice = completion.choices[0]
    return first_choice.message.content


In [7]:
# Run the traced pipeline once on an ad-hoc question. As the last expression
# in the cell, the returned answer is shown as the cell output.
question = "What is Hugging accelerator and how do I use it?"
langsmith_rag(question)

"Hugging Face's Accelerate is a library designed to simplify the process of training models in a distributed environment, whether on multiple GPUs in one machine or across several machines. To use it, you need to install the library, create an `Accelerator` object, and prepare your training objects (like DataLoaders, model, and optimizer) using the `accelerator.prepare` method. Additionally, replace the standard `loss.backward()` call in your training loop with `accelerator.backward(loss)` to enable distributed training."