In [1]:
# !pip install langchain rank_bm25 pypdf unstructured chromadb
# !pip install unstructured['pdf'] unstructured
# !apt-get install poppler-utils
# !apt-get install -y tesseract-ocr
# !apt-get install -y libtesseract-dev
# !pip install pytesseract
# !pip install -U langchain-community
# !pip install -U langchain-huggingface
# !pip install bitsandbytes
# !pip install -U bitsandbytes

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.7).
0 upgraded, 0 newly installed, 0 to remove and 30 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 30 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libtesseract-dev is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 30 not upgraded.


### Load the required Packages

In [2]:
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain.llms import HuggingFaceHub


from langchain.retrievers import BM25Retriever, EnsembleRetriever

import os

In [3]:
### Load the PDF file

In [4]:
file_path = "/content/Orca_paper.pdf"
data_file = UnstructuredPDFLoader(file_path)
docs = data_file.load()



In [5]:
print(docs[0].page_content)

3 2 0 2

n u J

5

] L C . s c [

1 v 7 0 7 2 0 . 6 0 3 2 : v i X r a

Orca: Progressive Learning from Complex

Explanation Traces of GPT-4

Subhabrata Mukherjee∗†, Arindam Mitra∗

Ganesh Jawahar, Sahaj Agarwal, Hamid Palangi, Ahmed Awadallah

Microsoft Research

Abstract

Recent research has focused on enhancing the capability of smaller models through imitation learning, drawing on the outputs generated by large foundation models (LFMs). A number of issues impact the quality of these models, ranging from limited imitation signals from shallow LFM outputs; small scale homogeneous training data; and most notably a lack of rigorous evaluation resulting in overestimating the small model’s capability as they tend to learn to imitate the style, but not the reasoning process of LFMs. To address these challenges, we develop Orca, a 13-billion parameter model that learns to imitate the reasoning process of LFMs. Orca learns from rich signals from GPT-4 including explanation traces; step-by-st

### Split Documents and Chunking

In [6]:
# create chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=800,
                                          chunk_overlap=100)
chunks = splitter.split_documents(docs)

In [7]:
chunks[0].page_content

'3 2 0 2\n\nn u J\n\n5\n\n] L C . s c [\n\n1 v 7 0 7 2 0 . 6 0 3 2 : v i X r a\n\nOrca: Progressive Learning from Complex\n\nExplanation Traces of GPT-4\n\nSubhabrata Mukherjee∗†, Arindam Mitra∗\n\nGanesh Jawahar, Sahaj Agarwal, Hamid Palangi, Ahmed Awadallah\n\nMicrosoft Research\n\nAbstract'

In [8]:
from langchain.embeddings.base import Embeddings
from typing import List
import torch
from transformers import AutoTokenizer, AutoModel

class LocalHFTransformerEmbedding(Embeddings):
    def __init__(self, model_name="BAAI/bge-base-en-v1.5"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def _mean_pooling(self, model_output, attention_mask):
        token_embeddings = model_output.last_hidden_state
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size())
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        embeddings = []
        for text in texts:
            inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True)
            with torch.no_grad():
                outputs = self.model(**inputs)
            pooled = self._mean_pooling(outputs, inputs['attention_mask'])
            embeddings.append(pooled[0].tolist())
        return embeddings

    def embed_query(self, text: str) -> List[float]:
        return self.embed_documents([text])[0]


### VectorStore

In [9]:
embeddings = LocalHFTransformerEmbedding()

# Create the vector store
vectorstore = Chroma.from_documents(chunks, embeddings)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [10]:
vectorstore_retreiver = vectorstore.as_retriever(search_kwargs={"k": 3})

In [11]:
keyword_retriever = BM25Retriever.from_documents(chunks)
keyword_retriever.k =  3

### Ensemble Retriever

In [12]:
ensemble_retriever = EnsembleRetriever(retrievers=[vectorstore_retreiver,
                                                   keyword_retriever],
                                       weights=[0.5, 0.5])

In [13]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

model_id = "HuggingFaceH4/zephyr-7b-beta"


# Define the configuration for 4-bit quantization
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quantization_config
)

input_ids = tokenizer("Hello, how are you?", return_tensors="pt").input_ids.cuda()
output = model.generate(input_ids, max_new_tokens=50)
print(tokenizer.decode(output[0], skip_special_tokens=True))


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Hello, how are you?

I hope you’re doing well. I’m writing to you today to share some exciting news.

As you know, I’ve been working hard to build my coaching business, and I’m thrilled to announce that


### Prompt Template:

In [67]:
template = """
<|system|>>
You are a helpful AI Assistant that follows instructions extremely well.
Use the following context to answer user question.

Think step by step before answering the question. You will get a $100 tip if you provide correct answer.

CONTEXT: {context}
</s>
<|user|>
{query}
</s>
<|assistant|>
"""

In [68]:
def decode(query: str):

  # Get relevant documents
  relevant_documents = ensemble_retriever.get_relevant_documents(query)

  # Prepare Context
  context = ""
  for i in relevant_documents:
    context += i.page_content + "\n"

  # prompt template
  prompt_template = template.format(context=context, query=query)

  input_ids = tokenizer(prompt_template, return_tensors="pt").input_ids.cuda()
  output = model.generate(input_ids, max_new_tokens=50)
  text = tokenizer.decode(output[0], skip_special_tokens=True)
  answer = text.split("<|assistant|>")[-1].strip()
  print("----------------")
  print(answer)
  print("-----------------")
  return answer


In [69]:
decode(query="What is Orca?")


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


----------------
Orca is an AI language model trained by OpenAI, a subsidiary of Microsoft, to imitate the performance of GPT-4, a powerful natural language processing model. Orca is designed to follow instructions accurately and can provide helpful
-----------------


'Orca is an AI language model trained by OpenAI, a subsidiary of Microsoft, to imitate the performance of GPT-4, a powerful natural language processing model. Orca is designed to follow instructions accurately and can provide helpful'