In [None]:
!pip install llama-index transformers
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python
!pip install beautifulsoup4 requests urllib3

Collecting llama-index
  Downloading llama_index-0.8.53.post3-py3-none-any.whl (794 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m794.6/794.6 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
Collecting aiostream<0.6.0,>=0.5.2 (from llama-index)
  Downloading aiostream-0.5.2-py3-none-any.whl (39 kB)
Collecting dataclasses-json<0.6.0,>=0.5.7 (from llama-index)
  Downloading dataclasses_json-0.5.14-py3-none-any.whl (26 kB)
Collecting deprecated>=1.2.9.3 (from llama-index)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)
Collecting langchain>=0.0.303 (from llama-index)
  Downloading langchain-0.0.325-py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
Collecting

In [None]:
from llama_index import download_loader

BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader")

# Pages of the campus-life site to load. Every entry ends with a comma
# (including the last one): a missing comma makes Python silently concatenate
# two adjacent string literals into a single, invalid URL.
CAMPUS_LIFE_URLS = [
    'http://campuslife.innopolis.ru/main',
    'http://campuslife.innopolis.ru/handbook2023',
    'http://campuslife.innopolis.ru/clubs',
    'http://campuslife.innopolis.ru/tech_clubs',
    'http://campuslife.innopolis.ru/sport_clubs',
    'http://campuslife.innopolis.ru/hobby_clubs',
    'http://campuslife.innopolis.ru/art_clubs',
    'http://campuslife.innopolis.ru/opportunities',
    'http://campuslife.innopolis.ru/faq',
    'http://campuslife.innopolis.ru/contacts',
]

loader = BeautifulSoupWebReader()
documents = loader.load_data(urls=CAMPUS_LIFE_URLS)

In [None]:
import nest_asyncio
from llama_index import VectorStoreIndex, SimpleDirectoryReader
from llama_index.tools import QueryEngineTool, ToolMetadata
from llama_index.query_engine import SubQuestionQueryEngine
from llama_index.callbacks import CallbackManager, LlamaDebugHandler
from llama_index import ServiceContext

# Patch asyncio so that an already-running event loop (such as the one a
# notebook kernel runs) can be re-entered.
nest_asyncio.apply()

# Disabled: optional debug tracing. The lines below would attach a
# LlamaDebugHandler that prints the trace of the sub-questions captured by the
# SUB_QUESTION callback event type.
# NOTE: a later cell assigns `service_context` again, so re-enabling this block
# alone would be overwritten unless `callback_manager` is also passed there.
# llama_debug = LlamaDebugHandler(print_trace_on_end=True)
# callback_manager = CallbackManager([llama_debug])

# service_context = ServiceContext.from_defaults(
#     callback_manager=callback_manager
# )

In [None]:
from typing import List, Optional, Sequence

from llama_index.llms.base import ChatMessage, MessageRole

# Marker strings used by the prompt builders in this cell to assemble a
# Llama-2-chat style prompt.
BOS = "<s>"
EOS = "</s>"
B_INST = "[INST]"
E_INST = "[/INST]"
B_SYS = "<<SYS>>\n"
E_SYS = "\n<</SYS>>\n\n"

# System prompt used whenever the caller supplies neither a system message nor
# an explicit `system_prompt`. One list entry per line of the prompt; the lines
# are joined with "\n" and there is no trailing newline.
DEFAULT_SYSTEM_PROMPT = "\n".join(
    [
        "You are an expert Q&A system that is trusted around the world.",
        "Always answer as helpfully as possible and follow ALL given instructions.",
        "Always answer the query using the provided context information, and not prior knowledge.",
        "Some rules to follow:",
        "1. Do not speculate or make up information.",
        "2. Never directly reference the given context in your answer.",
        "3. Avoid statements like 'Based on the context, ...' or 'The context information ...' or anything along those lines.",
    ]
)


def messages_to_prompt(
    messages: Sequence[ChatMessage], system_prompt: Optional[str] = None
) -> str:
    """Render a chat history as a single prompt string and print it.

    If the first message has the SYSTEM role, its content (or "" when empty)
    is used as the system text and the message is removed from the history;
    otherwise ``system_prompt`` is used, falling back to
    ``DEFAULT_SYSTEM_PROMPT``. The remaining messages are consumed in pairs:
    a USER message optionally followed by an ASSISTANT message. The system
    text is embedded only in the first turn, and consecutive turns are
    separated by `` {EOS}``.

    Args:
        messages: Chat history; must be non-empty (``messages[0]`` is read).
        system_prompt: System text used when ``messages`` has no leading
            SYSTEM message.

    Returns:
        The assembled prompt. The same string is also printed to stdout.

    Raises:
        AssertionError: If a message at a user position is not USER, or a
            message at an assistant position is not ASSISTANT.
        IndexError: If ``messages`` is empty.
    """
    head = messages[0]
    if head.role == MessageRole.SYSTEM:
        # The history carries its own system message: use it and drop it.
        system_text = head.content or ""
        messages = messages[1:]
    else:
        system_text = system_prompt or DEFAULT_SYSTEM_PROMPT

    system_block = f"{B_SYS} {system_text.strip()} {E_SYS}"

    turns: List[str] = []
    total = len(messages)
    for start in range(0, total, 2):
        # Even positions must hold the user side of each exchange.
        user_message = messages[start]
        assert user_message.role == MessageRole.USER

        # Only the very first turn carries the system block.
        opening = (
            f"{BOS} {B_INST} {system_block} " if start == 0 else f"{BOS} {B_INST} "
        )
        turn = f"{opening}{user_message.content} {E_INST}"

        reply_index = start + 1
        if reply_index < total:
            # An assistant reply exists for this turn; append it.
            assistant_message = messages[reply_index]
            assert assistant_message.role == MessageRole.ASSISTANT
            turn = f"{turn} {assistant_message.content}"

        turns.append(turn)

    # Every turn except the last is closed with " </s>" before the next begins.
    prompt = f" {EOS}".join(turns)

    # Debug aid: echo the exact prompt that will be sent to the model.
    print(prompt)

    return prompt


def completion_to_prompt(completion: str, system_prompt: Optional[str] = None) -> str:
    """Wrap a plain completion request in the instruction prompt format and print it.

    Args:
        completion: The text to send as the instruction; surrounding
            whitespace is stripped.
        system_prompt: System text to embed; ``DEFAULT_SYSTEM_PROMPT`` is used
            when this is ``None`` or empty.

    Returns:
        The assembled prompt. The same string is also printed to stdout.
    """
    system_text = (system_prompt or DEFAULT_SYSTEM_PROMPT).strip()
    prompt = (
        f"{BOS} {B_INST} {B_SYS} {system_text} {E_SYS} "
        f"{completion.strip()} {E_INST}"
    )

    # Debug aid: echo the exact prompt that will be sent to the model.
    print(prompt)

    return prompt

In [None]:
from llama_index.llms import LlamaCPP
# from llama_index.llms.llama_utils import completion_to_prompt, messages_to_prompt

# No local model file is given (model_path=None); the captured output of this
# cell shows the Llama-2-13B-chat GGUF weights being downloaded instead.
# The custom prompt builders defined earlier replace the library defaults.
llm = LlamaCPP(
    model_path=None,
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    # presumably the number of layers offloaded to the GPU - confirm against llama.cpp
    model_kwargs={"n_gpu_layers": 50},
)

service_context = ServiceContext.from_defaults(llm=llm)

Downloading url https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q4_0.gguf to path /tmp/llama_index/models/llama-2-13b-chat.Q4_0.gguf
total size (MB): 7365.83


7025it [00:39, 179.10it/s]                          
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | 


******
Could not load OpenAIEmbedding. Using HuggingFaceBgeEmbeddings with model_name=BAAI/bge-small-en. If you intended to use OpenAI, please check your OPENAI_API_KEY.
Original error:
No API key found for OpenAI.
Please set either the OPENAI_API_KEY environment variable or openai.api_key prior to initialization.
API keys can be found or created at https://platform.openai.com/account/api-keys

******


Downloading (…)lve/main/config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

[nltk_data] Downloading package punkt to /tmp/llama_index...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Display the LLM attached to the service context, to check the resolved
# settings (model path, temperature, context window, prompt builders).
service_context.llm

LlamaCPP(callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x7ced2751c7c0>, model_url='https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q4_0.gguf', model_path='/tmp/llama_index/models/llama-2-13b-chat.Q4_0.gguf', temperature=0.1, max_new_tokens=256, context_window=3900, messages_to_prompt=<function messages_to_prompt at 0x7ced2774b880>, completion_to_prompt=<function completion_to_prompt at 0x7ced27ab4d30>, generate_kwargs={'temperature': 0.1, 'max_tokens': 256}, model_kwargs={'n_ctx': 3900, 'verbose': True, 'n_gpu_layers': 50}, verbose=True)

In [None]:
# Build a vector store index over the loaded web pages, using the service
# context (LLM and related settings) configured in the earlier cell.
index = VectorStoreIndex.from_documents(documents, service_context=service_context)

In [None]:
# Query the index with streaming enabled, then print the answer from the
# response stream (presumably token by token as it is generated - confirm).
query_engine = index.as_query_engine(streaming=True)
response = query_engine.query("working hours of 319 office")
response.print_response_stream()

<s> [INST] <<SYS>>
 You are an expert Q&A system that is trusted around the world.
Always answer as helpfully as possible and follow ALL given instructions.
Always answer the query using the provided context information, and not prior knowledge.
Some rules to follow:
1. Do not speculate or make up information.
2. Never directly reference the given context in your answer.
3. Avoid statements like 'Based on the context, ...' or 'The context information ...' or anything along those lines. 
<</SYS>>

 Context information is below.
---------------------
URL: http://campuslife.innopolis.ru/faq

Expect information in the official channel of your course or ask about vacant state-funded place in the Office 319 or via @StudentAffairs_bot / Ожидать информации в официальном канале вашего курса или уточнить наличие бюджетного места на вашем курсе в 319 или через @StudentAffairs_bot 2. To be transferred to a state-funded place you need to meet one of the conditions* / Для перевода вы должны соответс

In [None]:
# Display the raw response object; its repr includes the retrieved
# source_nodes (with their URL metadata) that were used as context.
response

StreamingResponse(response_gen=<generator object stream_completion_response_to_tokens.<locals>.gen at 0x7ce69f5c82e0>, source_nodes=[NodeWithScore(node=TextNode(id_='a1e45cbe-fb6a-40a6-bfbc-fe5b8e6dca04', embedding=None, metadata={'URL': 'http://campuslife.innopolis.ru/faq'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='2f3489fa-70dd-4bd2-8afb-83a4d408da8b', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'URL': 'http://campuslife.innopolis.ru/faq'}, hash='96e0f3dbd1eed8338a3fed882dc4e7ce1868ceeb5bb04eee40032128f046b5fa'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='22ef7c56-c464-4137-848a-bab16d2327c9', node_type=<ObjectType.TEXT: '1'>, metadata={'URL': 'http://campuslife.innopolis.ru/faq'}, hash='665f71ab644eae92bebfb6fb19d9ee1d0fc4ef4a055717810bc3f13d43c3651d'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='256aa8f4-d32b-4d1b-9cf8-b3823b68a44c', node_type=<ObjectType.TE

In [None]:
from llama_index.node_parser import SimpleNodeParser

# Split the loaded documents into nodes using a SimpleNodeParser created with
# its default settings.
parser = SimpleNodeParser.from_defaults()
nodes = parser.get_nodes_from_documents(documents)

[nltk_data] Downloading package punkt to /tmp/llama_index...
[nltk_data]   Unzipping tokenizers/punkt.zip.
