In [1]:
from llama_index.core import SimpleDirectoryReader
documents = SimpleDirectoryReader("./data").load_data()
documents

[Document(id_='ad9b6c1f-d9d5-46a5-83d7-4dd3ca89037d', embedding=None, metadata={'file_path': '/home/raghav/Desktop/myProjects/llamaIndexLearn/data/paul_graham_essay.txt', 'file_name': 'paul_graham_essay.txt', 'file_type': 'text/plain', 'file_size': 123, 'creation_date': '2024-03-13', 'last_modified_date': '2024-03-13'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='my name is raghav aggarwal. I am currently working as a software engineer at plutos ONE for 8 months.\nMy current age is 24.', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')]

In [15]:
# Clean up our Documents' content
import re

def clean_up_text(content: str) -> str:
    """
    Remove unwanted characters and patterns in text input.

    :param content: Text input.
    
    :return: Cleaned version of original text input.
    """

    # Fix hyphenated words broken by newline
    content = re.sub(r'(\w+)-\n(\w+)', r'\1\2', content)

    # Remove specific unwanted patterns and characters
    unwanted_patterns = [
        "\\n", "  —", "——————————", "—————————", "—————",
        r'\\u[\dA-Fa-f]{4}', r'\uf075', r'\uf0b7'
    ]
    for pattern in unwanted_patterns:
        content = re.sub(pattern, "", content)

    # Fix improperly spaced hyphenated words and normalize whitespace
    content = re.sub(r'(\w)\s*-\s*(\w)', r'\1-\2', content)
    content = re.sub(r'\s+', ' ', content)

    return content

# Call function
cleaned_docs = []
for d in documents: 
    cleaned_text = clean_up_text(d.text)
    d.text = cleaned_text
    cleaned_docs.append(d)

cleaned_docs[0].get_content()


'my name is raghav aggarwal. I am currently working as a software engineer at plutos ONE for 8 months.My current age is 24.'

In [18]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from sentence_transformers import SentenceTransformer, util
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SemanticSplitterNodeParser

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")


In [30]:
pipeline

IngestionPipeline(name='default', project_name='default', transformations=[SemanticSplitterNodeParser(include_metadata=True, include_prev_next_rel=True, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x7308512451b0>, id_func=None, sentence_splitter=<function split_by_sentence_tokenizer.<locals>.split at 0x730806c8b6d0>, embed_model=HuggingFaceEmbedding(model_name='BAAI/bge-small-en-v1.5', embed_batch_size=10, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x730897f9f790>, tokenizer_name='BAAI/bge-small-en-v1.5', max_length=512, pooling=<Pooling.CLS: 'cls'>, normalize=True, query_instruction=None, text_instruction=None, cache_folder=None), buffer_size=1, breakpoint_percentile_threshold=95), HuggingFaceEmbedding(model_name='BAAI/bge-small-en-v1.5', embed_batch_size=10, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x730897f9f790>, tokenizer_name='BAAI/bge-small-en-v1.5', max_length=512, pooling=<Pooli

In [19]:
# cODE TO STORE TO PINECONE DATABASE
from llama_index.vector_stores.pinecone import PineconeVectorStore
from pinecone import Pinecone
import os

# initialize connection to pinecone (get API key at app.pinecone.io)
api_key = os.getenv("PINECONE_API_KEY") or "a5f741a7-353b-4c19-9864-22ea5fc39a3e"

# configure client
pc = Pinecone(api_key=api_key)

pinecone_index = pc.Index("huggingface")
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)

pipeline = IngestionPipeline(
    transformations=[
        SemanticSplitterNodeParser(
            buffer_size=1,
            breakpoint_percentile_threshold=95, 
            embed_model=embed_model,
            ),
        embed_model,
        ],
        vector_store=vector_store  # Our new addition
    )

# Now we run our pipeline!
pipeline.run(documents=cleaned_docs)

Upserted vectors:   0%|          | 0/1 [00:00<?, ?it/s]

[TextNode(id_='5456633a-2c77-40b8-9d18-9f96256da480', embedding=[-0.025464145466685295, 0.0018217418109998107, 0.036301493644714355, -0.06269598007202148, 0.010044224560260773, -0.006024685222655535, 0.009813617914915085, 0.0006930421805009246, -0.07265465706586838, 0.0032065147534012794, 0.048768550157547, 0.029994672164320946, 0.017686786130070686, -0.021314945071935654, -0.003585520666092634, 0.04387583211064339, -0.022485220804810524, 0.016283251345157623, 0.017121275886893272, -0.0240331944078207, 0.016842683777213097, -0.0010273057268932462, 0.012574760243296623, -0.03244088590145111, 0.051658932119607925, 0.04961613565683365, -0.02686377428472042, -0.011957031674683094, -0.03861071914434433, -0.16773803532123566, 0.05941726267337799, -0.029858024790883064, 0.05940138176083565, -0.02729140967130661, 0.010741611011326313, 0.052954524755477905, 0.03151274845004082, 0.008545652963221073, -0.02472122572362423, 0.010764176957309246, -0.02052386850118637, -0.013060014694929123, -0.0215

In [12]:
vector_store

PineconeVectorStore(stores_text=True, is_embedding_query=True, flat_metadata=False, api_key=None, index_name=None, environment=None, namespace=None, insert_kwargs={}, add_sparse_vector=False, text_key='text', batch_size=100, remove_text_from_metadata=False)

In [24]:
from llama_index.core import VectorStoreIndex, Settings
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.llms.ollama import Ollama
print("not ere")
Settings.llm = Ollama(model="phi:2.7b",request_timeout=300)
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
# Instantiate VectorStoreIndex object from your vector_store object
vector_index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

print("here")
# Grab 1 search results
retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=1)

retriever

not ere
here


<llama_index.core.indices.vector_store.retrievers.retriever.VectorIndexRetriever at 0x750a42c63940>

In [21]:
# Query vector DB
answer = retriever.retrieve('where is raghav working at')

# Inspect results
print([i.get_content() for i in answer])

['my name is raghav aggarwal. I am currently working as a software engineer at plutos ONE for 8 months.My current age is 24.']


In [25]:
from llama_index.core.query_engine import RetrieverQueryEngine

# Pass in your retriever from above, which is configured to return the top 5 results
# query_engine = RetrieverQueryEngine(retriever=retriever)
query_engine = vector_index.as_query_engine()
print(query_engine,"HERE")
# query_engine = vector_index.as_query_engine()
# Now you query:
llm_query = query_engine.query('where is raghav working at')
print(llm_query) 

<llama_index.core.query_engine.retriever_query_engine.RetrieverQueryEngine object at 0x750a41e2d3f0> HERE
 Based on the given context information, it can be inferred that Raghav is currently working as a software engineer at Plutos ONE for 8 months. However, this cannot be confirmed without further information about his previous work history or job titles.



In [9]:
from llama_index.core.query_engine import RetrieverQueryEngine

# Pass in your retriever from above, which is configured to return the top 5 results
# query_engine = RetrieverQueryEngine(retriever=retriever)
query_engine = vector_index.as_query_engine()
print(query_engine,"HERE")
# query_engine = vector_index.as_query_engine()
# Now you query:
llm_query = query_engine.query('what model I am using')
print(llm_query)

<llama_index.core.query_engine.retriever_query_engine.RetrieverQueryEngine object at 0x73a7dd191cf0> HERE
I cannot directly reference the given context in my answer, but based on the information provided, it seems that you are describing a situation where you were a founder of an online store builder software company, and you are discussing the initial seed funding you received to launch your venture.

Given this context, it is likely that you are using some form of business model or strategy to guide your company's development and growth. Without more information, I cannot pinpoint a specific model or framework you are using, but some possibilities include:

1. Lean Startup Model: This model emphasizes rapid experimentation, customer feedback, and continuous improvement to build and launch a minimum viable product (MVP) quickly.
2. Business Model Canvas: This framework provides a visual representation of a company's business model, including its key components such as value propositio

In [62]:
query_engine = query_engine.as_query_component(streaming=True)
query_engine

AttributeError: 'QueryEngineComponent' object has no attribute 'as_query_component'