In [3]:

import weaviate
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.weaviate import WeaviateVectorStore
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import StorageContext
import os
from dotenv import load_dotenv
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings
from llama_index.storage.docstore.mongodb import MongoDocumentStore

In [13]:
# Load variables from a local .env file into the process environment, then
# read the OpenAI settings consumed by the cells below.
load_dotenv()

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
OPENAI_MODEL = os.getenv('OPENAI_MODEL')
OPENAI_EMBED_MODEL = os.getenv('OPENAI_EMBED_MODEL')
# os.getenv returns a string (or None when the variable is unset), but this
# value is passed as the LLM `temperature`, which is numeric. Parse it once
# here; an unset or empty variable falls back to 0.1 (adjust as needed).
TEMPERATURE_MODEL = float(os.getenv('TEMPERATURE_MODEL') or '0.1')

In [14]:
# Chat/completion model, configured from the environment values read above.
llm = OpenAI(model=OPENAI_MODEL, temperature=TEMPERATURE_MODEL, api_key=OPENAI_API_KEY)

# Embedding model, configured from the same environment values.
embed_model = OpenAIEmbedding(model=OPENAI_EMBED_MODEL, api_key=OPENAI_API_KEY)

In [15]:
# Register the models as the LlamaIndex global defaults.
# The attribute is `Settings.llm` (singular). The previous `Settings.llms`
# is not the attribute LlamaIndex reads for its default LLM, so the model
# built above was never actually registered.
Settings.llm = llm
Settings.embed_model = embed_model

In [6]:
# Instantiate the loader used by the following cells via `a.load_data([...])`.
# NOTE(review): `GeneralLoader` is not imported or defined in any visible cell;
# confirm where it comes from, otherwise this fails on a fresh kernel.
a = GeneralLoader()

In [7]:
# Load the UIT 2024 admission-methods page; the result is a list of Documents.
link = a.load_data(
    ['https://tuyensinh.uit.edu.vn/2024-phuong-thuc-tuyen-sinh-nam-2024'],
)

100%|██████████| 1/1 [00:00<00:00,  3.39it/s]


In [19]:
# Inspect the Document list loaded from the admission page.
link

[Document(id_='4fa6d1df-c58c-4478-9233-27937870130e', embedding=None, metadata={'file_name': 'https://tuyensinh.uit.edu.vn/2024-phuong-thuc-tuyen-sinh-nam-2024', 'file_type': 'web_page'}, excluded_embed_metadata_keys=['file_name', 'file_type'], excluded_llm_metadata_keys=['file_name', 'file_type'], relationships={}, text=' T6, 15/12/2023 \\- 10:03\n\n\n\n**CHI TIẾT CÁC PHƯƠNG THỨC XÉT TUYỂN**\n-------------------------------------------\n\n\n**1\\.1\\.**\xa0**Phương thức 1: Tuyển thẳng và ưu tiên xét tuyển**\n\n\n**1\\.1\\.1\\.Xét tuyển thẳng theo quy định của Quy chế tuyển sinh (Điều 8\\) \\- Mã phương thức xét tuyển: 301**\n\n\n**\\+****Đối tượng**: Các thí sinh đủ điều kiện xét tuyển thẳng và ưu tiên xét tuyển theo quy định của Bộ GD\\&ĐT và theo thông báo xét tuyển thẳng, ưu tiên xét tuyển của trường Đại học Công nghệ Thông tin (Thí sinh tham dự kỳ thi chọn đội tuyển quốc gia dự thi Olympic quốc tế, thí sinh đạt giải Học sinh giỏi quốc gia, giải Khoa học kỹ thuật quốc gia, …).

In [7]:
# Load a PDF through the same loader, addressed by URL.
# NOTE(review): the recorded output for `pdf` shows text='' and
# file_type 'web_page', so the PDF content does not appear to be extracted
# on this path. Confirm how the loader is expected to handle PDF URLs.
pdf = a.load_data(['https://res.cloudinary.com/djlo6r396/image/upload/v1722407000/uit-admin-bot/pdf/CV_Fullstack_Developer_TranTuanKiet_d3cddc.pdf'])

100%|██████████| 1/1 [00:00<00:00,  1.18it/s]


In [8]:
# Inspect the Document list produced for the PDF URL.
pdf

[Document(id_='2dc2b2e5-cc95-4ddb-95b8-3cbd98421927', embedding=None, metadata={'file_name': 'https://res.cloudinary.com/djlo6r396/image/upload/v1722407000/uit-admin-bot/pdf/CV_Fullstack_Developer_TranTuanKiet_d3cddc.pdf', 'file_type': 'web_page'}, excluded_embed_metadata_keys=['file_name', 'file_type'], excluded_llm_metadata_keys=['file_name', 'file_type'], relationships={}, text='', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')]

In [8]:
# Chunk the admission-page Documents into nodes with a default SentenceSplitter.
parser = SentenceSplitter()
nodes = parser.get_nodes_from_documents(link)

In [21]:
# Inspect the nodes produced by the splitter.
nodes

[TextNode(id_='1031dbc6-7494-4ed1-ad42-07b5d612635d', embedding=None, metadata={'file_name': 'https://tuyensinh.uit.edu.vn/2024-phuong-thuc-tuyen-sinh-nam-2024', 'file_type': 'web_page'}, excluded_embed_metadata_keys=['file_name', 'file_type'], excluded_llm_metadata_keys=['file_name', 'file_type'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='4fa6d1df-c58c-4478-9233-27937870130e', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_name': 'https://tuyensinh.uit.edu.vn/2024-phuong-thuc-tuyen-sinh-nam-2024', 'file_type': 'web_page'}, hash='5f1a797f9e1fa1c6f41086e098cffbf18df17a339c294f9c972013705ff2376a'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='2635d129-d53f-4ec2-84c8-62b5c95da05e', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='3dd5ea91bd5248cb556eb00134807e33d5d0db2050eada1c7888c01fb1c8877d')}, text='T6, 15/12/2023 \\- 10:03\n\n\n\n**CHI TIẾT CÁC PHƯƠNG THỨC XÉT TUYỂN**\n-------------------------------------------\n\n\n**1\\.1\\.*

In [4]:
# Connect to the local Weaviate instance and delete the "DSC2024" collection.
# NOTE(review): the delete runs every time this cell is executed, and the
# client is never closed in the visible cells.
client = weaviate.connect_to_local()
client.collections.delete("DSC2024")

In [55]:
# Inspect the Documents loaded by SimpleWebPageReader.
# NOTE(review): `documents` is only assigned in a later cell of this notebook,
# so this cell fails when the notebook is run top to bottom on a fresh kernel.
documents

[Document(id_='http://paulgraham.com/worked.html', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='![](https://s.turbifycdn.com/aah/paulgraham/essays-5.gif)|\n![](https://sep.turbifycdn.com/ca/Img/trans_1x1.gif)|\n[![](https://s.turbifycdn.com/aah/paulgraham/essays-6.gif)](index.html)  \n  \n| ![What I Worked On](https://s.turbifycdn.com/aah/paulgraham/what-i-worked-\non-4.gif)  \n  \nFebruary 2021  \n  \nBefore college the two main things I worked on, outside of school, were\nwriting and programming. I didn\'t write essays. I wrote what beginning writers\nwere supposed to write then, and probably still are: short stories. My stories\nwere awful. They had hardly any plot, just characters with strong feelings,\nwhich I imagined made them deep.  \n  \nThe first programs I tried writing were on the IBM 1401 that our school\ndistrict used for what was then called "data processing." This was in 9th\ngrade, so I was 13 or 1

In [10]:
# Vector store on the local Weaviate client. The collection name is pinned to
# "DSC2024", the collection the earlier cell deletes, so the same collection
# is reused across runs instead of a new auto-named one each time.
vector_store = WeaviateVectorStore(weaviate_client=client, index_name="DSC2024")

# MongoDocumentStore.from_uri needs the MongoDB connection URI; the previous
# call passed no arguments. Read it from the environment (populated from .env)
# so the URI is not hardcoded; a missing MONGODB_URI raises KeyError here.
storage_context = StorageContext.from_defaults(
    docstore=MongoDocumentStore.from_uri(uri=os.environ["MONGODB_URI"]),
    vector_store=vector_store,
)

# from_vector_store derives its storage context from the vector store itself,
# so the Mongo-backed `storage_context` is not passed here; nodes are written
# to its docstore explicitly after insertion.
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

In [16]:
# Insert the nodes into the index, then store the same nodes in the docstore.
# If insert_nodes raises, the docstore write below is not reached.
# NOTE(review): the recorded run of this cell failed with a 403 for
# `text-embedding-ada-002`; confirm the embedding model the project can access.
index.insert_nodes(nodes)
storage_context.docstore.add_documents(nodes)

PermissionDeniedError: Error code: 403 - {'error': {'message': 'Project `proj_JI6Ei76fw9d5qTdBXO8K7JZm` does not have access to model `text-embedding-ada-002`', 'type': 'invalid_request_error', 'param': None, 'code': 'model_not_found'}}

In [44]:
# Inspect everything currently held in the document store.
storage_context.docstore.docs

{'39438402-3b9c-4d17-99f1-f797ed0175ee': TextNode(id_='39438402-3b9c-4d17-99f1-f797ed0175ee', embedding=None, metadata={'file_path': 'D:\\UIT\\DSC-2024\\DSC2024\\notebook\\CV_Fullstack_Developer_TranTuanKiet_d3cddc.pdf', 'file_name': 'CV_Fullstack_Developer_TranTuanKiet_d3cddc.pdf', 'file_type': 'application/pdf', 'file_size': 49345, 'creation_date': '2024-08-01', 'last_modified_date': '2024-07-31'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='33d771da-e1bf-48fa-8fc6-76a2c8608515', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': 'D:\\UIT\\DSC-2024\\DSC2024\\notebook\\CV_Fullstack_Developer_TranTuanKiet_d3cddc.pdf', 'file_name': 'CV_Fullstack_Developer_TranTuanKiet_d3cddc.pdf', 'file_type': 

In [28]:
# Ask a question through a query engine that retrieves the top 2 matches.
query_engine = index.as_query_engine(similarity_top_k=2)
# The question asks (in Vietnamese) for the name of admission method 1.
response = query_engine.query("Phương thức xét tuyển 1 có tên là gì?")
print(response)

Thông tin về phương thức xét tuyển không được đề cập trong tài liệu.


In [43]:

from llama_index.core import SummaryIndex
from llama_index.readers.web import SimpleWebPageReader
from IPython.display import Markdown, display
import os

In [54]:
# Fetch both pages with SimpleWebPageReader, converting the HTML to text.
documents = SimpleWebPageReader(html_to_text=True).load_data(
    [
        "http://paulgraham.com/worked.html",
        'https://tuyensinh.uit.edu.vn/2024-phuong-thuc-tuyen-sinh-nam-2024',
    ],
)

In [51]:
# Chunk the SimpleWebPageReader Documents with a default SentenceSplitter.
# This rebinds `parser` and `nodes`, replacing any earlier values.
parser = SentenceSplitter()
nodes = parser.get_nodes_from_documents(documents)

In [52]:
# Inspect the nodes produced from `documents`.
nodes

[TextNode(id_='e43dffc3-3c5a-422c-a9ad-513c267557e0', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='http://paulgraham.com/worked.html', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='e2ba8a553c443190360ba0524bca33468296ec914933ddd248e2d11c9748cf1c'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='22127a38-8738-4ea3-9202-02ec3956fced', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='86f10286df3a2ec46603235d8d2e11d79040f465f9024226abac980216e15cb5')}, text='![](https://s.turbifycdn.com/aah/paulgraham/essays-5.gif)|\n![](https://sep.turbifycdn.com/ca/Img/trans_1x1.gif)|\n[![](https://s.turbifycdn.com/aah/paulgraham/essays-6.gif)](index.html)  \n  \n| ![What I Worked On](https://s.turbifycdn.com/aah/paulgraham/what-i-worked-\non-4.gif)  \n  \nFebruary 2021  \n  \nBefore college the two main things I worked on, outside of school, were\nwriting and pro