Step 1: Tiền xử lý file PDF sang TXT


In [1]:
import re
import os
from unstructured.partition.pdf import partition_pdf

def format_text(text: str) -> str:
    """Reflow raw extracted text into one logical paragraph per output line.

    Each input line is stripped and then handled by the first matching rule:

    * blank line: closes the paragraph being built and is dropped;
    * line starting with ``<digits>.`` or ``<word char>)``: closes the
      paragraph being built and starts a new one with this line;
    * line starting with "Điều": closes the paragraph being built and is
      emitted as a line of its own;
    * anything else: appended (space-separated) to the paragraph being built.

    Returns the paragraphs joined by single newlines.
    """
    item_marker = re.compile(r'^(\d+\.|\w\))')
    paragraphs = []
    pending = []

    def flush() -> None:
        # Emit the paragraph collected so far, if there is one.
        if pending:
            paragraphs.append(' '.join(pending))
            pending.clear()

    for raw_line in text.split('\n'):
        content = raw_line.strip()
        if not content:
            flush()
        elif item_marker.match(content):
            flush()
            pending.append(content)
        elif content.startswith('Điều'):
            flush()
            paragraphs.append(content)
        else:
            # Continuation text: merged into the current paragraph on flush.
            pending.append(content)

    flush()

    # Collapse any run of newlines so no empty lines remain between paragraphs.
    return re.sub(r'\n+', '\n', '\n'.join(paragraphs))


def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the text of a PDF as one newline-joined string.

    Args:
        pdf_path: Path of the PDF handed to ``partition_pdf`` (strategy "fast").

    Returns:
        The ``text`` of every returned element (or the element itself when it
        has no ``text`` attribute), coerced to ``str`` and joined with newlines.

    Raises:
        RuntimeError: If partitioning or joining fails; the original exception
            is attached as ``__cause__``.
    """
    try:
        elements = partition_pdf(pdf_path, strategy="fast")
        # str() keeps the join from failing on an element that is neither a
        # string nor carries a string ``text``.
        return '\n'.join(
            str(getattr(element, "text", element)) for element in elements
        )
    except Exception as e:
        # Chain explicitly so the root cause stays visible in the traceback.
        raise RuntimeError(f"Lỗi khi trích xuất text từ PDF: {e}") from e

def process_pdf(pdf_path: str, output_folder: str) -> None:
    """Extract, reformat and save the text of one PDF as ``<pdf name>.txt``.

    Best-effort: any failure is reported on stdout and swallowed, so a caller
    looping over many files keeps going.

    Args:
        pdf_path: Path of the PDF to process.
        output_folder: Directory that receives the ``.txt`` file; it is
            created if it does not exist yet.
    """
    try:
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"Không tìm thấy file PDF: {pdf_path}")

        print(f"Đang xử lý file PDF: {pdf_path}")
        extracted_text = extract_text_from_pdf(pdf_path)

        formatted_text = format_text(extracted_text)

        # Output file name = PDF file name without its extension.
        pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]
        output_path = os.path.join(output_folder, f"{pdf_filename}.txt")

        # Without this, a missing output folder makes every write fail.
        if output_folder:
            os.makedirs(output_folder, exist_ok=True)

        with open(output_path, 'w', encoding='utf-8') as file:
            file.write(formatted_text)

        print(f"Đã xử lý và lưu file thành công vào {output_path}")
    except Exception as e:
        print(f"Đã xảy ra lỗi trong quá trình xử lý PDF: {e}")

def main(pdf_folder: str = r"D:\DATN\QA_System\casestudy\pdf", output_folder=None):
    """Convert every PDF found directly inside ``pdf_folder`` to a text file.

    Args:
        pdf_folder: Directory scanned (non-recursively) for PDF files.
        output_folder: Directory receiving the ``.txt`` files; defaults to
            ``pdf_folder``, as in the original hard-coded configuration.
    """
    if output_folder is None:
        output_folder = pdf_folder
    # sorted() gives a reproducible processing order.
    for filename in sorted(os.listdir(pdf_folder)):
        # Case-insensitive so files named "*.PDF" are not silently skipped.
        if filename.lower().endswith(".pdf"):
            process_pdf(os.path.join(pdf_folder, filename), output_folder)


if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm


Đang xử lý file PDF: D:\DATN\QA_System\casestudy\pdf\20230710 1. QĐ Học bổng KKHT 2023.pdf
Đã xử lý và lưu file thành công vào D:\DATN\QA_System\casestudy\pdf\20230710 1. QĐ Học bổng KKHT 2023.txt
Đang xử lý file PDF: D:\DATN\QA_System\casestudy\pdf\20230710 2. QĐ Học bổng Trần Đại Nghĩa 2023.pdf
Đã xử lý và lưu file thành công vào D:\DATN\QA_System\casestudy\pdf\20230710 2. QĐ Học bổng Trần Đại Nghĩa 2023.txt


Step 2: Load dữ liệu, chunking + RAPTOR

In [1]:
import os
import pandas as pd
from data_ingestion import TXTProcessor  # step1_loaddata
from chunking import text_splitter  # step2_chunking
from raptor import RaptorPipeline  # step3_RAPTOR
from utils import convert_df_to_documents
from model_config import VietnameseEmbeddings
from langchain_community.document_loaders import TextLoader
from typing import List
from tqdm import tqdm
import glob
import json
# Các bước pipeline
def step1_load_data(txt_file: str) -> List:
    """Load one .txt file through ``TXTProcessor.setup_txt`` and return the result."""
    loaded = TXTProcessor().setup_txt(txt_file)
    print(f"Đã load dữ liệu từ file: {txt_file}")
    return loaded

def step2_chunking(documents):
    """Split documents into chunks and return ``(metadata list, content list)``.

    Every chunk's metadata gets an ``"id"`` entry holding its position in the
    split result as a string.
    """
    pieces = text_splitter.split_documents(documents)

    # Tag each chunk with its running index before collecting anything.
    for position, piece in enumerate(pieces):
        piece.metadata["id"] = str(position)

    metadata_list = []
    content_list = []
    for piece in pieces:
        metadata_list.append(piece.metadata)
    for piece in pieces:
        content_list.append(piece.page_content)
    return metadata_list, content_list

def step3_RAPTOR(chunks_metadata, chunks_content):
    """Run the RAPTOR pipeline over the chunks and return its final dataframe.

    A fresh ``RaptorPipeline`` is created per call; the recursion is started
    with ``level=1`` and ``n_levels=3``.
    """
    pipeline = RaptorPipeline()
    level_results = pipeline.recursive_embed_cluster_summarize(
        chunks_content,
        chunks_metadata,
        level=1,
        n_levels=3,
    )
    return pipeline.build_final_dataframe(level_results)

def process_directory(directory: str):
    """Run load -> chunk -> RAPTOR for every text file of ``directory``.

    For each file listed by ``TXTProcessor.get_txt_files`` the resulting
    dataframe is written next to the input as ``<input stem>_results.csv``
    (UTF-8 with BOM), with the ``metadata`` column serialised to JSON text.
    """

    def _as_json(meta):
        # ensure_ascii=False keeps the Vietnamese text readable in the CSV.
        return json.dumps(meta, ensure_ascii=False)

    sources = TXTProcessor(directory=directory).get_txt_files()

    for txt_file in tqdm(sources, desc="Processing files"):
        # Step 1: load, step 2: chunk, step 3: RAPTOR.
        loaded = step1_load_data(txt_file)
        meta_list, content_list = step2_chunking(loaded)
        final_df = step3_RAPTOR(meta_list, content_list)

        # Persist the result as CSV next to the source file.
        stem = os.path.splitext(txt_file)[0]
        output_csv = f"{stem}_results.csv"
        final_df["metadata"] = final_df["metadata"].apply(_as_json)
        final_df.to_csv(output_csv, index=False, encoding='utf-8-sig')
        print(f"Đã lưu kết quả RAPTOR cho file {txt_file} vào {output_csv}")
# Run the pipeline when executed as a script / notebook cell.
if __name__ == "__main__":
    directory_path = r"D:\DATN\QA_System\casestudy\pdf"  # Replace with the actual path
    process_directory(directory_path)


  from .autonotebook import tqdm as notebook_tqdm
Processing files:   0%|          | 0/2 [00:00<?, ?it/s]

Đã load dữ liệu từ file: D:\DATN\QA_System\casestudy\pdf\20230710 1. QĐ Học bổng KKHT 2023.txt
Initializing RaptorPipeline...
Initializing Vietnamese embedding model: dangvantuan/vietnamese-embedding
Embedding model loaded.
Summarization model loaded.
Tokenizer loaded.
Starting recursive_embed_cluster_summarize for level 1...
Processing base level (level 0)...
Starting embed_cluster_summarize for level 0...
Getting clusters...
Embedding texts...
Texts embedded. Number of embeddings: 24




Clusters obtained. Number of clusters: 5
Generating summaries...


Summarizing clusters (level 0): 100%|██████████| 5/5 [00:31<00:00,  6.29s/it]


Summaries generated.
Finished embed_cluster_summarize for level 0.
Base level processing completed.
Processing level 1...
Starting embed_cluster_summarize for level 1...
Getting clusters...
Embedding texts...
Texts embedded. Number of embeddings: 24




Clusters obtained. Number of clusters: 5
Generating summaries...


Summarizing clusters (level 1): 100%|██████████| 5/5 [00:30<00:00,  6.06s/it]


Summaries generated.
Finished embed_cluster_summarize for level 1.
Level 1 processing completed.
Recursing to level 2...
Starting recursive_embed_cluster_summarize for level 2...
Processing level 2...
Starting embed_cluster_summarize for level 2...
Getting clusters...
Embedding texts...
Texts embedded. Number of embeddings: 5
Clusters obtained. Number of clusters: 1
Generating summaries...


Summarizing clusters (level 2): 100%|██████████| 1/1 [00:06<00:00,  6.47s/it]


Summaries generated.
Finished embed_cluster_summarize for level 2.
Level 2 processing completed.
Finished recursive_embed_cluster_summarize for level 2.
Recursion to level 2 completed.
Finished recursive_embed_cluster_summarize for level 1.
Building final dataframe...
Processing chunks (level 0)...


Processing chunks: 100%|██████████| 24/24 [00:00<?, ?it/s]


Processing summaries (levels > 0)...


Processing summary levels: 100%|██████████| 2/2 [00:00<00:00, 121.94it/s]
Processing files:  50%|█████     | 1/2 [01:29<01:29, 89.34s/it]

Aggregating metadata...
Aggregating metadata...
Aggregating metadata...
Aggregating metadata...
Aggregating metadata...
Aggregating metadata...
Final dataframe built.
Đã lưu kết quả RAPTOR cho file D:\DATN\QA_System\casestudy\pdf\20230710 1. QĐ Học bổng KKHT 2023.txt vào D:\DATN\QA_System\casestudy\pdf\20230710 1. QĐ Học bổng KKHT 2023_results.csv
Đã load dữ liệu từ file: D:\DATN\QA_System\casestudy\pdf\20230710 2. QĐ Học bổng Trần Đại Nghĩa 2023.txt
Initializing RaptorPipeline...
Embedding model loaded.
Summarization model loaded.
Tokenizer loaded.
Starting recursive_embed_cluster_summarize for level 1...
Processing base level (level 0)...
Starting embed_cluster_summarize for level 0...
Getting clusters...
Embedding texts...
Texts embedded. Number of embeddings: 37




Clusters obtained. Number of clusters: 8
Generating summaries...


Summarizing clusters (level 0): 100%|██████████| 8/8 [00:49<00:00,  6.18s/it]


Summaries generated.
Finished embed_cluster_summarize for level 0.
Base level processing completed.
Processing level 1...
Starting embed_cluster_summarize for level 1...
Getting clusters...
Embedding texts...
Texts embedded. Number of embeddings: 37




Clusters obtained. Number of clusters: 8
Generating summaries...


Summarizing clusters (level 1): 100%|██████████| 8/8 [00:48<00:00,  6.12s/it]


Summaries generated.
Finished embed_cluster_summarize for level 1.
Level 1 processing completed.
Recursing to level 2...
Starting recursive_embed_cluster_summarize for level 2...
Processing level 2...
Starting embed_cluster_summarize for level 2...
Getting clusters...
Embedding texts...
Texts embedded. Number of embeddings: 8
Clusters obtained. Number of clusters: 1
Generating summaries...


Summarizing clusters (level 2): 100%|██████████| 1/1 [00:07<00:00,  7.69s/it]


Summaries generated.
Finished embed_cluster_summarize for level 2.
Level 2 processing completed.
Finished recursive_embed_cluster_summarize for level 2.
Recursion to level 2 completed.
Finished recursive_embed_cluster_summarize for level 1.
Building final dataframe...
Processing chunks (level 0)...


Processing chunks: 100%|██████████| 37/37 [00:00<?, ?it/s]


Processing summaries (levels > 0)...


Processing summary levels: 100%|██████████| 2/2 [00:00<00:00, 81.98it/s]
Processing files: 100%|██████████| 2/2 [03:26<00:00, 103.22s/it]

Aggregating metadata...
Aggregating metadata...
Aggregating metadata...
Aggregating metadata...
Aggregating metadata...
Aggregating metadata...
Aggregating metadata...
Aggregating metadata...
Aggregating metadata...
Final dataframe built.
Đã lưu kết quả RAPTOR cho file D:\DATN\QA_System\casestudy\pdf\20230710 2. QĐ Học bổng Trần Đại Nghĩa 2023.txt vào D:\DATN\QA_System\casestudy\pdf\20230710 2. QĐ Học bổng Trần Đại Nghĩa 2023_results.csv





Ghép thành 1 file

In [3]:
import pandas as pd
import glob
import os

def merge_csv_files(input_directory: str, output_file: str, excel_file=None):
    """Concatenate every CSV in ``input_directory`` into one CSV (and Excel) file.

    Args:
        input_directory: Directory whose ``*.csv`` files are merged.
        output_file: Path of the merged CSV. If it lives inside
            ``input_directory`` it is excluded from the inputs, so re-running
            the merge does not duplicate rows.
        excel_file: Optional path for an additional ``.xlsx`` export. When
            omitted, the module-level name ``execfile`` is used if it exists
            (legacy behaviour); if neither is available the Excel export is
            skipped.

    Raises:
        ValueError: If there is no CSV file to merge.
    """
    output_key = os.path.normcase(os.path.abspath(output_file))
    # Sorted for a reproducible row order; the previous merge result is skipped.
    csv_files = sorted(
        path
        for path in glob.glob(os.path.join(input_directory, "*.csv"))
        if os.path.normcase(os.path.abspath(path)) != output_key
    )
    if not csv_files:
        raise ValueError(f"Không tìm thấy tệp CSV nào để ghép trong {input_directory}")

    # ignore_index discards the per-file row numbers.
    merged_df = pd.concat([pd.read_csv(path) for path in csv_files], ignore_index=True)

    merged_df.to_csv(output_file, index=False, encoding='utf-8')

    if excel_file is None:
        # Backward compatibility: earlier versions read this global directly.
        excel_file = globals().get("execfile")
    if excel_file is not None:
        merged_df.to_excel(excel_file, index=False, engine="openpyxl")

    # Report success only after the files were actually written.
    print(f"Tất cả các tệp CSV đã được ghép vào {output_file}")
    

# Example usage
input_directory = r"D:\DATN\QA_System\casestudy\pdf"  # Directory containing the CSV files
output_file = r"D:\DATN\QA_System\casestudy\pdf\merged_output.csv"  # Path of the merged CSV output
# NOTE: merge_csv_files looks up this module-level name for its Excel export,
# so it has to be defined before the call below.
execfile = r"D:\DATN\QA_System\casestudy\pdf\df.xlsx"
merge_csv_files(input_directory, output_file)


Tất cả các tệp CSV đã được ghép vào D:\DATN\QA_System\casestudy\pdf\merged_output.csv


Chuyển sang JSON để lưu vào DB cho nhanh

In [None]:
import json

import pandas as pd

execfile = r"D:\DATN\QA_System\casestudy\pdf\finaldf.xlsx"
df = pd.read_excel(execfile)

# Decode JSON-encoded metadata cells into dictionaries. The check is done per
# cell, so an empty sheet no longer raises IndexError and a column whose first
# cell is not a string still gets its string cells decoded.
df["metadata"] = df["metadata"].apply(
    lambda value: json.loads(value) if isinstance(value, str) else value
)

# One dictionary per row.
json_data = df.to_dict(orient="records")

# Write the records to the JSON file that is indexed later.
output_json_file = r"D:\DATN\QA_System\casestudy\pdf\finaldf.json"  # Output JSON path
with open(output_json_file, "w", encoding="utf-8") as f:
    json.dump(json_data, f, ensure_ascii=False, indent=4)

print(f"Đã lưu dữ liệu vào file JSON: {output_json_file}")

Lọc theo level nếu muốn

In [14]:
import json

def filter_json_by_level(input_file, output_file, target_level=0):
    """Keep only the JSON records whose ``"level"`` equals ``target_level``.

    Args:
        input_file: Path of the input JSON file (an iterable of records).
        output_file: Path of the JSON file to write; if it is the same path as
            ``input_file`` the input is overwritten.
        target_level: Value of ``"level"`` a record must have to be kept.

    A missing or malformed input file is reported on stdout and nothing is
    written.
    """
    try:
        with open(input_file, 'r', encoding='utf-8') as source:
            records = json.load(source)
    except FileNotFoundError:
        print(f"Lỗi: Không tìm thấy file {input_file}")
        return
    except json.JSONDecodeError:
        print(f"Lỗi: File {input_file} không đúng định dạng JSON.")
        return

    kept = []
    for record in records:
        # Records without a "level" entry are dropped.
        if "level" not in record:
            continue
        if record["level"] == target_level:
            kept.append(record)

    with open(output_file, 'w', encoding='utf-8') as target:
        json.dump(kept, target, ensure_ascii=False, indent=4)

    print(f"Đã lọc và lưu dữ liệu vào file {output_file}")

# Example usage: keep only the level-0 records of data.json and write them to data0.json.
input_json_file = r"D:\DATN\QA_System\casestudy\pdf\data.json"
output_json_file = r"D:\DATN\QA_System\casestudy\pdf\data0.json"
filter_json_by_level(input_json_file, output_json_file, target_level=0)

Đã lọc và lưu dữ liệu vào file D:\DATN\QA_System\casestudy\pdf\data0.json


In [6]:
import os

from huggingface_hub import login

# SECURITY: never commit an access token. The token that used to be hard-coded
# here must be treated as leaked and revoked in the Hugging Face account
# settings. Export HF_TOKEN before starting the notebook; when it is unset,
# login() falls back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))


In [7]:
from sentence_transformers import SentenceTransformer

# Smoke test: load the embedding model and check that encoding and pairwise
# similarity run end to end.
model = SentenceTransformer("dangvantuan/vietnamese-embedding")

# Three short sample sentences; only the shape of the result is inspected.
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium."
]
embeddings = model.encode(sentences)

# Compare every sentence embedding against every other one.
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# Expected shape: [3, 3] (one row and one column per sentence)

torch.Size([3, 3])


In [8]:
import json
import os
from time import time

from elasticsearch import Elasticsearch, helpers

from model_config import load_embedding_model_VN2, load_tokenizer2 , VietnameseEmbeddings

# SECURITY: credentials are read from the environment instead of being
# committed with the notebook. The API keys that used to be hard-coded here
# (including the one in the old commented-out client) must be treated as
# leaked and revoked.
es_api_key = os.environ.get("ES_API_KEY")
if not es_api_key:
    raise RuntimeError("Set the ES_API_KEY environment variable before running this cell.")

es = Elasticsearch(
    os.environ.get(
        "ES_URL",
        "https://my-elasticsearch-project-acc1b0.es.ap-southeast-1.aws.elastic.cloud:443",
    ),
    api_key=es_api_key,
)

embeddings = load_embedding_model_VN2()
tokenizer = load_tokenizer2()
json_file_path = r"D:\DATN\QA_System\casestudy\pdf\faq_data.json"
index_name = "faq_data"
mapping = {
    "mappings": {
        "properties": {
            "question": {"type": "text"},
            "answer": {"type": "text"},
            "category": {"type": "keyword"},
            # dims = dimensionality of the embedding vector
            "vector": {"type": "dense_vector", "dims": 768, "similarity": "cosine", "index": True}
        }
    }
}

# The index is rebuilt from scratch on every run: any existing index with this
# name (and all of its documents) is deleted first.
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)
es.indices.create(index=index_name, body=mapping)

def generate_actions(documents, max_tokens=254, truncate_to=236):
    """Yield one Elasticsearch bulk "index" action per FAQ record.

    Args:
        documents: Iterable of records with "question", "answer" and
            "category" keys. The records are not modified.
        max_tokens: Questions with more tokens than this are truncated.
        truncate_to: Number of leading tokens kept when truncating.
            Presumably lower than ``max_tokens`` to leave headroom for the
            embedding model's own special tokens — TODO confirm.

    Yields:
        Action dictionaries for ``helpers.bulk`` carrying the (possibly
        truncated) question, the answer, the category and the question's
        embedding.

    Raises:
        Exception: Whatever tokenising or embedding raises, re-raised after
            the failing document index has been printed.
    """
    truncated = 0
    for i, record in enumerate(documents):
        try:
            question = record["question"]
            # No special tokens here: otherwise decoding a truncated sequence
            # would write their literal text into the stored question.
            tokens = tokenizer.encode(question, add_special_tokens=False)

            if len(tokens) > max_tokens:
                question = tokenizer.decode(tokens[:truncate_to])
                truncated += 1
                print(f"Truncated document {i} from {len(tokens)} tokens to {truncate_to} tokens")

            embedding = embeddings.embed_query(question)
            print(f"Processing document {i}")
            action = {
                "_op_type": "index",
                "_index": index_name,
                "_source": {
                    "question": question,
                    "answer": record["answer"],
                    "category": record["category"],
                    "vector": embedding
                }
            }
        except Exception as e:
            print(f"Error processing document {i}: {str(e)}")
            raise  # bare raise keeps the original traceback
        yield action
    print(f"Total documents truncated: {truncated}")


# Load the records to index from the JSON file configured above.
with open(json_file_path, "r", encoding="utf-8") as f:
    json_data = json.load(f)

# Stream the generated actions to Elasticsearch and report the elapsed wall time.
start_time = time()
helpers.bulk(es, generate_actions(json_data))
print(f"Indexing completed. Time taken: {time() - start_time:.2f} seconds")

Initializing Vietnamese embedding model: dangvantuan/vietnamese-embedding
Processing document 0
Processing document 1
Total documents truncated: 0
Indexing completed. Time taken: 0.55 seconds


In [9]:
import json
import os
from time import time

from elasticsearch import Elasticsearch, helpers

from model_config import load_embedding_model_VN2, load_tokenizer2

# SECURITY: credentials are read from the environment instead of being
# committed with the notebook. The API keys that used to be hard-coded here
# (including the one in the old commented-out client) must be treated as
# leaked and revoked.
es_api_key = os.environ.get("ES_API_KEY")
if not es_api_key:
    raise RuntimeError("Set the ES_API_KEY environment variable before running this cell.")

es = Elasticsearch(
    os.environ.get(
        "ES_URL",
        "https://my-elasticsearch-project-acc1b0.es.ap-southeast-1.aws.elastic.cloud:443",
    ),
    api_key=es_api_key,
)

embeddings = load_embedding_model_VN2()
tokenizer = load_tokenizer2()
json_file_path = r"D:\DATN\QA_System\casestudy\pdf\finaldf.json"
index_name = "raptor"
mapping = {
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "metadata": {"type": "object"},
            "vector": {"type": "dense_vector", "dims": 768, "similarity": "cosine", "index": True}
        }
    }
}

# The index is rebuilt from scratch on every run: any existing index with this
# name (and all of its documents) is deleted first.
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)
es.indices.create(index=index_name, body=mapping)

def generate_actions(documents, max_tokens=254, truncate_to=236):
    """Yield one Elasticsearch bulk "index" action per RAPTOR record.

    Args:
        documents: Iterable of records with "text" and "metadata" keys. The
            records are not modified.
        max_tokens: Texts with more tokens than this are truncated.
        truncate_to: Number of leading tokens kept when truncating.
            Presumably lower than ``max_tokens`` to leave headroom for the
            embedding model's own special tokens — TODO confirm.

    Yields:
        Action dictionaries for ``helpers.bulk`` carrying the (possibly
        truncated) text, the metadata and the text's embedding.

    Raises:
        Exception: Whatever tokenising or embedding raises, re-raised after
            the failing document index has been printed.
    """
    truncated = 0
    for i, record in enumerate(documents):
        try:
            text = record["text"]
            tokens = tokenizer.encode(text, add_special_tokens=False)

            if len(tokens) > max_tokens:
                # Keep only the leading tokens and turn them back into text.
                text = tokenizer.decode(tokens[:truncate_to])
                truncated += 1
                print(f"Truncated document {i} from {len(tokens)} tokens to {truncate_to} tokens")

            embedding = embeddings.embed_query(text)
            print(f"Processing document {i}")
            action = {
                "_op_type": "index",
                "_index": index_name,
                "_source": {
                    "text": text,
                    "metadata": record["metadata"],
                    "vector": embedding
                }
            }
        except Exception as e:
            print(f"Error processing document {i}: {str(e)}")
            raise  # bare raise keeps the original traceback
        yield action
    print(f"Total documents truncated: {truncated}")


# Load the records to index from the JSON file configured above.
with open(json_file_path, "r", encoding="utf-8") as f:
    json_data = json.load(f)

# Stream the generated actions to Elasticsearch and report the elapsed wall time.
start_time = time()
helpers.bulk(es, generate_actions(json_data))
print(f"Indexing completed. Time taken: {time() - start_time:.2f} seconds")

Processing document 0
Processing document 1
Processing document 2
Processing document 3
Processing document 4
Processing document 5
Processing document 6
Processing document 7
Processing document 8
Truncated document 9 from 498 tokens to 236 tokens
Processing document 9
Processing document 10
Processing document 11
Processing document 12
Processing document 13
Processing document 14
Processing document 15
Processing document 16
Processing document 17
Processing document 18
Processing document 19
Processing document 20
Processing document 21
Processing document 22
Processing document 23
Processing document 24
Processing document 25
Processing document 26
Processing document 27
Processing document 28
Processing document 29
Processing document 30
Truncated document 31 from 280 tokens to 236 tokens
Processing document 31
Processing document 32
Processing document 33
Processing document 34
Processing document 35
Processing document 36
Processing document 37
Processing document 38
Processing

Token indices sequence length is longer than the specified maximum sequence length for this model (523 > 512). Running this sequence through the model will result in indexing errors


Processing document 55
Processing document 56
Truncated document 57 from 523 tokens to 236 tokens
Processing document 57
Processing document 58
Processing document 59
Processing document 60
Processing document 61
Processing document 62
Truncated document 63 from 295 tokens to 236 tokens
Processing document 63
Processing document 64
Processing document 65
Processing document 66
Truncated document 67 from 305 tokens to 236 tokens
Processing document 67
Processing document 68
Processing document 69
Processing document 70
Processing document 71
Processing document 72
Processing document 73
Processing document 74
Processing document 75
Processing document 76
Processing document 77
Processing document 78
Processing document 79
Processing document 80
Processing document 81
Processing document 82
Processing document 83
Processing document 84
Processing document 85
Processing document 86
Processing document 87
Processing document 88
Processing document 89
Processing document 90
Processing docum