# Philosophy Textbook DB

In [None]:
# Set parameters.
# The API key and the data/results locations are read from small text files
# under info/ instead of being written into the notebook itself.


def _read_setting(path):
    """Return the contents of the text file at ``path``, stripped of surrounding whitespace.

    The file is opened in a ``with`` block so the handle is closed even when
    reading raises. The value is stripped because a trailing newline (which
    many editors append) would otherwise become part of the API key and of
    every path that later cells build by concatenating onto these settings.
    """
    with open(path, "r") as handle:
        return handle.read().strip()


api_key = _read_setting("info/api.txt")
data_path = _read_setting("info/datapath.txt")
results_path = _read_setting("info/resultspath.txt")

In [None]:
from langchain_upstage import UpstageLayoutAnalysisLoader
import os
from langchain_upstage import UpstageEmbeddings

# Key passed to the Upstage loader below; it is the value read in the parameters cell.
UPSTAGE_API_KEY = api_key

# Textbooks that can be processed; this run uses the entry at index 2.
booklist = ["Introduction_to_Philosophy", "Introduction_to_Formal_Logic", "Philosophy_of_Religion"]
book = booklist[2]

# Point the layout-analysis loader at the selected PDF and request text output.
pdf_path = os.path.join(data_path, f'philo_textbook/{book}.pdf')
layzer = UpstageLayoutAnalysisLoader(
    api_key=UPSTAGE_API_KEY,
    file_path=pdf_path,
    output_type="text",
)
docs = layzer.load()  # or layzer.lazy_load()

# Two embedding clients are created: one configured with the query model and
# one with the passage model.
query_embeddings = UpstageEmbeddings(api_key=api_key, model="solar-embedding-1-large-query")
passage_embeddings = UpstageEmbeddings(api_key=api_key, model="solar-embedding-1-large-passage")

## 2. Text Split

In [None]:
from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter,
)

# Chunking parameters. chunk_size is also embedded in the names of the files
# written later, so changing it produces a separate set of output files.
chunk_size = 4000
# Overlap is 10% of the chunk size. Integer division keeps the value an int
# (400); `chunk_size * 0.1` produced the float 400.0 for what is a count.
chunk_overlap = chunk_size // 10

# 2. Split
# NOTE(review): Language.HTML selects HTML-oriented separators. If `docs`
# holds plain text rather than HTML markup, those separators may never match
# and the splitter may fall back to much cruder boundaries — confirm that the
# language setting agrees with the loader's output format.
text_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.HTML, chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
splits = text_splitter.split_documents(docs)
print("Splits:", len(splits))

In [None]:
# Spot check: print the split at index 5, then show the character length of
# the split at index 1 as the cell's output.
# NOTE(review): the two lines look at different splits (5 and 1) — confirm
# that this is intended.
print(splits[5])
second_split_text = splits[1].page_content
len(second_split_text)

In [None]:
# Collect the text of every split from index 5 onward.
# The initial splits are excluded as they are irrelevant to the content.
chunk = [split.page_content for split in splits[5:]]

In [None]:
# Embed all chunk texts with the passage embedding client. The result is
# stored alongside `chunk` by a later cell, so it is presumably one embedding
# per chunk in the same order — TODO confirm against the embeddings API.
embedded_documents = passage_embeddings.embed_documents(chunk)


In [None]:
# Show only the size of the result (number of embeddings, length of the first
# one) instead of echoing every vector into the notebook output, which bloats
# the file and buries the rest of the notebook.
(len(embedded_documents), len(embedded_documents[0]) if len(embedded_documents) else 0)

In [None]:
import numpy as np

# Destination files for this book; the names carry the chunk size and book
# title. data_path is concatenated directly, so it must already end with a
# path separator. np.save adds the ".npy" extension when it is missing.
chunk_file = data_path+f'embedding/chunked{chunk_size}_Textbook_{book}'
embedded_file = data_path+f'embedding/embedded{chunk_size}_Textbook_{book}'

# Write the chunk texts and their embeddings as two separate arrays.
np.save(chunk_file, np.array(chunk))
np.save(embedded_file, np.array(embedded_documents))

print("1D list saved successfully!")

How to load the saved embedding list:
```
# Load the .npy file (np.load returns a NumPy array)
loaded_array = np.load('my_list.npy')

# Convert the NumPy array to a Python list
restored_list = loaded_array.tolist()

```

In [None]:
import numpy as np

# Merge the per-book chunk and embedding files into one combined pair of files.
booklist = ["Introduction_to_Philosophy", "Introduction_to_Formal_Logic", "Philosophy_of_Religion"]


full_philosophy = []        # chunk texts of all books, in booklist order
full_philosophy_embed = []  # embeddings, index-aligned with full_philosophy
for book in booklist:
    # Load the .npy file (np.load returns a NumPy array) and convert it to a list.
    # The "4000" in the file names is the chunk size the files were saved with.
    textbookDB = np.load(data_path+f'embedding/chunked4000_Textbook_{book}.npy').tolist()
    print(len(textbookDB))

    textbookDB_embed = np.load(data_path+f'embedding/embedded4000_Textbook_{book}.npy').tolist()

    # Texts and embeddings are paired by position, so a length mismatch means
    # the two files do not belong together; stop rather than misalign them.
    if len(textbookDB) != len(textbookDB_embed):
        raise ValueError(
            f"{book}: {len(textbookDB)} chunks but {len(textbookDB_embed)} embeddings"
        )

    full_philosophy.extend(textbookDB)
    full_philosophy_embed.extend(textbookDB_embed)


# Save the merged lists. The earlier version saved `chunk` and
# `embedded_documents` here, i.e. whatever single book an earlier cell had
# left in memory, instead of the combined data built above.
np.save(data_path+'embedding/full_philosophy_textbook', np.array(full_philosophy))
np.save(data_path+'embedding/full_philosophy_textbook_embed', np.array(full_philosophy_embed))

In [None]:
# Number of chunk texts accumulated in full_philosophy across all books.
len(full_philosophy)