# Download dataset

In [1]:
from datasets import load_dataset

# dataset link: https://huggingface.co/datasets/hugfaceguy0001/stanford_plato
dataset = load_dataset("hugfaceguy0001/stanford_plato", cache_dir="../data/stanford/")

In [2]:
# To save time we select 300 items
# Ends with article "decision theory"
dataset_subset = dataset['train'].select(range(300)) 

This dataset includes 1776 articles, each explaining one philosophy term/people/topic. It has 8 features:

* shorturl: The shorturl for the article. For example, the shorturl 'abduction' correspond to the page https://plato.stanford.edu/entries/abduction/
* title: The title of the article.
* pubinfo: The publication information.
* preamble: The preface text of the article. The data is a list, each item of the list is a paragraph of the data. I choose not to break the paragraph structure. Certainly, you can merge them by, for example, ''.join(data['preamble'])
toc: Table of contents. Also represented as list. Each item is a dictionary, the 'content_title' is the main content title, and the 'sub_toc' is a list of subcontent titles.
* main_text: The main text of the article. The data is also a list, each item represents a section of the article. Each item is a dictionary, 'section_title' is the title of the section, 'main_content' is a list of paragraphs before subsections, 'subsections' is a list of subsections, each item is also a dictionary, has its own title 'subsection_title' and list of paragraphs 'content'.
* bibliography: list of bibliography.
* related_entries: list of entries related to the current entry.

In [3]:
df = dataset_subset.to_pandas()
df.to_csv("../data/stanford_plato_train.csv", index=False)

In [4]:
df

Unnamed: 0,shorturl,title,pubinfo,preamble,toc,main_text,bibliography,related_entries
0,abduction,Abduction,"First published Wed Mar 9, 2011; substantive r...","[\nIn the philosophical literature, the term “...",[{'content_title': '1. Abduction: The General ...,[{'main_content': [' You happen to know that T...,"[Achinstein, P., 2001. The Book of Evidence, O...","[{'href': '../epistemology-bayesian/', 'text':..."
1,abelard,Peter Abelard,"First published Tue Aug 3, 2004; substantive r...",[\nPeter Abelard (1079–21 April 1142) [‘Abaila...,"[{'content_title': '1. Life and Works', 'sub_t...","[{'main_content': [], 'section_title': '1. Lif...",[Carmen ad Astralabium. Edited by J. M. A.\nRu...,"[{'href': '../aristotle-logic/', 'text': 'Aris..."
2,abhidharma,Abhidharma,"First published Mon Aug 16, 2010; substantive ...",[\nThe first centuries after Śākyamuni Buddha’...,[{'content_title': '1. Abhidharma: its origins...,[{'main_content': [' The early history of Budd...,"[[A] Aṅguttara-nikāya, [Dhs-a] Atthasālinī\n(D...","[{'href': '../atomism-modern/', 'text': 'atomi..."
3,abilities,Abilities,"First published Tue Jan 26, 2010; substantive ...","[\nIn the accounts we give of one another, cla...","[{'content_title': '1. A taxonomy', 'sub_toc':...",[{'main_content': [' What is an ability? On on...,"[Albritton, Rogers, 1985. “Freedom of Will and...","[{'href': '../action/', 'text': 'action'}, {'h..."
4,abner-burgos,Abner of Burgos,"First published Mon Jul 9, 2012; substantive r...",[\nAbner of Burgos (Alfonso de Valladolid; c. ...,"[{'content_title': '1. Life', 'sub_toc': []}, ...",[{'main_content': [' There are not many source...,"[Meyasher aqob, G. Gluskina (ed. and trans.), ...","[{'href': '../aristotle-natphil/', 'text': 'Ar..."
...,...,...,...,...,...,...,...,...
295,abraham-daud,Abraham Ibn Daud,"First published Sat Aug 26, 2006; substantive ...",[\n\nAbraham ibn Daud (c.1110–1180) can be reg...,"[{'content_title': '1. Introduction', 'sub_toc...",[{'main_content': [' In the introduction to h...,"[Cohen, G.D. (ed.), 1967, A critical edition w...","[{'href': '../arabic-islamic-judaic/', 'text':..."
296,david,David,"First published Mon Sep 8, 2003; substantive r...",[\n\n‘David’ is named in certain manuscripts o...,"[{'content_title': '1. Introduction', 'sub_toc...",[{'main_content': [' Byzantium in the 6th cen...,"[Aversatjan, S., 1981. “David l’Invincible et ...","[{'href': '../ammonius/', 'text': 'Ammonius'},..."
297,davidson,Donald Davidson,"First published Wed May 29, 1996; substantive ...",[\nDonald Davidson was one of the most importa...,"[{'content_title': '1. Biographical Sketch', '...",[{'main_content': [' Donald Herbert Davidson w...,"[1957, Decision-Making: An Experimental Approa...","[{'href': '../action/', 'text': 'action'}, {'h..."
298,death,Death,"First published Wed May 22, 2002; substantive ...",[\nThis article considers several questions co...,"[{'content_title': '1. Life', 'sub_toc': ['1.1...",[{'main_content': [' To die is to cease to be ...,"[Altshuler, R., 2016. “Immortality, Identity, ...","[{'href': '../death-definition/', 'text': 'dea..."


In [5]:
example_article = dataset_subset[0]
example_article

{'shorturl': 'abduction',
 'title': 'Abduction',
 'pubinfo': 'First published Wed Mar 9, 2011; substantive revision Tue May 18, 2021',
 'preamble': ['\nIn the philosophical literature, the term “abduction” is\nused in two related but different senses. In both senses, the term\nrefers to some form of explanatory reasoning. However, in the\nhistorically first sense, it refers to the place of explanatory\nreasoning in generating hypotheses, while in the sense in\nwhich it is used most frequently in the modern literature it refers to\nthe place of explanatory reasoning in justifying hypotheses.\nIn the latter sense, abduction is also often called “Inference\nto the Best Explanation.”',
  '\nThis entry is exclusively concerned with abduction in the modern\nsense, although there is a supplement on abduction in the historical\nsense, which had its origin in the work of Charles Sanders\nPeirce—see the',
  '\nSee also the entry on\n scientific discovery,\n in particular the section on discovery

# Prepare dataset: chuncking, add metadata, get embeddings

* To effectively search by philosophical concepts or philosophers, you need to classify the articles in the dataset into categories. Each article should be tagged with key terms, philosophical schools, or specific philosophers.
* To organize proper navigation through the articles, you can include this information in the search index (add headings to metadata for quick search by sections).

`TODO`for adding philosophical schools to metadata, you may need a dictionary search ("The Dictionary of Philosophy" или "The Cambridge Dictionary of Philosophy", Wikipedia: "Philosophical Schools"/"Philosophical Terms"). Because it's working bad: 
```python
elif ent.label_ in ["ORG", "NORP", "LANGUAGE"]:  # Organizations and religious/political groups
   concepts.add(ent.text)
```

In [6]:
import os
from datasets import load_dataset
from elasticsearch import Elasticsearch, helpers
import nltk
from nltk.tokenize import sent_tokenize
import spacy
import json
from tqdm import tqdm
from typing import List, Dict, Any
import hashlib
import pandas as pd

In [7]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m eta [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [8]:
nltk.download('punkt')

# Load spaCy model for named entity recognition
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
# entity recognition and chunking
class PhilosophyDataProcessor:
    def extract_entities(self, text: str) -> tuple:
        doc = nlp(text)
        philosophers = set()
        concepts = set()
        
        for ent in doc.ents:
            if ent.label_ == "PERSON":
                philosophers.add(ent.text)
            elif ent.label_ in ["ORG", "NORP", "LANGUAGE"]:  # Organizations and religious/political groups
                concepts.add(ent.text)
        
        return list(philosophers), list(concepts)

    def create_chunk_id(self, text: str, article_title: str, chunk_type: str) -> str:
        # Create a unique identifier for the chunk
        content = f"{text}{article_title}{chunk_type}"
        return hashlib.md5(content.encode()).hexdigest()

    def process_article(self, article: Dict[str, Any]) -> List[Dict[str, Any]]:
        chunks = []
        
        # Process preamble
        preamble_text = ' '.join(article['preamble'])
        philosophers, concepts = self.extract_entities(preamble_text)
        
        # Create full article chunk
        full_article_text = preamble_text
        for section in article['main_text']:
            section_text = ' '.join(section['main_content'])
            full_article_text += ' ' + section_text
        
        full_article_chunk = {
            'chunk_id': self.create_chunk_id(full_article_text, article['title'], 'full_article'),
            'article_title': article['title'],
            'section_path': [],
            'chunk_type': 'full_article',
            'philosophers': philosophers,
            'concepts': concepts,
            'content': full_article_text
        }
        chunks.append(full_article_chunk)
        
        # Process sections
        for section in article['main_text']:
            section_text = ' '.join(section['main_content'])
            section_philosophers, section_concepts = self.extract_entities(section_text)
            
            section_chunk = {
                'chunk_id': self.create_chunk_id(section_text, article['title'], 'section'),
                'article_title': article['title'],
                'section_path': [section['section_title']],
                'chunk_type': 'section',
                'philosophers': list(set(philosophers + section_philosophers)),
                'concepts': list(set(concepts + section_concepts)),
                'content': section_text
            }
            chunks.append(section_chunk)
            
            # Process paragraphs
            for paragraph in section['main_content']:
                if len(paragraph.split()) > 20:  
                    para_philosophers, para_concepts = self.extract_entities(paragraph)
                    
                    paragraph_chunk = {
                        'chunk_id': self.create_chunk_id(paragraph, article['title'], 'paragraph'),
                        'article_title': article['title'],
                        'section_path': [section['section_title']],
                        'chunk_type': 'paragraph',
                        'philosophers': list(set(philosophers + section_philosophers + para_philosophers)),
                        'concepts': list(set(concepts + section_concepts + para_concepts)),
                        'content': paragraph
                    }
                    chunks.append(paragraph_chunk)

            # Process subsections
            for subsection in section.get('subsections', []):
                subsection_text = ' '.join(subsection['content'])
                subsection_philosophers, subsection_concepts = self.extract_entities(subsection_text)
                
                subsection_chunk = {
                    'chunk_id': self.create_chunk_id(subsection_text, article['title'], 'subsection'),
                    'article_title': article['title'],
                    'section_path': [section['section_title'], subsection['subsection_title']],
                    'chunk_type': 'subsection',
                    'philosophers': list(set(philosophers + section_philosophers + subsection_philosophers)),
                    'concepts': list(set(concepts + section_concepts + subsection_concepts)),
                    'content': subsection_text
                }
                chunks.append(subsection_chunk)
        return chunks


In [10]:
# Initialize processor
processor = PhilosophyDataProcessor()

# Process one article as an example
example_article = dataset_subset[0]
chunks = processor.process_article(example_article)

In [11]:
df = pd.DataFrame(chunks)
df

Unnamed: 0,chunk_id,article_title,section_path,chunk_type,philosophers,concepts,content
0,92dd2f6b5d033c12efb2f8ee3047601e,Abduction,[],full_article,[Charles Sanders\nPeirce],[Inference],"\nIn the philosophical literature, the term “a..."
1,a3b5103db83e97338eb7270a86a63c65,Abduction,[1. Abduction: The General Idea],section,"[Tim, Harry’s, Harry, Charles Sanders\nPeirce,...","[the Best\nExplanation, Hilary Putnam’s, Infer...",\nYou happen to know that Tim and Harry have r...
2,81e573a9d329de10a9651c19c4907dc2,Abduction,[1. Abduction: The General Idea],paragraph,"[Tim, Harry’s, Harry, Charles Sanders\nPeirce,...","[the Best\nExplanation, Hilary Putnam’s, Infer...",\nYou happen to know that Tim and Harry have r...
3,1738ba0482fa09f893136ec9e10ebff5,Abduction,[1. Abduction: The General Idea],paragraph,"[Tim, Harry’s, Harry, Charles Sanders\nPeirce,...","[the Best\nExplanation, Hilary Putnam’s, Infer...",\nOne morning you enter the kitchen to find a ...
4,34bf01c649b729cd3d740f4d5a7cdadf,Abduction,[1. Abduction: The General Idea],paragraph,"[Tim, Harry’s, Harry, Charles Sanders\nPeirce,...","[the Best\nExplanation, Hilary Putnam’s, Infer...","\nWalking along the beach, you see what looks ..."
5,501b7b05cb58454829b17faa1ca0b67f,Abduction,[1. Abduction: The General Idea],paragraph,"[Tim, Harry’s, Harry, Charles Sanders\nPeirce,...","[the Best\nExplanation, Hilary Putnam’s, Infer...","\nIn these examples, the conclusions do not fo..."
6,6c943762de83a112fe91d3de929913e8,Abduction,"[1. Abduction: The General Idea, 1.1 Deduction...",subsection,"[Louise, Tim, Harman, Harry’s, John, Bs, Harry...","[Winston\nChurchill, Hilary Putnam’s, Dutch, I...",\nAbduction is normally thought of as being on...
7,806563744b1ad2c6fdc20c84694a9061,Abduction,"[1. Abduction: The General Idea, 1.2 The ubiqu...",subsection,"[Josephson, Harman, Harry’s, Pargetter 1984, P...","[Dascal, Thomson, Stanford, Goldman, Winston\n...",\nThe type of inference exemplified in the cas...
8,2e6cd6b7abb9bc8faf2847668c92552a,Abduction,[2. Explicating Abduction],section,"[Douven, Koslowski, Williams, Patricia Mirabil...","[Bayesian, ABD3, Lipton, Peirce, H1,, Igor Dou...",\nPrecise statements of what abduction amounts...
9,fb65974393a7620588b7c530d53e9082,Abduction,[2. Explicating Abduction],paragraph,"[Douven, Koslowski, Williams, Patricia Mirabil...","[Bayesian, ABD3, Lipton, Peirce, H1,, Igor Dou...",\nPrecise statements of what abduction amounts...


In [12]:
df[df['content'].str.contains("What leads you to the conclusion", case=False, na=False)]

Unnamed: 0,chunk_id,article_title,section_path,chunk_type,philosophers,concepts,content
0,92dd2f6b5d033c12efb2f8ee3047601e,Abduction,[],full_article,[Charles Sanders\nPeirce],[Inference],"\nIn the philosophical literature, the term “a..."
1,a3b5103db83e97338eb7270a86a63c65,Abduction,[1. Abduction: The General Idea],section,"[Tim, Harry’s, Harry, Charles Sanders\nPeirce,...","[the Best\nExplanation, Hilary Putnam’s, Infer...",\nYou happen to know that Tim and Harry have r...
5,501b7b05cb58454829b17faa1ca0b67f,Abduction,[1. Abduction: The General Idea],paragraph,"[Tim, Harry’s, Harry, Charles Sanders\nPeirce,...","[the Best\nExplanation, Hilary Putnam’s, Infer...","\nIn these examples, the conclusions do not fo..."


In [18]:
all_chunks = []


for article in tqdm(dataset_subset, desc="Processing articles"):
     chunks = processor.process_article(article)
     all_chunks.append(chunks)


In [10]:
import os
import json
import pandas as pd
from tqdm import tqdm
from itertools import chain

file_path = '../data/all_chunks_flatten.json'

# Пытаемся загрузить данные из файла, если он существует
if os.path.exists(file_path):
    print(f"Загрузка данных из {file_path}...")
    df = pd.read_json(file_path)
    all_chunks = df.to_dict(orient='records')  # Преобразуем DataFrame обратно в список словарей
else:
    print("Файл не найден, выполнение обработки статей...")
    all_chunks = []  
    for article in tqdm(dataset_subset, desc="Processing articles"):
        chunks = processor.process_article(article) 
        all_chunks.append(chunks)  
    
    def flatten_chain(matrix):
        return list(chain.from_iterable(matrix))

    flatten = flatten_chain(all_chunks)

    # Преобразуем в DataFrame и сохраняем в JSON, если нужно
    df = pd.DataFrame(flatten)
    df.to_json(file_path, orient='records', indent=4, force_ascii=False)
    print(f"Обработанные данные сохранены в {file_path}")

Загрузка данных из ../data/all_chunks_flatten.json...


In [12]:
df = pd.DataFrame(all_chunks)
df[:3]

Unnamed: 0,chunk_id,article_title,section_path,chunk_type,philosophers,concepts,content
0,92dd2f6b5d033c12efb2f8ee3047601e,Abduction,[],full_article,[Charles Sanders\nPeirce],[Inference],"\nIn the philosophical literature, the term “a..."
1,a3b5103db83e97338eb7270a86a63c65,Abduction,[1. Abduction: The General Idea],section,"[Tim, Harry’s, Harry, Charles Sanders\nPeirce,...","[the Best\nExplanation, Hilary Putnam’s, Infer...",\nYou happen to know that Tim and Harry have r...
2,81e573a9d329de10a9651c19c4907dc2,Abduction,[1. Abduction: The General Idea],paragraph,"[Tim, Harry’s, Harry, Charles Sanders\nPeirce,...","[the Best\nExplanation, Hilary Putnam’s, Infer...",\nYou happen to know that Tim and Harry have r...


## Create Embeddings using Pretrained Models

In [7]:
! pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.38.0 (from sentence_transformers)
  Downloading transformers-4.45.1-py3-none-any.whl.metadata (44 kB)
Collecting safetensors>=0.4.1 (from transformers<5.0.0,>=4.38.0->sentence_transformers)
  Downloading safetensors-0.4.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers<5.0.0,>=4.38.0->sentence_transformers)
  Downloading tokenizers-0.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading sentence_transformers-3.1.1-py3-none-any.whl (245 kB)
Downloading transformers-4.45.1-py3-none-any.whl (9.9 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
[?25hDownloading safetensors-0.4.5-cp312-cp312-manylinux_2_17_x86_64.man

In [13]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-mpnet-base-v2")

  from tqdm.autonotebook import tqdm, trange


In [14]:
vectorized_articles = []

In [15]:
import concurrent.futures
from tqdm import tqdm

def process_doc(doc):
    section_path = doc['section_path']
    content = doc['content']
    section_path = ' '.join(section_path)
    content = ' '.join(content)
    
    doc['vector_sectionPath_content'] = model.encode(section_path + ' ' + content)
    return doc

def parallel_processing_doc(chunks_to_process):
    # Parallel processing with ThreadPoolExecutor
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_doc, doc) for doc in chunks_to_process]

        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            vectorized_articles.append(future.result())

In [1]:
### Note: I couldn't wait, so I only processed some of the elements. You can do it too.

In [16]:
chunks_from = 7237
chunks_to = len(all_chunks)
parallel_processing_doc(all_chunks[chunks_from:chunks_to])

 50%|███████████████████████████████████▏                                   | 7237/14578 [48:31<49:13,  2.49it/s]


KeyboardInterrupt: 

In [17]:
len(vectorized_articles)

7237

In [19]:
df = pd.DataFrame(vectorized_articles)
df.to_json('../data/vectorized_articles.json')
df

Unnamed: 0,chunk_id,article_title,section_path,chunk_type,philosophers,concepts,content,vector_sectionPath_content
0,81e573a9d329de10a9651c19c4907dc2,Abduction,[1. Abduction: The General Idea],paragraph,"[Tim, Harry’s, Harry, Charles Sanders\nPeirce,...","[the Best\nExplanation, Hilary Putnam’s, Infer...",\nYou happen to know that Tim and Harry have r...,"[0.04034751, -0.010665092, 0.017695744, -0.001..."
1,1738ba0482fa09f893136ec9e10ebff5,Abduction,[1. Abduction: The General Idea],paragraph,"[Tim, Harry’s, Harry, Charles Sanders\nPeirce,...","[the Best\nExplanation, Hilary Putnam’s, Infer...",\nOne morning you enter the kitchen to find a ...,"[0.039968688, -0.035628095, 0.010803069, -0.00..."
2,806563744b1ad2c6fdc20c84694a9061,Abduction,"[1. Abduction: The General Idea, 1.2 The ubiqu...",subsection,"[Josephson, Harman, Harry’s, Pargetter 1984, P...","[Dascal, Thomson, Stanford, Goldman, Winston\n...",\nThe type of inference exemplified in the cas...,"[0.049834765, -0.0046678144, 0.013743385, -0.0..."
3,34bf01c649b729cd3d740f4d5a7cdadf,Abduction,[1. Abduction: The General Idea],paragraph,"[Tim, Harry’s, Harry, Charles Sanders\nPeirce,...","[the Best\nExplanation, Hilary Putnam’s, Infer...","\nWalking along the beach, you see what looks ...","[0.03520298, -0.013972833, 0.021752857, 0.0023..."
4,6c943762de83a112fe91d3de929913e8,Abduction,"[1. Abduction: The General Idea, 1.1 Deduction...",subsection,"[Louise, Tim, Harman, Harry’s, John, Bs, Harry...","[Winston\nChurchill, Hilary Putnam’s, Dutch, I...",\nAbduction is normally thought of as being on...,"[0.021810336, 0.0011034051, 0.017931415, 0.011..."
...,...,...,...,...,...,...,...,...
7232,02cd9ef520b1d9b6c9bc1c278c858364,Francis Herbert Bradley,[5. Logic],paragraph,"[Hermann Lotze, Hegel, Bertrand\nRussell, McTa...","[Hegelian, British, Humean, British Idealists,...",\nThis may be called the law of Redintegration...,"[-0.040556494, 0.010700114, -0.008341639, 0.01..."
7233,ec45ed3889690568cd9029036fda09f3,Francis Herbert Bradley,[5. Logic],paragraph,"[Hermann Lotze, Hegel, Bertrand\nRussell, McTa...","[Hegelian, British, Humean, British Idealists,...",\nBradley’s own account of judgment is that it...,"[0.006766742, 0.026081122, 0.020721044, 0.0448..."
7234,679a9a016377af62b58a9b542e3ab962,Francis Herbert Bradley,[5. Logic],paragraph,"[Hermann Lotze, Hegel, Bertrand\nRussell, McTa...","[Hegelian, British, Humean, British Idealists,...",\nBradley continues to criticize traditional l...,"[-0.0149703985, 0.0056009805, -0.0011972125, 0..."
7235,2d50d20ecdf273ef8eb0de14d0af5463,Francis Herbert Bradley,[5. Logic],paragraph,"[Hermann Lotze, Hegel, Bertrand\nRussell, McTa...","[Hegelian, British, Humean, British Idealists,...",\nDespite these significant steps in the direc...,"[-0.012884118, 0.020164268, -0.001750762, 0.01..."


## ElasticSearch Ingestion (with Vector Indexing)

Running ElasticSearch locally (you don't need it, if you use my hosted elastic):
```
docker run -it \
    --rm \
    --name elasticsearch \
    -m 4GB \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.4.3
```

In [1]:
!pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [12]:
import os
from dotenv import load_dotenv
from elasticsearch import Elasticsearch, helpers

load_dotenv()
print(f"{os.getenv('ELASTIC_HOST')}:{os.getenv('ELASTIC_PORT')}")

#es_client = Elasticsearch(f"{os.getenv('ELASTIC_HOST')}:{os.getenv('ELASTIC_PORT')}", api_key=os.getenv('ELASTIC_API_KEY'))
#es_client.info()

None:None


In [None]:
import os
import json
import pandas as pd

# Путь к файлу JSON
file_path = '../data/vectorized_articles.json'

# Пытаемся загрузить данные из файла, если он существует
if os.path.exists(file_path):
    print(f"Загрузка данных из {file_path}...")
    df = pd.read_json(file_path)  # Загружаем JSON в DataFrame
    to_index_vectorized_articles = df.to_dict(orient='records')  # Преобразуем DataFrame в список словарей
else:
    print("Файл не найден, создайте или загрузите данные для индексации.")
    to_index_vectorized_articles = []  # Пустой список или замена на ваш процесс получения данных

# Проверяем, есть ли данные для индексации
if to_index_vectorized_articles:
    # Выполняем индексацию в Elasticsearch
    index_articles(es_client, index_name, to_index_vectorized_articles)
    print("Индексация завершена.")
else:
    print("Нет данных для индексации.")

In [88]:
from elasticsearch import Elasticsearch, helpers
from tqdm.auto import tqdm

class ElasticIngestion:
    def __init__(self, es_client): 
        self.index_name = "philosophy"
        self.setup_elasticsearch()

    def setup_elasticsearch(self):
        # Define the index mapping
        index_settings = {
            "settings": {
                "number_of_shards": 1,
                "number_of_replicas": 0
            },
            "mappings": {
                "properties": {
                    "chunk_id": {"type": "keyword"},
                    "article_title": {"type": "text"},
                    "section_path": {"type": "keyword"},
                    "chunk_type": {"type": "keyword"},
                    "philosophers": {"type": "keyword"},
                    "concepts": {"type": "keyword"},
                    "content": {"type": "text"},
                    "vector_sectionPath_content": {
                        "type": "dense_vector",
                        "dims": 768, # for all-mpnet-base-v2 model,
                        "index": True, 
                        "similarity": "cosine"
                    }
                }
            }
        }
        # Create the index if it doesn't exist
        if not es_client.indices.exists(index=self.index_name):
            es_client.indices.create(index=self.index_name, body=index_settings)

### Add chunks to index

In [25]:
from tqdm import tqdm
from elasticsearch import helpers, Elasticsearch

def index_articles(es_client, index_name, vectorized_articles):
    actions = [
        {
            "_index": index_name,
            "_id": chunk['chunk_id'],
            "_source": chunk
        }
        for chunk in vectorized_articles
    ]
    
    for ok, response in tqdm(helpers.streaming_bulk(es_client, actions), total=len(actions), desc="Indexing articles"):
        if not ok:
            print(f"Failed to index document: {response}")

index_name = 'philosophy'

# Ingest vectorized articles with tqdm progress tracking
index_articles(es_client, index_name, vectorized_articles)

Indexing articles: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7237/7237 [01:23<00:00, 86.52it/s]


In [26]:
# Simple Test query to get one document
response = es_client.search(
    index="philosophy",
    body={
        "query": {
            "match_all": {}
        },
        "size": 1
    }
)
print("First document in Elasticsearch:")
json.dumps(response['hits']['hits'][0]['_source'], indent=2)

First document in Elasticsearch:


'{\n  "chunk_id": "81e573a9d329de10a9651c19c4907dc2",\n  "article_title": "Abduction",\n  "section_path": [\n    "1. Abduction: The General Idea"\n  ],\n  "chunk_type": "paragraph",\n  "philosophers": [\n    "Tim",\n    "Harry\\u2019s",\n    "Harry",\n    "Charles Sanders\\nPeirce",\n    "Churchill"\n  ],\n  "concepts": [\n    "the Best\\nExplanation",\n    "Hilary Putnam\\u2019s",\n    "Inference",\n    "Winston\\nChurchill"\n  ],\n  "content": "\\nYou happen to know that Tim and Harry have recently had a terrible row\\nthat ended their friendship. Now someone tells you that she just saw\\nTim and Harry jogging together. The best explanation for this that you\\ncan think of is that they made up. You conclude that they are friends\\nagain.",\n  "vector_sectionPath_content": [\n    0.04034750908613205,\n    -0.010665091685950756,\n    0.017695743590593338,\n    -0.0017783143557608128,\n    -0.054355766624212265,\n    -0.006915545556694269,\n    0.01386959757655859,\n    -0.0185397062450