-
Notifications
You must be signed in to change notification settings - Fork 0
/
process_markdown.py
73 lines (61 loc) · 2.15 KB
/
process_markdown.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import markdown
import os
import re
from bs4 import BeautifulSoup
# Regex for dates written like "Jan 5, 2024": a capitalized three-letter token,
# a 1-2 digit day followed by a comma, and a four-digit year. The first group
# accepts any capital letter plus two lowercase letters, not only real month
# abbreviations.
DATE_PATTERN = r"([A-Z][a-z]{2})\s+(\d{1,2}),\s+(\d{4})"
def extract_text_from_markdown(file_path):
    """Extract the paragraphs of a single Markdown file as text blocks.

    The file is rendered to HTML with ``markdown`` and parsed with
    BeautifulSoup. Every ``<p>`` element becomes one entry, except paragraphs
    that are empty or whitespace-only, and paragraphs whose text starts with
    a date matching ``DATE_PATTERN`` (e.g. "Jan 5, 2024").

    Args:
        file_path: Path of the .md file to read (decoded as UTF-8).

    Returns:
        A list of dictionaries, one per kept paragraph, with keys:
          - 'Title': text of the first ``<h1>`` in the document, or the file
            name without its extension when the document has no ``<h1>``.
          - 'Paragraph_id': 1-based position among the kept paragraphs,
            as a string.
          - 'Text': the paragraph text with trailing whitespace removed.
          - 'Embedding': always None here; it is meant to be filled in later
            when embeddings are computed.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        markdown_text = file.read()

    # Convert Markdown to HTML, then parse the HTML to get at the elements.
    html = markdown.markdown(markdown_text)
    soup = BeautifulSoup(html, 'html.parser')

    # find() returns None when there is no <h1>; fall back to the file name
    # instead of failing with AttributeError on documents without a heading.
    heading = soup.find('h1')
    if heading is not None:
        title = heading.get_text()
    else:
        title = os.path.splitext(os.path.basename(file_path))[0]

    document_paragraphs = []
    for paragraph in soup.find_all('p'):
        raw_text = paragraph.get_text()

        # Skip paragraphs with no visible content (empty, newlines, spaces).
        if not raw_text.strip():
            continue
        # re.match anchors at the start only, so any paragraph that *begins*
        # with a date is skipped, even if more text follows.
        if re.match(DATE_PATTERN, raw_text) is not None:
            continue

        document_paragraphs.append({
            "Title": title,
            "Paragraph_id": str(len(document_paragraphs) + 1),
            "Text": raw_text.rstrip(),
            "Embedding": None,
        })
    return document_paragraphs
def process_markdown_folder(folder_path):
    """Process every .md file located directly inside ``folder_path``.

    Files are visited in sorted name order so the result is reproducible;
    subfolders are not traversed.

    Args:
        folder_path: Path of the folder that contains the .md files.

    Returns:
        A list of dictionaries with the text of all files separated by
        paragraph, as produced by ``extract_text_from_markdown``:
        [
            {
                "Title": "<title>",
                "Paragraph_id": "<paragraph number>",
                "Text": "<text in paragraph>",
                "Embedding": None
            }
        ]
        If ``folder_path`` is not an existing directory, a message is printed
        and an empty list is returned.
    """
    all_texts = []
    # isdir (rather than exists) also rejects a path that points at a regular
    # file, which would otherwise make os.listdir raise NotADirectoryError.
    if not os.path.isdir(folder_path):
        print(f"Folder '{folder_path}' does not exist.")
        return all_texts

    # os.listdir returns names in arbitrary order; sort for stable output.
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith('.md'):
            file_path = os.path.join(folder_path, filename)
            all_texts.extend(extract_text_from_markdown(file_path))
    return all_texts