In [2]:
import logging

# Root logger: WARNING and above, using a "<level> - <logger name> -  <message>" layout.
LOG_FORMAT = "%(levelname)s - %(name)s -  %(message)s"
logging.basicConfig(level=logging.WARNING, format=LOG_FORMAT)

# The "haystack" logger alone is lowered to INFO.
haystack_logger = logging.getLogger("haystack")
haystack_logger.setLevel(logging.INFO)

In [5]:
from haystack.utils import fetch_archive_from_http

# Local folder that receives the sample documents used by the cells below.
doc_dir = "data/test_docs"

# Remote archive holding the sample documents (adjacent literals are joined
# into one URL string).
s3_url = (
    "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/"
    "datasets/documents/preprocessing_tutorial8.zip"
)

# Left as the cell's last expression so the call's return value is displayed.
fetch_archive_from_http(output_dir=doc_dir, url=s3_url)

True

In [7]:
from haystack.nodes import TextConverter, PDFToTextConverter, DocxToTextConverter


# Each converter call result is indexed, and only its first entry is kept.
# `converter` is rebound for every file type, so after this cell it refers to
# the DOCX converter.

# Plain text
converter = TextConverter(valid_languages=["en"], remove_numeric_tables=True)
txt_results = converter.convert(meta=None, file_path=f"{doc_dir}/classics.txt")
doc_txt = txt_results[0]

# PDF
converter = PDFToTextConverter(valid_languages=["en"], remove_numeric_tables=True)
pdf_results = converter.convert(meta=None, file_path=f"{doc_dir}/bert.pdf")
doc_pdf = pdf_results[0]

# DOCX -- unlike the two converters above, numeric tables are not removed here.
converter = DocxToTextConverter(valid_languages=["en"], remove_numeric_tables=False)
docx_results = converter.convert(meta=None, file_path=f"{doc_dir}/heavy_metal.docx")
doc_docx = docx_results[0]

In [8]:
from haystack.nodes import PreProcessor

# Baseline preprocessor used as the reference for the comparisons that follow.
preprocessor = PreProcessor(
    # cleaning options
    clean_whitespace=True,
    clean_empty_lines=True,
    clean_header_footer=False,
    # splitting options: 100-word pieces that respect sentence boundaries
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
)

# A single input document goes in; the number of resulting pieces is reported.
docs_default = preprocessor.process([doc_txt])
n_docs_output = len(docs_default)
print(f"n_docs_input: 1\nn_docs_output: {n_docs_output}")

Preprocessing:   0%|          | 0/1 [00:00<?, ?docs/s]

n_docs_input: 1
n_docs_output: 51


In [9]:
# Second preprocessor with sentence-boundary handling switched off.
# NOTE(review): only this one option is overridden; every other setting falls
# back to PreProcessor's defaults, which may not match the explicit settings
# used to build docs_default -- confirm whether a like-for-like comparison is
# intended.
preprocessor_nrsb = PreProcessor(split_respect_sentence_boundary=False)
docs_nrsb = preprocessor_nrsb.process([doc_txt])

# Show the last 50 characters of the first piece from each run.
end_text = docs_default[0].content[-50:]
end_text_nrsb = docs_nrsb[0].content[-50:]

print("RESPECTING SENTENCE BOUNDARY")
print(f'End of document: "...{end_text}"')
print()
print("NOT RESPECTING SENTENCE BOUNDARY")
print(f'End of document: "...{end_text_nrsb}"')

Preprocessing:   0%|          | 0/1 [00:00<?, ?docs/s]

RESPECTING SENTENCE BOUNDARY
End of document: "...rnerstone of a typical elite European education.

"

NOT RESPECTING SENTENCE BOUNDARY
End of document: "...xts used as part of a curriculum, both derive from"


In [10]:
# Short pieces (length 10) that overlap by 3, with sentence boundaries ignored,
# so consecutive pieces share some text.
preprocessor_sliding_window = PreProcessor(
    split_length=10,
    split_overlap=3,
    split_respect_sentence_boundary=False,
)
docs_sliding_window = preprocessor_sliding_window.process([doc_txt])

# Truncated previews of the first three pieces.
# NOTE(review): the first preview is capped at 200 characters while the other
# two are capped at 100 -- confirm whether the difference is intentional.
doc1 = docs_sliding_window[0].content[:200]
doc2 = docs_sliding_window[1].content[:100]
doc3 = docs_sliding_window[2].content[:100]

print(f'Document 1: "{doc1}..."')
print(f'Document 2: "{doc2}..."')
print(f'Document 3: "{doc3}..."')

Preprocessing:   0%|          | 0/1 [00:00<?, ?docs/s]

Document 1: "Classics or classical studies is the study of classical antiquity,..."
Document 2: "of classical antiquity, and in the Western world traditionally refers..."
Document 3: "world traditionally refers to the study of Classical Greek and..."


In [4]:
from haystack.nodes import PreProcessor
from haystack.utils import convert_files_to_docs

# Folder whose files are converted in one go.
doc_dir = "data/test_docs"
all_docs = convert_files_to_docs(dir_path=doc_dir)

# Cleaning plus 100-word, sentence-aware splitting, applied to every converted
# document at once.
preprocessor = PreProcessor(
    # cleaning options
    clean_whitespace=True,
    clean_empty_lines=True,
    clean_header_footer=False,
    # splitting options
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
)
docs = preprocessor.process(all_docs)

# Report how many documents went in and how many pieces came out.
n_files_input = len(all_docs)
n_docs_output = len(docs)
print(f"n_files_input: {n_files_input}\nn_docs_output: {n_docs_output}")

Preprocessing:   0%|          | 0/2 [00:00<?, ?docs/s]

n_files_input: 2
n_docs_output: 54
