Initialize tesseract for ocr

In [1]:
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r"C:/Program Files/Tesseract-OCR/tesseract.exe"
pytesseract.get_tesseract_version()



<Version('5.3.0.20221222')>

Preprocessing pipeline for any kind of file

In [2]:
from haystack.nodes import TextConverter, FileTypeClassifier, PDFToTextOCRConverter, MarkdownConverter, DocxToTextConverter, PreProcessor
from haystack.pipelines import Pipeline
from haystack.document_stores import FAISSDocumentStore

file_type_classifier = FileTypeClassifier()

text_converter = TextConverter()
pdf_converter = PDFToTextOCRConverter(valid_languages=["eng","pol"],remove_numeric_tables=True)
md_converter = MarkdownConverter()
docx_converter = DocxToTextConverter()

document_store = FAISSDocumentStore(embedding_dim=384, faiss_index_factory_str="Flat")

preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
    split_overlap=0,
    add_page_number=True,

)

# This is an indexing pipeline
p = Pipeline()

p.add_node(component=file_type_classifier, name="FileTypeClassifier", inputs=["File"])

p.add_node(component=text_converter, name="TextConverter", inputs=["FileTypeClassifier.output_1"])
p.add_node(component=pdf_converter, name="PdfConverter", inputs=["FileTypeClassifier.output_2"])
p.add_node(component=md_converter, name="MarkdownConverter", inputs=["FileTypeClassifier.output_3"])
p.add_node(component=docx_converter, name="DocxConverter", inputs=["FileTypeClassifier.output_4"])

p.add_node(
    component=preprocessor,
    name="Preprocessor",
    inputs=["TextConverter", "PdfConverter", "MarkdownConverter", "DocxConverter"],
)

  from .autonotebook import tqdm as notebook_tqdm


Pure slides location

In [3]:
files = open("da_slides.txt").read().splitlines()
files

['decision_analysis/da-lec1.pdf',
 'decision_analysis/da-lec2.pdf',
 'decision_analysis/da-lec3.pdf',
 'decision_analysis/da-lec4.pdf',
 'decision_analysis/da-lec5.pdf',
 'decision_analysis/da-lec6.pdf',
 'decision_analysis/da-lec7.pdf',
 'decision_analysis/da-lec8.pdf',
 'decision_analysis/da-lec9.pdf',
 'decision_analysis/da-lec10.pdf',
 'decision_analysis/da-lec11.pdf',
 'decision_analysis/da-lec12.pdf']

Document metadata lacks information about file origin, add it

In [4]:
def write_filename_metadata(documents, file_name):
    for doc in documents:
        doc.meta["file_name"] = file_name
    return documents

Run OCR and preprocessing on all slides

In [5]:
for file in files:
    result = p.run(file_paths=file)
    documents = write_filename_metadata(result["documents"], file)
    document_store.write_documents(result["documents"])

Converting files: 100%|██████████| 1/1 [00:56<00:00, 56.29s/it]
Preprocessing: 100%|██████████| 1/1 [00:00<00:00,  7.03docs/s]
Writing Documents: 10000it [00:00, 52661.23it/s]         
Converting files: 100%|██████████| 1/1 [01:00<00:00, 60.20s/it]
Preprocessing: 100%|██████████| 1/1 [00:00<00:00,  6.81docs/s]
Writing Documents: 10000it [00:00, 45668.19it/s]         
Converting files: 100%|██████████| 1/1 [00:53<00:00, 53.55s/it]
Preprocessing: 100%|██████████| 1/1 [00:00<00:00,  6.49docs/s]
Writing Documents: 10000it [00:00, 58000.95it/s]         
Converting files: 100%|██████████| 1/1 [01:08<00:00, 68.11s/it]
Preprocessing: 100%|██████████| 1/1 [00:00<00:00,  5.06docs/s]
Writing Documents: 10000it [00:00, 40668.65it/s]         
Converting files: 100%|██████████| 1/1 [00:58<00:00, 58.43s/it]
Preprocessing: 100%|██████████| 1/1 [00:00<00:00,  6.34docs/s]
Writing Documents: 10000it [00:00, 63515.03it/s]         
Converting files: 100%|██████████| 1/1 [01:01<00:00, 61.09s/it]
Preprocessi

Fetch transcriptions, transform into documents, add metadata about origin

In [6]:
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
    split_overlap=0,
    add_page_number=True,

)

In [7]:
files = open("da_filepaths.txt").read().splitlines()
files

['decision_analysis/da-lec1-notes.pdf',
 'decision_analysis/da-lec2-notes.pdf',
 'decision_analysis/da-lec3-notes.pdf',
 'decision_analysis/da-lec4-notes.pdf',
 'decision_analysis/da-lec5-notes.pdf',
 'decision_analysis/da-lec6-notes.pdf',
 'decision_analysis/da-lec7-notes.pdf',
 'decision_analysis/da-lec8-notes.pdf',
 'decision_analysis/da-lec9-notes.pdf',
 'decision_analysis/da-lec10-notes.pdf',
 'decision_analysis/da-lec11-notes.pdf',
 'decision_analysis/da-lec12-notes.pdf']

In [8]:
# import document class from haystack
from haystack import Document
import lectures_format
da_lectures = lectures_format.DecisionAnalysisLecture(files)
documents = []
for i,lecture in enumerate(da_lectures.lectures):
    file_name = files[i]
    transcriptions = da_lectures.find_transcriptions(lecture)
    for j,transcription in enumerate(transcriptions):
        documents.append(Document(transcription,meta={"file_name":file_name,"page_number":j}))
        

In [9]:
new_docs = preprocessor.process(documents)
len(new_docs)
document_store.write_documents(new_docs)

Preprocessing: 100%|██████████| 496/496 [00:00<00:00, 1569.13docs/s]
Writing Documents: 10000it [00:03, 2524.04it/s]            


In [10]:
document_store.save("decision_analysis.faiss")

Run embedding model for all the data we gathered

In [12]:
from haystack.nodes import DensePassageRetriever

retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model='sentence-transformers/multi-qa-MiniLM-L6-cos-v1',
    passage_embedding_model='sentence-transformers/multi-qa-MiniLM-L6-cos-v1',
)

document_store.update_embeddings(retriever)

Downloading (…)okenizer_config.json: 100%|██████████| 383/383 [00:00<00:00, 174kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 402kB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 486kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 52.5kB/s]
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
Downloading (…)lve/main/config.json: 100%|██████████| 612/612 [00:00<00:00, 245kB/s]


In [13]:
document_store.save("decision_analysis.faiss")

In [14]:
from haystack.utils import print_documents
from haystack.pipelines import DocumentSearchPipeline

p_retrieval = DocumentSearchPipeline(retriever)
res = p_retrieval.run(query="what is swing method ?", params={"Retriever": {"top_k": 3}})
print_documents(res, max_text_len=200)


Query: what is swing method ?

{   'content': 'It is enough to look at the explanation given at the bottom. '
               'In the SWING method, we smartly ask for comprehensive values '
               'of alternatives that are equal to the weights of individual '
               'criteria. Therefor...',
    'name': None}

{   'content': 'To determine the criteria weights, many techniques exist, but '
               'one that has gained great popularity is called the SWING '
               'method. It requires creating n+1 ﬁctive alternatives, where n '
               'is the number of cr...',
    'name': None}

{   'content': 'Having understood the bisection and SWING methods, you can '
               'intuitively feel that the preference information they require '
               'is rather demanding. Also, other methods you already know, '
               'such as ELECTRE or P...',
    'name': None}



In [15]:
from haystack.nodes import Seq2SeqGenerator


generator = Seq2SeqGenerator(model_name_or_path="vblagoje/bart_lfqa")


  return self.fget.__get__(instance, owner)()


In [16]:
from haystack.pipelines import GenerativeQAPipeline

pipe = GenerativeQAPipeline(generator, retriever)


In [19]:
answers = pipe.run(
    query="what is a fictive alternative?", params={"Retriever": {"top_k": 3}}
)


In [20]:
answers

{'query': 'what is a fictive alternative?',
 'answers': [<Answer {'answer': "A fictive alternative is a non-ideal alternative. For example, let's say you want to build a house, but you don't know how to build it. You have two options: 1. You can build it by yourself, or you can hire someone to do it for you. 2. You could hire someone else to build the house for you, and then you can pay them to do the work. If you choose the first option, the house will be built by yourself. If the second option is built by someone else, you'll have to pay them for doing the work, and you won't have any money left over to spend on your own house.", 'type': 'generative', 'score': None, 'context': None, 'offsets_in_document': None, 'offsets_in_context': None, 'document_ids': ['79dc4768404f447e13a4435004bc7e09', 'e536e4b6d41dbe4c8e80de811eb7fc36', 'd82e502dfb1b80189182a3cc716732e2'], 'meta': {'doc_scores': [0.6779102613036477, 0.6732210293515595, 0.6710447713796783], 'content': ['The pessimistic rule is e