In [18]:
import warnings
import json
warnings.simplefilter(action='ignore', category=FutureWarning)

### Model paths (local pretrained models)

In [19]:

# Local directories of the pretrained models, relative to the notebook's
# working directory. Both paths carry an explicit "./" prefix: they point to the
# same locations as before, but are written consistently and cannot be confused
# with a "namespace/model" identifier on a model hub.
embedding_model_path = "./pretrained-models/all-MiniLM-L6-v2"
ner_model_path = "./pretrained-models/distilbert-NER"

### 1 - NER using a rule-based parser

In [20]:
from src.rules_based_ner import  (extract_entities_rules_based,
                                  extract_text_from_docx)

In [21]:
# Read the term-sheet DOCX as text, then run the rule-based extractor on it.
# The extractor's return value is the last expression, so it is displayed as
# the cell output.
docx_path = "data/ZF4894_ALV_07Aug2026_physical.docx"
docx_text = extract_text_from_docx(docx_path)
extract_entities_rules_based(docx_text)

{'Counterparty': 'BANK ABC',
 'Initial Valuation Date': '31 January 2025',
 'Notional': 'EUR 1 million',
 'Valuation Date': '31 January 2025',
 'Maturity': '07 August 2026',
 'Underlying': 'Allianz SE (ISIN DE0008404005, Reuters: ALVG.DE)',
 'Coupon': '0%',
 'Barrier': '75.00% of Shareini  ',
 'Calendar': 'TARGET '}

### 2 - NER using an open-source (Hugging Face) model

In [22]:
from src.open_source_model_ner import (model_loader,
                                       read_text_file,
                                       extract_named_entities_open_source_model)

In [23]:
# Load the NER model stored at ner_model_path.
ner_pipeline = model_loader(ner_model_path)

# Read the plain-text term sheet and run the loaded model over it; the result
# is the last expression and is therefore displayed as the cell output.
txt_path = "data/FR001400QV82_AVMAFC_30Jun2028.txt"
text = read_text_file(txt_path)
extract_named_entities_open_source_model(ner_pipeline, text)

[{'entity_group': 'ORG', 'score': 0.877728, 'word': 'BANK ABC'}]

### 3 - NER for verbose and complex PDFs using a RAG approach

In [26]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from src.llm_and_rag_ner import (extract_named_entities,
                                retrieve_relevant_data,
                                load_pdf,
                                index_documents)

# Embedding model settings: run on CPU and do not normalise the embeddings.
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=False)

# Embedding model loaded from the local path configured earlier in the notebook.
embedding_model = HuggingFaceEmbeddings(
    model_name=embedding_model_path,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

In [27]:
# Indexing: load the PDF term sheet into chunks and index them with the
# embedding model. index_path is passed to index_documents and is reused by the
# retrieval cell to load the index back.
index_path = "./faiss-index"
pdf_path = "./data/BankABC_TermSheet_Template.pdf"
chunks = load_pdf(pdf_path)
index_documents(chunks, embedding_model, index_path)

Ignoring wrong pointing object 11 0 (offset 0)


starting indexation in FAISS
indexation terminated


In [28]:
# --- Retrieval ---
# Reload the FAISS index from index_path with the same embedding model.
# NOTE(review): allow_dangerous_deserialization=True opts in to loading
# serialized data from disk; keep it only for index files produced by this
# notebook, never for files from an untrusted source.
retriever = FAISS.load_local(
    index_path,
    embedding_model,
    allow_dangerous_deserialization=True,
)
query = """Consent of the Investor shall be required for any action that 
          (i) alters or changes the rights, preference or privileges of the 
          Preference Shares, (ii) increases or decreases the authorized 
          number of shares of equity or Preference Shares"""
aggragated_retrieved_text = retrieve_relevant_data(retriever, query)
print(aggragated_retrieved_text)

print("\n")

# --- Named-entity extraction ---
# Run the extraction on the retrieved text with the model named below and
# pretty-print the result as JSON (non-ASCII characters are kept as-is).
# NOTE(review): the recorded run of this cell ended with a ResponseError saying
# the model requires more system memory than was available.
llm_model_name = 'mistral'
entities = extract_named_entities(aggragated_retrieved_text, llm_model_name)
print(json.dumps(entities, indent=2, ensure_ascii=False))

5 to a vote of shareholders of the Company. Subject to the Protective Provisions, the Preference Shares and the equity shares will have one vote per share determined on an as-converted basis. Protective Provisions: Consent of the Investor shall be required for any action that (i) alters or changes
the rights, preference or privileges of the Preference Shares, (ii) increases or decreases the authorized number of shares of equity or Preference Shares, (iii) creates any new class or series of shares having rights, preference or privileges senior to or on a parity with any outstanding series of
including all shares warrants and employee options for equity shares granted), with rights of subscription as to any unsubscribed shares. Voluntary Conversion: The Investor shall have the right to convert the Preference Shares at any time, at its option, into equity shares and shall be subject to
drag along right being enforced by any other investor. Proportionate ownership: The Investor shall have 

ResponseError: model requires more system memory (5.5 GiB) than is available (5.3 GiB) (status code: 500)