In [8]:
from langchain_community.document_loaders import Docx2txtLoader
from transformers import pipeline
import re
import json

### 1 - NER using a rule-based parser

In [2]:
# Read a .docx file and extract its text.
def extract_text_from_docx(docx_path):
    """Return the stripped text of the first document loaded from ``docx_path``.

    The file is loaded with ``Docx2txtLoader``; only the first element of the
    loader's result is used, and leading/trailing whitespace is removed from
    its ``page_content``.
    """
    loaded_documents = Docx2txtLoader(docx_path).load()
    first_document = loaded_documents[0]
    return first_document.page_content.strip()

# Build the regexes for the entity labels we want to extract.
def format_entity_patterns(entity_names):
    """Return one regex string per label, in the same order as ``entity_names``.

    Each pattern is the regex-escaped label, followed by optional whitespace,
    a newline, and a ``(.+)`` capture group for the text that comes next.
    """
    value_on_next_line = r"\s*\n(.+)"
    label_patterns = []
    for label in entity_names:
        label_patterns.append(re.escape(label) + value_on_next_line)
    return label_patterns


# Apply the label regexes to extract the named entities.
def extract_entities(text,
                     entities_names=["Counterparty", "Initial Valuation Date", "Notional", "Valuation Date", "Maturity", "Underlying", "Coupon", "Barrier", "Calendar"],
                     entities_to_extract=["Party A", "Initial Valuation Date", "Notional Amount (N)", "Valuation Date", "Termination Date", "Underlying", "Coupon (C)", "Barrier (B)", "Business Day"]):
    """Extract the value that follows each label in ``text``.

    Args:
        text: Text to search.
        entities_names: Output keys, one per label, in the same order as
            ``entities_to_extract``.
        entities_to_extract: Labels as they appear in ``text``; each is turned
            into a regex by ``format_entity_patterns``.

    Returns:
        dict mapping each output key to the captured value with surrounding
        whitespace removed, or ``None`` when the label is not found.

    Raises:
        ValueError: if the two lists do not have the same length (``zip``
            would otherwise silently drop the unmatched tail).
    """
    # The default lists are only read, never mutated, so sharing them across
    # calls is safe.
    output_keys = list(entities_names)
    labels = list(entities_to_extract)
    if len(output_keys) != len(labels):
        raise ValueError(
            "entities_names and entities_to_extract must have the same length "
            f"(got {len(output_keys)} and {len(labels)})"
        )

    entities = {}
    for key, pattern in zip(output_keys, format_entity_patterns(labels)):
        first_match = None
        chosen = None
        for match in re.finditer(pattern, text):
            if first_match is None:
                first_match = match
            # A label can be the tail of a longer one ("Valuation Date" inside
            # "Initial Valuation Date"), so prefer the occurrence where the
            # label is the first non-blank text on its line.
            line_start = text.rfind("\n", 0, match.start()) + 1
            if not text[line_start:match.start()].strip():
                chosen = match
                break
        if chosen is None:
            # No line-initial occurrence: keep the first match anywhere.
            chosen = first_match
        entities[key] = chosen.group(1).strip() if chosen else None
    return entities

In [3]:
# Run the rule-based extractor on a sample term sheet. The bare last
# expression is what the notebook displays as this cell's output.
docx_text = extract_text_from_docx("data/ZF4894_ALV_07Aug2026_physical.docx")
extract_entities(docx_text)

{'Counterparty': 'BANK ABC',
 'Initial Valuation Date': '31 January 2025',
 'Notional': 'EUR 1 million',
 'Valuation Date': '31 January 2025',
 'Maturity': '07 August 2026',
 'Underlying': 'Allianz SE (ISIN DE0008404005, Reuters: ALVG.DE)',
 'Coupon': '0%',
 'Barrier': '75.00% of Shareini  ',
 'Calendar': 'TARGET '}

### 2 - NER using an open-source (Hugging Face) model

In [4]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import warnings
# Install a process-wide filter that hides FutureWarning messages.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [5]:
def read_text_file(file_path):
    """Return the full contents of ``file_path``, decoded as UTF-8."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents
    
def model_loader(model_path):
    """Build an ``"ner"`` pipeline from the model stored at ``model_path``.

    The tokenizer and the token-classification model are both loaded from the
    same path, and the pipeline is created with ``aggregation_strategy="first"``.
    """
    ner_tokenizer = AutoTokenizer.from_pretrained(model_path)
    return pipeline(
        "ner",
        model=AutoModelForTokenClassification.from_pretrained(model_path),
        tokenizer=ner_tokenizer,
        aggregation_strategy="first",
    )


def extract_named_entities_from_text(ner_pipeline, text):
    """Run ``ner_pipeline`` on ``text`` and keep three fields per prediction.

    Returns a list of dicts, one per prediction, containing only the
    ``entity_group``, ``score`` and ``word`` keys. A prediction missing any of
    these keys raises ``KeyError``.
    """
    kept_fields = ("entity_group", "score", "word")
    trimmed = []
    for prediction in ner_pipeline(text):
        trimmed.append({field: prediction[field] for field in kept_fields})
    return trimmed

In [6]:
# Build the NER pipeline from the model stored in a local directory.
model_path = "./pretrained-models/distilbert-NER"
ner_pipeline = model_loader(model_path)

# Read the plain-text term sheet and run the pipeline on it; the bare last
# expression is displayed as this cell's output.
text = read_text_file("data/FR001400QV82_AVMAFC_30Jun2028.txt")
extract_named_entities_from_text(ner_pipeline, text)

[{'entity_group': 'ORG', 'score': 0.877728, 'word': 'BANK ABC'}]

### 3 - NER for verbose and complex PDFs using a RAG approach

In [7]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from src.llm_and_rag_ner import (extract_named_entities,
                                retrieve_relevant_data,
                                load_pdf,
                                index_documents)

# Embedding model used for indexing and retrieval, loaded from a local
# directory and run on CPU.
# NOTE(review): this rebinds `model_path`, which an earlier cell set to the
# NER model directory — re-run that cell before reusing the name for NER.
model_path = "pretrained-models/all-MiniLM-L6-v2"
model_kwargs = {'device': 'cpu'}
# Passed through to the encoder; normalization of the embeddings is disabled.
encode_kwargs = {'normalize_embeddings': False}
embedding_model = HuggingFaceEmbeddings(
    model_name=model_path,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

In [None]:
# Indexing: load the PDF and index its chunks with the embedding model.
# `index_path` is reused by the retrieval cell, which loads the FAISS index
# from it — presumably `index_documents` writes the index there; confirm in
# src.llm_and_rag_ner.
index_path = "./faiss-index"
chunks = load_pdf("./data/BankABC_TermSheet_Template.pdf")
index_documents(chunks, embedding_model, index_path)

In [None]:
######## Retrieving #########
# Load the FAISS index saved under `index_path`. Despite its name, `retriever`
# is the FAISS vector store object returned by `load_local`.
# NOTE(review): `allow_dangerous_deserialization=True` opts in to loading
# serialized data (pickle, per the LangChain docs — confirm); only load index
# files produced by this project.
retriever = FAISS.load_local(index_path, embedding_model, allow_dangerous_deserialization=True)
# The query is a passage of term-sheet wording; the line breaks and leading
# spaces inside the triple-quoted string are part of the query text.
query = """Consent of the Investor shall be required for any action that 
          (i) alters or changes the rights, preference or privileges of the 
          Preference Shares, (ii) increases or decreases the authorized 
          number of shares of equity or Preference Shares"""
aggragated_retrieved_text = retrieve_relevant_data(retriever, query)
print(aggragated_retrieved_text)

print('\n')

######## NER using an LLM #########
# The second argument is the model name handed to `extract_named_entities`;
# the result is pretty-printed as JSON without escaping non-ASCII characters.
entities = extract_named_entities(aggragated_retrieved_text, 'mistral')
print(json.dumps(entities, indent=2, ensure_ascii=False))

Ignoring wrong pointing object 11 0 (offset 0)


starting indexation in FAISS
indexation terminated



[Document(page_content='5 to a vote of shareholders of the Company. Subject to the Protective Provisions, the Preference Shares and the equity shares will have one vote per share determined on an as-converted basis. Protective Provisions: Consent of the Investor shall be required for any action that (i) alters or changes'), Document(page_content='the rights, preference or privileges of the Preference Shares, (ii) increases or decreases the authorized number of shares of equity or Preference Shares, (iii) creates any new class or series of shares having rights, preference or privileges senior to or on a parity with any outstanding series of'), Document(page_content='including all shares warrants and employee options for equity shares granted), with rights of subscription as to any unsubscribed shares. Voluntary Conversion: The Investor shall have the right to convert the Preference Shares at any time, at its option, into equity share

In [11]:
####### Test with llama
# Sanity check of `extract_named_entities` on a short sentence using a
# different model name.
# NOTE(review): the output recorded for this cell lists values that do not
# occur in `text` (e.g. "$10 million", "Q1 2023") and an empty "ORG" list —
# either the output is stale or the model is not answering from the input;
# re-run and verify.
model='llama3.2'
text = "Elon Musk founded SpaceX in Los Angeles. Apple and Google are investing in France."
entities = extract_named_entities(text, model)
print(json.dumps(entities, indent=2, ensure_ascii=False))

NER performing
NER finished
{
  "ORG": [],
  "MONEY": [
    "$10 million"
  ],
  "DATE": [
    "Q1 2023",
    "January 2025"
  ],
  "INSTRUMENT": [
    "Convertible Preference Shares"
  ],
  "PERCENT": [
    "25% IRR"
  ],
  "SHAREHOLDER": [],
  "EXIT_STRATEGY": [],
  "LEGAL_TERM": []
}


In [3]:
import ollama

ModuleNotFoundError: No module named 'ollama'