In [19]:
import os
from PIL import Image
import pypdfium2 as pdfium
from io import BytesIO
from easyocr import Reader

In [13]:
! pip install easyocr


Collecting easyocr
  Using cached easyocr-1.7.1-py3-none-any.whl (2.9 MB)
Collecting torchvision>=0.5 (from easyocr)
  Downloading torchvision-0.17.2-cp38-cp38-manylinux1_x86_64.whl (6.9 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:01[0m
[?25hCollecting opencv-python-headless (from easyocr)
  Using cached opencv_python_headless-4.9.0.80-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (49.6 MB)
Collecting scikit-image (from easyocr)
  Using cached scikit_image-0.21.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.9 MB)
Collecting python-bidi (from easyocr)
  Using cached python_bidi-0.4.2-py2.py3-none-any.whl (30 kB)
Collecting Shapely (from easyocr)
  Using cached shapely-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.5 MB)
Collecting pyclipper (from easyocr)
  Using cached pyclipper-1.3.0.post5-cp38-cp38-manylinux_2_5_x86_6

In [17]:
def convert_pdf_to_images(file_path, scale=300/72):
    """Render every page of a PDF and return each page as JPEG-encoded bytes.

    Args:
        file_path: Location of the PDF, handed to ``pdfium.PdfDocument``.
        scale: Render scale forwarded to pdfium. The default is 300/72
            (presumably 300 DPI against a 72-points-per-inch base - confirm).

    Returns:
        A list with one single-entry dict per page, ``{page_index: jpeg_bytes}``,
        in page order.
    """
    document = pdfium.PdfDocument(file_path)
    page_numbers = list(range(len(document)))

    rendered_pages = document.render(
        pdfium.PdfBitmap.to_pil,
        page_indices=page_numbers,
        scale=scale,
    )

    pages_as_jpeg = []
    for page_number, page_image in zip(page_numbers, rendered_pages):
        # Encode in memory; nothing is written to disk.
        buffer = BytesIO()
        page_image.save(buffer, format='jpeg', optimize=True)
        pages_as_jpeg.append({page_number: buffer.getvalue()})

    return pages_as_jpeg

# NOTE(review): this rebinds the name `convert_pdf_to_images` from the function
# to its return value (the list of per-page dicts). After this line the function
# is no longer callable under that name until its `def` is executed again.
# The OCR cell further down passes this name as the list of images, so the
# binding is intentionally left unchanged here.
convert_pdf_to_images = convert_pdf_to_images('fund-factsheet-for-september-2023.pdf')

In [20]:
# Module-level EasyOCR reader configured for English ("en"); it is created once
# here and used as a global by extract_text_with_easyocr.
language_reader = Reader(["en"])
def extract_text_with_easyocr(list_dict_final_images):
    """OCR a sequence of encoded page images and return the combined text.

    Args:
        list_dict_final_images: Iterable of dicts; the first value of each
            dict is taken as the encoded image bytes for one page.

    Returns:
        One string: for every page the second element of each
        ``language_reader.readtext`` result is joined with newlines, and the
        pages themselves are joined with newlines as well.
    """
    # Pull out the raw bytes of every page up front, before any OCR runs.
    page_payloads = [tuple(entry.values())[0] for entry in list_dict_final_images]

    def _ocr_page(payload):
        # Decode the bytes with PIL and keep only element [1] of each result.
        detections = language_reader.readtext(Image.open(BytesIO(payload)))
        return "\n".join(detection[1] for detection in detections)

    return "\n".join(_ocr_page(payload) for payload in page_payloads)

# Run OCR over the rendered pages. At this point `convert_pdf_to_images` holds
# the list produced by the earlier call (the name was rebound), not the function.
text_with_easy_ocr = extract_text_with_easyocr(convert_pdf_to_images)
print(text_with_easy_ocr)

THE
0ICICI
PRUJENTIAL
PRUEENT
MUTUAL
FUND
FACT SHEET
September 30,.2023
Build your investments with the
growing manufacturing theme
Invest in
ICICI Prudential
Manufacturing Fund
To invest; contact your Mutual Fund Distributor
IPRUTOUCH
WWWI
iciciprumf.com
ICICI Prudential Manufacturing Fund (An Open Ended Equity Scheme following manufacturing theme:)
is suitable for investors who are seeking*=
Investors
Long term wealth creation
understand that
An open ended equity scheme that aims to provide capital appreciation by investing in equity and
%6,
their principal
8
equity related securities of companies engaged in manufacturing theme:
g
will be at
3
3
High risk
#Investors should consult their financial advisers if in doubt about whether the product is suitable for them:
The Risk-o-meter specified above will be evaluated and updated on a monthly basis Please refer https:IIwww icicipruamccom/news-and-updateslall-news
for more details on scheme riskometers_
Mutual Fund investments are subject

As the output above shows, OCR alone cannot extract the text in the desired format: the page layout is lost and some words are misrecognized. We should therefore use multimodal tools that can recognize the different entities on a page (descriptions, tables, etc.) and extract them separately. Models such as Table Transformer or Flamingo could be used for this.

In [22]:
#Chunking fund data

def chunkByFundDetails(data):
    """Group rows by their 'Fund Name' value.

    Args:
        data: Iterable of row mappings, each with a 'Fund Name' key.

    Returns:
        Dict mapping each fund name to the list of its rows, with funds and
        rows kept in the order they were first encountered.
    """
    grouped = {}
    for record in data:
        # setdefault creates the bucket the first time a fund name is seen.
        grouped.setdefault(record['Fund Name'], []).append(record)
    return grouped

def chunkByFundPerformance(data):
    """Split each row into a performance record and a fund-manager record.

    Args:
        data: Iterable of row mappings with the keys 'Fund Name', 'Date',
            'Returns' and 'Fund Manager'.

    Returns:
        A tuple ``(performance_records, manager_records)`` of two lists, each
        holding one dict per input row in input order.
    """
    performance_records, manager_records = [], []
    for record in data:
        name = record['Fund Name']
        performance_records.append({
            'Fund Name': name,
            'Performance Data': {
                'Date': record['Date'],
                'Returns': record['Returns'],
            },
        })
        manager_records.append({
            'Fund Name': name,
            'Fund Manager': record['Fund Manager'],
        })
    return performance_records, manager_records

In [None]:
#Query Analysis using topic modelling

import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# word_tokenize needs the Punkt tokenizer data. Older NLTK releases load the
# 'punkt' resource, while recent releases load 'punkt_tab' instead, so request
# both; a resource unknown to the installed version is reported, not raised.
nltk.download('punkt')
nltk.download('punkt_tab')

def preprocess_text_nltk(text):
    """Lower-case ``text`` and split it into tokens with NLTK's ``word_tokenize``.

    Args:
        text: Input string.

    Returns:
        The list of tokens returned by ``word_tokenize``.
    """
    lowered = text.lower()
    return word_tokenize(lowered)

def extract_features_nltk(text_data):
    """Build a bag-of-words matrix for ``text_data`` using the NLTK tokenizer.

    Args:
        text_data: Iterable of documents (strings) to vectorize.

    Returns:
        A tuple ``(X, feature_names)`` where ``X`` is the document-term matrix
        from ``CountVectorizer.fit_transform`` and ``feature_names`` is the
        list of vocabulary terms in column order.
    """
    vectorizer = CountVectorizer(tokenizer=preprocess_text_nltk)
    X = vectorizer.fit_transform(text_data)
    # get_feature_names() was removed from scikit-learn; prefer the replacement
    # and fall back only on old versions that do not provide it yet.
    if hasattr(vectorizer, "get_feature_names_out"):
        # .tolist() keeps the return type a plain list of str, as before.
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()
    return X, feature_names


def AnalyseQuery(query):
    """Classify a free-text query as ``"top_funds"`` or ``"fund_manager"``.

    A two-topic LDA model is fitted on a two-phrase reference corpus and the
    query is assigned to the topic with the highest weight.

    Args:
        query: Raw query string.

    Returns:
        ``"top_funds"`` if the dominant topic index is 0, ``"fund_manager"`` if
        it is 1, otherwise ``None``.
    """
    query_corpus = [
        "top performing mutual funds",
        "fund manager"
    ]
    # Fit the vectorizer here: extract_features_nltk does not return its
    # vectorizer, and the same fitted vocabulary is needed to transform the
    # query below (the previous code referenced an undefined `vectorizer`).
    vectorizer = CountVectorizer(tokenizer=preprocess_text_nltk)
    X = vectorizer.fit_transform(query_corpus)

    n_topics = 2  # Assuming 2 main topics: top_funds, fund_manager
    lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    lda_model.fit(X)

    tokens = preprocess_text_nltk(query)
    query_vec = lda_model.transform(vectorizer.transform([' '.join(tokens)]))
    dominant_topic = query_vec.argmax()

    # Generate response based on topic
    # NOTE(review): LDA topics are learned without labels, so treating index 0
    # as "top_funds" and index 1 as "fund_manager" is an assumption about the
    # fitted model - verify against real queries.
    if dominant_topic == 0:
        return "top_funds"
    elif dominant_topic == 1:
        print("Query pertains to fund manager.")
        return "fund_manager"
    else:
        print("Unknown query topic.")
        return None

In [None]:
# Retrieval

def retrieve_top_performing_funds(chunks, top_n=3):
    """Rank funds by their average return and keep the best ones.

    Args:
        chunks: Mapping of fund name to a list of rows; each row's 'Returns'
            value must be convertible with ``float``.
        top_n: How many funds to return. Defaults to 3.

    Returns:
        A list of ``(fund_name, average_return)`` tuples sorted by average
        return, highest first, with at most ``top_n`` entries. Funds with no
        rows are skipped because they have no average.
    """
    all_funds_performance = {}
    for fund, data in chunks.items():
        returns = [float(row['Returns']) for row in data]
        if not returns:
            # Avoid dividing by zero for a fund without any rows.
            continue
        all_funds_performance[fund] = sum(returns) / len(returns)
    # Sort on the average (second tuple element); a lambda avoids relying on
    # the `operator` module, which this notebook never imports.
    sorted_funds = sorted(
        all_funds_performance.items(),
        key=lambda item: item[1],
        reverse=True,
    )
    return sorted_funds[:top_n]

def retrieve_fund_manager(chunks):
    """Look up the fund manager of every fund.

    Args:
        chunks: Mapping of fund name to a non-empty list of rows; the first
            row of each list must carry a 'Fund Manager' key.

    Returns:
        Dict mapping each fund name to the 'Fund Manager' value of its first row.
    """
    return {fund: rows[0]['Fund Manager'] for fund, rows in chunks.items()}

def retrieve_information(query, chunks):
    """Classify ``query`` and run the matching retrieval over ``chunks``.

    Args:
        query: Raw query string, classified with ``AnalyseQuery``.
        chunks: Mapping of fund name to its rows, passed to the retrieval
            function that matches the query type.

    Returns:
        The result of ``retrieve_top_performing_funds`` for a "top_funds"
        query, the result of ``retrieve_fund_manager`` for a "fund_manager"
        query, and ``None`` for any other classification.
    """
    # Classify once: AnalyseQuery fits a model and may print on every call,
    # so evaluating it per branch repeats that work and its output.
    query_type = AnalyseQuery(query)
    if query_type == "top_funds":
        return retrieve_top_performing_funds(chunks)
    if query_type == "fund_manager":
        return retrieve_fund_manager(chunks)
    return None



In [None]:

# Response Generation
def generate_response(query_type, retrieved_info):
    """Format retrieved data as a human-readable answer.

    Args:
        query_type: "top_funds", "fund_manager", or anything else.
        retrieved_info: For "top_funds", an iterable of ``(fund, returns)``
            pairs; for "fund_manager", a mapping of fund to manager. Not read
            for any other query type.

    Returns:
        A header line followed by one "- fund: value" line per entry, or a
        fallback apology when the query type is not recognised.
    """
    if query_type == "top_funds":
        header = "The top 3 performing mutual funds are:"
        return header + "".join(
            f"\n- {fund}: {returns}%" for fund, returns in retrieved_info
        )

    if query_type == "fund_manager":
        header = "Fund managers for the specified funds are:"
        return header + "".join(
            f"\n- {fund}: {manager}" for fund, manager in retrieved_info.items()
        )

    return "Sorry, I couldn't understand the query."

In [None]:
def main():
    """Run the demo pipeline: load rows, chunk them, answer two sample queries.

    Prints one generated response per query.
    """
    # Load the data extracted from the PDF into CSV files.
    # NOTE(review): load_data_from_csv is not defined in the cells visible
    # here - presumably it returns a list of row dicts with the keys
    # 'Fund Name', 'Date', 'Returns' and 'Fund Manager'; confirm.
    funds_data = load_data_from_csv('funds_data.csv')

    # Chunk the document
    fundChunks = chunkByFundDetails(funds_data)
    # These flat lists are produced for completeness; the retrieval helpers
    # below work on the per-fund mapping in fundChunks.
    fundPerformanceChunks, fundManagerChunks = chunkByFundPerformance(funds_data)

    # Example queries
    queries = [
        "What are the top performing mutual funds?",
        "Who is the fund manager of ICICI BlueChip Fund?"
    ]

    for query in queries:
        # Step 2: Analyze the query
        query_type = AnalyseQuery(query)

        # Step 3: Retrieve information based on query type.
        # Dispatch on the label computed above instead of handing the label
        # back to retrieve_information (which expects the raw query text), and
        # pass fundChunks because both helpers iterate `chunks.items()`.
        if query_type == "top_funds":
            retrieved_info = retrieve_top_performing_funds(fundChunks)
        elif query_type == "fund_manager":
            retrieved_info = retrieve_fund_manager(fundChunks)
        else:
            retrieved_info = None

        # Step 4: Generate response
        response = generate_response(query_type, retrieved_info)
        print(response)

# Entry point: call main() when this code runs as the top-level module
# (`__name__` is "__main__"), which is also the case when the cell is executed
# in a notebook kernel.
if __name__ == "__main__":
    main()
