Skip to content

Commit

Permalink
[NeuralChat] Retrieval pdf figure to text (#1264)
Browse files Browse the repository at this point in the history
* Retrieval pdf figure to text 

Signed-off-by: Liangyx2 <yuxiang.liang@intel.com>
  • Loading branch information
Liangyx2 committed Feb 22, 2024
1 parent 47a7280 commit d6a66b3
Show file tree
Hide file tree
Showing 5 changed files with 42 additions and 18 deletions.
3 changes: 2 additions & 1 deletion intel_extension_for_transformers/neural_chat/chatbot.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ def check_cache_dependency():

def check_retrieval_dependency():
try:
importlib.import_module('PyPDF2')
importlib.import_module('fitz')
importlib.import_module('easyocr')
importlib.import_module('langchain')
importlib.import_module('langchain_core')
importlib.import_module('docx')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,16 @@
# limitations under the License.

import unicodedata
import PyPDF2
import pandas as pd
import re, json
from langchain.document_loaders import UnstructuredMarkdownLoader
from docx import Document as DDocument
from bs4 import BeautifulSoup

import fitz
import easyocr
from PIL import Image
import numpy as np
import io

def uni_pro(text):
"""Check if the character is ASCII or falls in the category of non-spacing marks."""
Expand All @@ -36,14 +39,34 @@ def uni_pro(text):

def read_pdf(pdf_path):
    """Read a PDF file and return its text, including text found in its images.

    For every page, the embedded text layer is extracted with PyMuPDF
    (``fitz``). Each image referenced by the page is then decoded and passed
    through EasyOCR (English model) so that text rendered inside figures is
    captured as well. Every non-empty piece of text is terminated with a '.'
    unless it already ends with '.', '!' or '?'.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        One string made of the page texts and OCR texts, concatenated in
        page order (page text first, then that page's images).
    """
    sentence_endings = ('.', '!', '?')
    pieces = []
    # The OCR reader is created lazily: building it loads the recognition
    # model, which is wasted work for PDFs that contain no images.
    reader = None

    # Use the document as a context manager so the file is closed even if
    # text extraction or OCR raises.
    with fitz.open(pdf_path) as doc:
        for page_index in range(doc.page_count):
            page_text = doc.load_page(page_index).get_text().strip()
            if page_text:
                if not page_text.endswith(sentence_endings):
                    page_text += '.'
                pieces.append(page_text)

            for image_info in doc.get_page_images(page_index):
                if not image_info:
                    continue
                if reader is None:
                    reader = easyocr.Reader(['en'])

                # The first item of an image entry is its xref number.
                xref = image_info[0]
                image_bytes = doc.extract_image(xref)['image']
                with Image.open(io.BytesIO(image_bytes)) as pil_image:
                    pixels = np.array(pil_image)

                ocr_lines = reader.readtext(pixels, paragraph=True, detail=0)
                image_text = ', '.join(ocr_lines).strip()
                if not image_text:
                    # Nothing recognised: do not add a stray '.' to the output.
                    continue
                if not image_text.endswith(sentence_endings):
                    image_text += '.'
                pieces.append(image_text)

    return ''.join(pieces)


def read_html(html_path):
Expand Down Expand Up @@ -193,8 +216,8 @@ def load_unstructured_data(input):
elif input.endswith("md"):
text = read_md(input)

text = text.replace('\n', '')
text = text.replace('\n\n', '')
text = text.replace('\n', ' ')
text = text.replace('\n\n', ' ')
text = uni_pro(text)
text = re.sub(r'\s+', ' ', text)
return text
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
import multiprocessing
import urllib3
import langid
import PyPDF2
from bs4 import BeautifulSoup
import os
import re
Expand Down Expand Up @@ -193,4 +192,4 @@ def load_html_data(url):

# {'text': all_text, 'main_content': main_content}

return main_content
return main_content
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
beautifulsoup4
chromadb==0.4.15
easyocr
InstructorEmbedding
langchain==0.0.354
langchain_core==0.1.18
langid
markdown
openpyxl
PyPDF2
PyMuPDF
python-docx
qdrant-client
scikit-learn
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ datasets
deepface
diffusers==0.12.1
dlib-bin
easyocr
einops
evaluate
exifread
Expand Down Expand Up @@ -34,7 +35,6 @@ librosa
markdown
neural-compressor
neural_speed
neural_speed
num2words
numba
numpy==1.23.5
Expand All @@ -50,8 +50,8 @@ peft==0.6.2
protobuf==3.20.2
pydantic==1.10.13
pydub
PyMuPDF
pymysql
PyPDF2
python-docx
python-multipart
pyyaml
Expand Down

0 comments on commit d6a66b3

Please sign in to comment.