In [1]:
import transformers
from transformers import pipeline
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from nltk.tokenize import word_tokenize
from skimage.filters import threshold_otsu
import cv2
import numpy as np
import gradio as gr
import magic
from PyPDF2 import PdfFileReader
from docx import Document
from pptx import Presentation
from PIL import Image
import pytesseract
from pdf2image import convert_from_path
from heapq import *
import warnings
 
# Silence DeprecationWarning and UserWarning messages for the whole session.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Load the TrOCR processor/model and the summarization pipeline from local
# directories (relative to the working directory); these globals are used by
# extract_handwritten() and summarize_text() below.
processor = TrOCRProcessor.from_pretrained('ocr_processor')
model = VisionEncoderDecoderModel.from_pretrained('ocr_model')
summarizer=pipeline("summarization",model="text_summarizer")
# Point pytesseract at a Tesseract executable given as a relative Windows path.
pytesseract.pytesseract.tesseract_cmd = r"Tesseract-OCR\tesseract.exe"

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [2]:
def horizontal_projections(sobel_image):
    """Sum the image along axis 1, giving one total per row."""
    row_totals = np.sum(sobel_image, axis=1)
    return row_totals

def sauvola_thresholding(image, window_size=25, k=0.4, R=128):
    """Binarize an image with a Sauvola-style local threshold and invert the result.

    For every pixel the local mean and standard deviation are estimated with a
    ``window_size`` x ``window_size`` box filter, and the threshold is
    ``mean * (1 + k * (stddev / R - 1))``.

    Args:
        image: NumPy image array; it is converted to float64 internally.
        window_size: Side length of the square box-filter window.
        k: Weight of the standard-deviation term in the threshold.
        R: Value the standard deviation is divided by in the threshold.

    Returns:
        uint8 array of the same shape: 0 where the pixel is above its local
        threshold, 255 elsewhere.
    """
    image = image.astype(np.float64)
    window = (window_size, window_size)

    # Local first and second moments.
    mean = cv2.boxFilter(image, cv2.CV_64F, window)
    sq_mean = cv2.boxFilter(image**2, cv2.CV_64F, window)

    # E[x^2] - E[x]^2 can come out slightly negative in flat regions because
    # of floating-point rounding; clamp it so sqrt() never produces NaN (a NaN
    # threshold makes every comparison below False).
    variance = sq_mean - mean**2
    variance[variance < 0] = 0
    stddev = np.sqrt(variance)

    threshold = mean * (1 + k * ((stddev / R) - 1))

    binary_image = (image > threshold).astype(np.uint8) * 255
    return 255 - binary_image

def heuristic(a, b):
    """Return the squared Euclidean distance between two 2-element points."""
    delta_first = b[0] - a[0]
    delta_second = b[1] - a[1]
    return delta_first ** 2 + delta_second ** 2

def astar(array, start, goal):
    """Search a grid for a path from ``start`` to ``goal`` with A*.

    Movement is 8-connected. Cells whose value equals 1 are treated as walls,
    and positions outside ``array.shape`` are never visited. Step costs and
    the goal estimate both come from ``heuristic`` (squared distance).

    Args:
        array: 2-D grid exposing ``.shape`` and ``array[row][col]`` indexing.
        start: ``(row, col)`` tuple where the search begins.
        goal: ``(row, col)`` tuple to reach.

    Returns:
        List of ``(row, col)`` tuples ordered from ``goal`` back towards
        ``start`` (``start`` itself is not included). An empty list is
        returned when no path exists or when ``start == goal``.
    """
    neighbors = [(0,1),(0,-1),(1,0),(-1,0),(1,1),(1,-1),(-1,1),(-1,-1)]
    close_set = set()
    came_from = {}
    gscore = {start: 0}
    fscore = {start: heuristic(start, goal)}
    oheap = []
    # Number of entries each node currently has in the heap. This replaces
    # rebuilding a list of every heap entry for each neighbour, giving an O(1)
    # "is it queued?" test with the same answer (a node can be queued twice).
    open_counts = {start: 1}
    heappush(oheap, (fscore[start], start))

    while oheap:
        current = heappop(oheap)[1]
        remaining = open_counts[current] - 1
        if remaining:
            open_counts[current] = remaining
        else:
            del open_counts[current]

        if current == goal:
            # Walk the parent links back from the goal.
            data = []
            while current in came_from:
                data.append(current)
                current = came_from[current]
            return data

        close_set.add(current)
        for i, j in neighbors:
            neighbor = current[0] + i, current[1] + j

            # Skip positions outside the grid and wall cells.
            if not (0 <= neighbor[0] < array.shape[0] and 0 <= neighbor[1] < array.shape[1]):
                continue
            if array[neighbor[0]][neighbor[1]] == 1:
                continue

            tentative_g_score = gscore[current] + heuristic(current, neighbor)

            if neighbor in close_set and tentative_g_score >= gscore.get(neighbor, 0):
                continue

            # Never-seen neighbours have no gscore (default 0), so they are
            # admitted by the "not currently queued" clause.
            if tentative_g_score < gscore.get(neighbor, 0) or neighbor not in open_counts:
                came_from[neighbor] = current
                gscore[neighbor] = tentative_g_score
                fscore[neighbor] = tentative_g_score + heuristic(neighbor, goal)
                heappush(oheap, (fscore[neighbor], neighbor))
                open_counts[neighbor] = open_counts.get(neighbor, 0) + 1
    return []

def find_peak_regions(hpp, threshold):
    """Collect the entries of ``hpp`` whose value is strictly below ``threshold``.

    Returns a list of ``[index, value]`` pairs in their original order.
    """
    return [[row, value] for row, value in enumerate(hpp) if value < threshold]

def get_binary(img):
    """Binarize ``img`` with Otsu's threshold.

    An image whose mean is exactly 0.0 or 1.0 is returned unchanged. Otherwise
    the result is an integer array holding 1 where the pixel is at or below the
    Otsu threshold and 0 elsewhere.
    """
    mean_level = np.mean(img)
    nothing_to_do = (mean_level == 0.0) or (mean_level == 1.0)
    if nothing_to_do:
        return img

    otsu_level = threshold_otsu(img)
    return (img <= otsu_level) * 1

def edges_detect(img_gray):
    """Return the Canny edge map of a lightly blurred copy of the input.

    The image is smoothed with a 3x3 Gaussian blur and then passed to
    ``cv2.Canny`` with thresholds 100 and 200.

    The Sobel derivatives that used to be computed here were never used in
    the result, so they have been removed; the returned edge map is unchanged.
    """
    img_blur = cv2.GaussianBlur(img_gray, (3, 3), 0)
    return cv2.Canny(image=img_blur, threshold1=100, threshold2=200)

def image_preprocess(img):
    """Build the binary map used to locate text lines.

    Steps: 3x3 Gaussian blur -> ``edges_detect`` -> dilation with a 7x7
    rectangular kernel -> ``sauvola_thresholding`` -> inverted Otsu threshold.

    Args:
        img: Image array as loaded by OpenCV.

    Returns:
        The thresholded image produced by the final ``cv2.threshold`` call.
    """
    # NOTE(review): a grayscale copy used to be computed here but was never
    # used; edge detection has always run on the blurred input itself, and
    # that behaviour is kept. The unused conversion was dropped.
    blurred_image = cv2.GaussianBlur(img, (3, 3), 0)

    # Thicken the detected edges so strokes of one text line merge together.
    edges = edges_detect(blurred_image)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (7, 7))
    dilated_image = cv2.dilate(edges, kernel, iterations=1)

    # Local thresholding followed by a global inverted Otsu threshold.
    binary_image = sauvola_thresholding(dilated_image)
    _, binary_image = cv2.threshold(binary_image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    return binary_image

def extract_handwritten_text_images(filepath):
    """Split a handwritten page image into horizontal strips and OCR each one.

    The page is preprocessed, its per-row projection is computed, and runs of
    consecutive rows whose projection is below half the projection range are
    grouped. Inside each group an A* path is traced from the left edge to the
    right edge of the Otsu-binarized image; the lowest row touched by that
    path becomes a cut row. The original image is sliced between successive
    cut rows and every slice is passed to ``extract_handwritten``.

    Args:
        filepath: Path of the image file to read with ``cv2.imread``.

    Returns:
        The recognised text; each strip's text is preceded by a single space.

    Raises:
        ValueError: If the file cannot be read as an image.
    """
    img = cv2.imread(filepath)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise ValueError(f"Unable to read image file: {filepath}")
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    original_img = img.copy()

    binary_image = image_preprocess(img)
    hpp = horizontal_projections(binary_image)

    # Rows whose projection is below half the projection range are candidates
    # for the space between text lines.
    threshold = (np.max(hpp) - np.min(hpp)) // 2
    peaks = find_peak_regions(hpp, threshold)
    if not peaks:
        # Nothing to split on: treat the whole page as one line.
        return " " + extract_handwritten(original_img)

    peaks_indexes = np.array(peaks)[:, 0].astype(int)

    # np.diff()[i] compares elements i and i+1, so a jump found at position i
    # means a new run starts at i + 1. Splitting at i (as before) moved the
    # last row of every run into the following group.
    run_starts = np.where(np.diff(peaks_indexes) > 1)[0] + 1
    peak_groups = np.split(peaks_indexes, run_starts)

    # Single-row runs are treated as noise from the threshold.
    peak_groups = [group for group in peak_groups if len(group) > 1]

    # Binarize the page for path planning.
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    binary_image = get_binary(gray)

    cut_rows = []
    for group in peak_groups:
        top = int(group[0])
        nmap = binary_image[top:group[-1]]
        mid_row = int(nmap.shape[0] / 2)
        path = astar(nmap, (mid_row, 0), (mid_row, nmap.shape[1] - 1))
        if len(path):
            cut_rows.append(int(np.max(np.array(path)[:, 0])) + top)
        else:
            # astar returns [] when it finds no route (or start == goal);
            # fall back to a straight cut through the middle of the group.
            cut_rows.append(mid_row + top)

    if not cut_rows:
        return " " + extract_handwritten(original_img)

    # Slice from the top of the page to the first cut, then between
    # successive cuts.
    bounds = [0] + cut_rows
    text = ""
    for upper, lower in zip(bounds, bounds[1:]):
        line_image = original_img[upper:lower]
        if line_image.shape[0] == 0:
            # Skip zero-height strips instead of sending them to the OCR model.
            continue
        text = text + " " + extract_handwritten(line_image)
    return text

def extract_handwritten(img):
    """Run the module-level TrOCR ``processor``/``model`` on one image and return the decoded text."""
    encoded = processor(images=img, return_tensors="pt")
    token_ids = model.generate(encoded.pixel_values)
    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0]

def extract_text_pdf(filepath, is_handwritten):
    """Extract text from a PDF, using the handwritten or the scanned-page extractor depending on ``is_handwritten``."""
    extractor = extract_handwritten_text_pdf if is_handwritten else extract_image_text_pdf
    return extractor(filepath)

def extract_image_text_pdf(filepath):
    """Render every page of a PDF to an image and OCR it with Tesseract.

    Returns the concatenated page text, or a string starting with "Error:"
    if anything goes wrong.
    """
    try:
        pages = convert_from_path(filepath, poppler_path=r"poppler-24.02.0\Library\bin")
        return "".join(pytesseract.image_to_string(page) for page in pages)
    except Exception as e:
        return "Error: Unable to extract text from pdf scanned document. " + str(e)

def extract_handwritten_text_pdf(filepath):
    """Extract handwritten text from every page of a PDF.

    Each page is rendered with ``convert_from_path``, written to a temporary
    PNG file, and handed to ``extract_handwritten_text_images``, which takes a
    file path.

    Returns the concatenated text of all pages, or a string starting with
    "Error:" if anything goes wrong.
    """
    import os
    import tempfile

    try:
        # The previous code called convert_pdf_to_images, which is not defined
        # anywhere, so this function always returned the error string.
        pages = convert_from_path(filepath, poppler_path=r"poppler-24.02.0\Library\bin")
        text = ""
        with tempfile.TemporaryDirectory() as tmp_dir:
            for page_number, page in enumerate(pages):
                # The line-segmentation routine reads its input from disk, so
                # each rendered page is saved before it is processed.
                page_path = os.path.join(tmp_dir, f"page_{page_number}.png")
                page.save(page_path)
                text += extract_handwritten_text_images(page_path)
        return text
    except Exception as e:
        return "Error: Unable to extract text from pdf handwritten document. " + str(e)

def extract_text_docx(filepath):
    """Return the paragraph text of a Word document, one paragraph per line.

    The joined text is stripped of surrounding whitespace. Any failure is
    reported as a string starting with "Error:".
    """
    try:
        with open(filepath, 'rb') as doc_file:
            paragraphs = Document(doc_file).paragraphs
            return "\n".join(paragraph.text for paragraph in paragraphs).strip()
    except Exception as e:
        return "Error: Unable to extract text from Word document. " + str(e)

def extract_text_pptx(filepath):
    """Return the text of every text frame in a PowerPoint file.

    Each frame's text is followed by a newline; the combined text is stripped
    of surrounding whitespace. Any failure is reported as a string starting
    with "Error:".
    """
    try:
        chunks = []
        for slide in Presentation(filepath).slides:
            for shape in slide.shapes:
                if shape.has_text_frame:
                    chunks.append(shape.text_frame.text + "\n")
        return "".join(chunks).strip()
    except Exception as e:
        return "Error: Unable to extract text from PowerPoint document. " + str(e)

def extract_text_image(filename,is_handwritten):
    """OCR an image file.

    Handwritten images go through ``extract_handwritten_text_images``; all
    others are read with OpenCV and passed to Tesseract. Any failure is
    reported as a string starting with "Error:".
    """
    try:
        if not is_handwritten:
            image = cv2.imread(filename)
            return pytesseract.image_to_string(image)
        return extract_handwritten_text_images(filename)
    except Exception as e:
        return "Error: Unable to extract text from image. " + str(e)

def convert_to_bullet_points(text):
    """Turn text into a "- " bullet list, one bullet per sentence.

    The text is split on ". " and each fragment is stripped of surrounding
    whitespace. Fragments that are empty after stripping (for example from
    empty input or a trailing ". ") are skipped so no blank bullets appear.

    Args:
        text: The text to convert.

    Returns:
        The bullets joined with newlines; an empty string if nothing remains.
    """
    points = (fragment.strip() for fragment in text.split('. '))
    return '\n'.join(f"- {point}" for point in points if point)

def extract_text(filepath, is_handwritten):
    """Detect the file's MIME type and dispatch to the matching extractor.

    PDF and image files honour ``is_handwritten``; Word and PowerPoint files
    ignore it. Unrecognised types yield "Error: Unsupported file type".
    """
    mime_type = magic.Magic(mime=True).from_file(filepath)

    word_types = ("application/msword", "application/vnd.openxmlformats-officedocument.wordprocessingml.document")
    slide_types = ("application/vnd.ms-powerpoint", "application/vnd.openxmlformats-officedocument.presentationml.presentation")
    image_types = ("image/jpeg", "image/png")

    if mime_type == "application/pdf":
        return extract_text_pdf(filepath, is_handwritten)
    if mime_type in word_types:
        return extract_text_docx(filepath)
    if mime_type in slide_types:
        return extract_text_pptx(filepath)
    if mime_type in image_types:
        return extract_text_image(filepath, is_handwritten)
    return "Error: Unsupported file type"


def summarize_text(file, is_handwritten, length):
    """Extract the text of an uploaded file and summarize it as bullet points.

    Args:
        file: The uploaded file: either an object with a ``name`` attribute
            holding its path, or the path itself. ``None`` means no upload.
        is_handwritten: Whether PDF/image input should use the handwriting OCR.
        length: Summary length as a percentage of the word count; only used
            when the text has more than 50 tokens.

    Returns:
        The bullet-point summary, or the extraction result unchanged when it
        is empty or an "Error: ..." message.

    Raises:
        ValueError: If the extracted text is only whitespace or tokenizes to
            nothing.
    """
    if file is None:
        return "Error: No file uploaded."

    # Accept both file objects exposing .name and plain path strings.
    filepath = getattr(file, "name", file)
    text = extract_text(filepath, is_handwritten)

    # Every extractor reports failure with a message that starts with
    # "Error:". Checking only the prefix avoids rejecting documents that
    # merely contain that word.
    if not text or text.startswith("Error:"):
        return text

    if not text.strip():
        raise ValueError("Input text is empty. Please provide a valid text.")

    # Tokenize the text
    words = word_tokenize(text)

    # Check if tokenization was successful
    if not words:
        raise ValueError("Tokenization failed or resulted in an empty list.")

    # Longer texts are shortened to the requested percentage; short texts may
    # use up to their full length.
    word_count = len(words)
    if word_count > 50:
        max_length = int(word_count * (length / 100))
    else:
        max_length = int(word_count)
    # Keep min_length at 1 or more, as the original comment intended.
    min_length = max(1, int(max_length * 0.5))

    # Generate the summary
    summary = summarizer(text, min_length=min_length, max_length=max_length)

    # Extract the summarized text
    summarized_text = summary[0]['summary_text']
    return convert_to_bullet_points(summarized_text)

In [3]:
# Build the Gradio interface: a file upload, a "handwritten" checkbox and a
# summary-length slider feed summarize_text, whose result is shown in a textbox.
with gr.Blocks(theme=gr.themes.Soft(),css=""".title {text-align: center; font-size: 24px; margin-bottom: 20px;} .description {text-align:left; font-size: 16px; margin-bottom: 20px;} .file-input-wrapper .gr-file {height: auto; max-height:20px;overflow: hidden;}""") as demo:
    gr.HTML("""<div class='title'>Notes Summarization</div>""")
    gr.HTML("""<div class='description'>
    Follow the steps below to summarize your notes:<br>
    1. Upload your file.<br>
    2. Specify if it is handwritten.<br>
    3. Choose the desired summary length.<br>
    4. Click the 'Summarize' button and wait for a few minutes.</div>""")
    
    with gr.Row():
        # Gradio expects file extensions with a leading dot; entries without
        # one are interpreted as file-type categories. ".jpg" is included so
        # the bundled handwritten example is an accepted upload.
        file_input = gr.File(label="Upload File", file_types=[".pdf", ".docx", ".pptx", ".jpg", ".jpeg", ".png"],elem_id="file-input" ,interactive=True)
        with gr.Column():
            is_handwritten = gr.Checkbox(label="Handwritten?", interactive=True, info="is the doc contains handritten text.")
            length = gr.Slider(50, 100, step=10, label="Summary Length", interactive=True)
    
    # Example files for typed or scanned documents.
    examples = gr.Examples(
        examples=["Files/SOFTWARE ENGINEERING.pdf","Files/Yolo.docx", "Files/tensor.pptx"],
        inputs=[file_input],
        label="typed/scanned Examples"
    )
    
    # Example that also ticks the "Handwritten?" checkbox.
    examples = gr.Examples(
        examples=[
            ["images/handwritten.jpg", True]
        ],
        inputs=[file_input, is_handwritten],
        label="handwritten Example"
    )
        
    summary_output = gr.Textbox(label="Summary",lines=5,max_lines=20, interactive=False)
    submit_button = gr.Button("Summarize")
    
    submit_button.click(
        summarize_text, 
        inputs=[file_input, is_handwritten, length], 
        outputs=summary_output
    )

# share=True also publishes the app at a temporary public URL.
demo.launch(share=True)

Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://70464cf4e8dd3ffa49.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




In [6]:
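# One-time setup (run once, before cell 1): download the pretrained models and save them
# under the local directory names that cell 1 loads from.
#from transformers import pipeline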
#from transformers import TrOCRProcessor, VisionEncoderDecoderModel

#processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
#model =VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten')
#summarizer=pipeline("summarization",model="pszemraj/led-base-book-summary")

#processor.save_pretrained("ocr_processor")
#model.save_pretrained("ocr_model")
#summarizer.save_pretrained("text_summarizer")

config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/648M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]