# Libraries

In [None]:
!pip install PyPDF2                 # to manipulate PDF documents by splitting, merging, cropping
!pip install pycryptodome           # for encryption, decryption, hashing
!pip install PyMuPDF                # PDF viewer -  extract text, and manipulate PDF content
!pip install pdfminer.six           # for extracting text, images, and metadata from PDF files
!pip install pdf2image              # converts PDF files into a sequence of images
!apt-get install -y tesseract-ocr
!pip install pytesseract            # Python wrapper for Google's Tesseract-OCR Engine
!pip install gTTS                   # convert text into speech using Google's voice synthesis
!pip install poppler-utils          # includes tools for converting PDFs to different formats (PDF to PNG)
!pip install transformers           # simplifying the implementation of NLP tasks by offering pre-trained models and tools

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 23 not upgraded.


In [None]:
# creation, reading, and extraction of ZIP archive
from zipfile import ZipFile

# for extracting text, images, and metadata from PDF files
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

from io import StringIO

# module for encoding and decoding binary data using base64 encoding.
import base64

#------- OCR ------------
import pdf2image
import pytesseract
import fitz   # module in PyMuPDF
import os     # to interact with the os, including functions for file manipulation and directory operations
import glob   # for pattern matching files and directories

from pdf2image import convert_from_path
from pytesseract import Output, TesseractError
# from tkinter import Tk, Frame, Button, filedialog
from PyPDF2 import PdfReader
from PIL import Image         # Pillow library, working with images, including opening, manipulating, and saving images in various formats
from google.colab import files
from gtts import gTTS
from IPython.display import Audio
from transformers import pipeline

import tensorflow as tf
from tensorflow.keras import datasets,layers,models
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import numpy as np
import cv2
from keras.models import load_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing import image_dataset_from_directory
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import VGG16

from sklearn.model_selection import train_test_split

import random

from google.colab import drive
drive.mount('/content/drive')

CLASSES_LIST=["aadhar","passport","driver license","pan","voter"]
IMG_SIZE=224

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# OCR engine - Tesseract for png file

In [None]:
def png_to_text():
    folder_path = "/content/output_images"
    png_files = glob.glob(f"{folder_path}/*.png")
    extracted_text = {}

    for file_name in png_files:
        img = Image.open(file_name)
        text = pytesseract.image_to_string(img)
        extracted_text[file_name] = text

    return extracted_text

# Convert PDF to IMAGE


In [None]:
def convert_pdf_to_images(pdf_path, output_folder, menu_option):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    pdf_document = fitz.open(pdf_path)

    for page_number in range(pdf_document.page_count):
        page = pdf_document.load_page(page_number)
        image_matrix = page.get_pixmap()
        img = Image.frombytes("RGB", [image_matrix.width, image_matrix.height], image_matrix.samples)
        image_path = f"{output_folder}/page_{page_number + 1}.png"
        img.save(image_path, "PNG")

    pdf_document.close()
    print(f"PDF pages converted to images in the folder: {output_folder}")

    # if classification of document ; do not need text extraction
    if(menu_option == 3):
      return

    text = png_to_text()
    return text

# Uploading files

In [None]:
def upload_pdf_and_convert(menu_option):
    uploaded = files.upload()
    pdf_path = None

    for name, data in uploaded.items():
        if name.endswith('.pdf'):
            with open(name, 'wb') as f:
                f.write(data)
                pdf_path = name
                print(f"Uploaded {name}")

    if pdf_path:
        output_folder = 'output_images'
        text = convert_pdf_to_images(pdf_path, output_folder, menu_option)
        return text
    else:
        print("\n-> Please upload a PDF file.")


#Text to speech

In [None]:
def text_to_audio():
    print("-> Please upload a PDF file.")
    menu_option = 1
    extracted_text = upload_pdf_and_convert(menu_option)

    cleaned_text = [text.replace('\n', ' ') for text in extracted_text.values()]

    print("Extracted text : ",cleaned_text)

    combined_text = ' '.join(cleaned_text)
    tts = gTTS(text=combined_text, lang='en')
    tts.save('output.mp3')
    print("\n\n# Audio has been created!\n\n")
    return Audio('output.mp3', autoplay=True)

# Read Aloud Function


In [None]:
def click_read_aloud():

    print("\nRead Aloud option chosen\n")
    text_to_audio()


# Document Summary Function

In [None]:
def summarize_text(text):
    summarizer = pipeline("summarization")
    summary = summarizer(text, max_length=150, min_length=30, do_sample=False)
    return summary[0]['summary_text']

def summarize_extracted_text(extracted_text):
    summarized_text = {}
    for file_name, text in extracted_text.items():
        summarized_text[file_name] = summarize_text(text)
    return summarized_text

In [None]:
def click_document_summary():
    print("\nDocument Summary option chosen\n")
    print("-> Please upload a PDF file.")
    menu_option = 2
    extracted_text = upload_pdf_and_convert(menu_option)

    summarized_text = summarize_extracted_text(extracted_text)

    cleaned_text = [text.replace('\n', '') for text in extracted_text]
    print("\nExtracted text : ",cleaned_text)

    for file_name, summary in summarized_text.items():
      print(f"Summary of {file_name}:\n{summary}\n{'-'*50}")

# Document Classification Function

In [None]:
def classify_document_images(image_folder, model):
    # Iterate through each item in the folder
    for item in os.listdir(image_folder):
        item_path = os.path.join(image_folder, item)

        # Check if the item is a file
        if os.path.isfile(item_path):
            # Load and preprocess the image
            img = image.load_img(item_path, target_size=(IMG_SIZE, IMG_SIZE))
            img_array = image.img_to_array(img)
            img_array = np.expand_dims(img_array, axis=0)
            img_array /= 255.0  # Normalize the image

            # Make a prediction
            prediction = model.predict(img_array)
            predicted_class = np.argmax(prediction)
            predicted_class_name = CLASSES_LIST[predicted_class]

            print(f"Item: {item}, Predicted Class: {predicted_class_name}")

In [None]:
def click_document_classification():

    print("\nDocument Classification option chosen")
    menu_option = 3
    upload_pdf_and_convert(menu_option)

    model = load_model('finalmodel.h5')

    classify_images_folder = '/content/output_images'
    classify_document_images(classify_images_folder, model)

# Main Menu

In [None]:
def exit_menu_page():
    print("Exiting!!")

In [None]:
def main_menu():
    print("Welcome to the Main Menu")
    print("1. Read Aloud")
    print("2. Document Summary")
    print("3. Document Classification")
    print("4. Exit")

    while True:
        try:
            option = int(input("Please enter the option number: "))
            if option == 1:
                click_read_aloud()
            elif option == 2:
                click_document_summary()
            elif option == 3:
                click_document_classification()
            elif option == 4:
                exit_menu_page()
                break  # Exit the loop
            else:
                print("\n # Invalid option. Please enter a number between 1 and 4.")
        except ValueError:
            print("\n # Invalid input. Please enter a number.")

# Calling the menu function
main_menu()

Welcome to the Main Menu
1. Read Aloud
2. Document Summary
3. Document Classification
4. Exit

Read Aloud option chosen

-> Please upload a PDF file.


Saving audio testing and summarization.pdf to audio testing and summarization.pdf
Uploaded audio testing and summarization.pdf
PDF pages converted to images in the folder: output_images
Extracted text :  ['1.2 Evolution of Conversational AI: An In-depth Exploration     Overs  ‘The evolution of conversational AI represents the journey from rudimentary rule-based chatbots to advanced systems capable of understanding context, generating human-like responses, and handling ‘complex interactions. This progression has been driven by advancements in machine leaning, neural networks, and an increasing emphasis on contextual understanding.  Objectives: 1. Improve User Experience: The primary objective has always been to enhance user interactions by ‘making chathots more natura, intelligent, and capable of understanding user intent.  2. Adaptability: As conversational Al progressed the focus shifted to creating systems that can adapt and lear from user interactions, enabling them t handle a wide 

Saving audio testing and summarization.pdf to audio testing and summarization (1).pdf
Uploaded audio testing and summarization (1).pdf
PDF pages converted to images in the folder: output_images


No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.



Extracted text :  ['/content/output_images/page_1.png', '/content/output_images/page_2.png']
Summary of /content/output_images/page_1.png:
 Evolution of Conversational AI represents the journey from rudimentary rule-based chatbots to advanced systems capable of understanding context, generating human-like responses, and handling complex interactions . Modern AI systems aim to understand the context of a conversation, considering previous interactions and user intent .
--------------------------------------------------
Summary of /content/output_images/page_2.png:
 Rule-based, scripted and neural network-based chatbot chatbots are among the most realistic challenges to AI chatbots . Google's "Meena" chatbot, based on Transformer architecture, demonstrated significant improvements in natural language generation .
--------------------------------------------------

Document Classification option chosen


Saving test pdf.pdf to test pdf.pdf
Uploaded test pdf.pdf
PDF pages converted to images in the folder: output_images
Item: page_10.png, Predicted Class: driver license
Item: page_8.png, Predicted Class: driver license
Item: page_11.png, Predicted Class: aadhar
Item: page_7.png, Predicted Class: aadhar
Item: page_4.png, Predicted Class: voter
Item: page_1.png, Predicted Class: driver license
Item: page_5.png, Predicted Class: driver license
Item: page_9.png, Predicted Class: aadhar
Item: page_2.png, Predicted Class: voter
Item: page_6.png, Predicted Class: driver license
Item: page_3.png, Predicted Class: aadhar
