<a href="https://colab.research.google.com/github/kutluhanNG/MLcodes/blob/main/PDF_Summarizer_with_Streamlit_and_Huggingface.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install streamlit

In [None]:
!pip install PyPDF2

In [None]:
import streamlit as st #Streamlit library for building interactive web apps.
import PyPDF2 #PyPDF2 for reading and extracting text from PDF files.
import re #Importing the built-in Python regular expressions, we will be using it for splitting text into sentences.
from transformers import pipeline

@st.cache_resource
def load_summarizer():
    """Build and return the Hugging Face summarization pipeline.

    The function is wrapped in ``st.cache_resource`` so the pipeline object is
    created once and reused instead of being rebuilt on every call.

    Returns:
        The object returned by ``pipeline("summarization", ...)`` for the
        ``google/pegasus-xsum`` model. Swap the model name here to use a
        different summarization model.
    """
    return pipeline("summarization", model="google/pegasus-xsum")

def split_text(text, max_length=1000):
    """Split ``text`` into chunks of whole sentences of roughly ``max_length`` characters.

    Sentences are detected by splitting after ``.``, ``!`` or ``?`` followed by
    one or more spaces. Consecutive sentences are packed into the current chunk
    until adding the next one would push it past ``max_length``; at that point
    the chunk is closed and a new one is started with that sentence.

    Args:
        text: The text to split.
        max_length: Target maximum chunk size, in characters.

    Returns:
        A list of non-empty chunk strings with surrounding whitespace stripped.
        Empty or whitespace-only input yields an empty list. A single sentence
        longer than ``max_length`` is never cut; it becomes a chunk of its own
        and therefore may exceed ``max_length``.
    """
    sentences = re.split(r'(?<=[.!?]) +', text)

    chunks = []
    pending = []  # Sentences collected for the chunk being built.
    pending_len = 0  # Length of the pending sentences, counting one separator space after each.

    def flush():
        # Close the chunk being built. Empty results are dropped: they occur when
        # nothing is pending yet (the very first sentence is already too long)
        # or when the text holds only whitespace.
        chunk = " ".join(pending).strip()
        if chunk:
            chunks.append(chunk)

    for sentence in sentences:
        if pending_len + len(sentence) > max_length:
            flush()
            pending = []
            pending_len = 0
        pending.append(sentence)
        pending_len += len(sentence) + 1

    flush()  # Whatever is left after the last sentence forms the final chunk.
    return chunks

def extract_text_from_pdf(pdf_file):
    """Return the text of every page of ``pdf_file`` concatenated into one string.

    Args:
        pdf_file: The PDF source handed to ``PyPDF2.PdfReader`` (in this app,
            the file object produced by the Streamlit uploader).

    Returns:
        The page texts joined in page order with no separator between pages.
        A page whose ``extract_text()`` result is falsy contributes an empty
        string.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # `or ""` substitutes an empty string whenever a page yields no text.
    return "".join(page.extract_text() or "" for page in reader.pages)

def main():
    """Render the PDF Summarizer page.

    Flow: show the title and description, let the user upload a PDF, extract
    its text, and - once the "Summarize" button is pressed - summarize the text
    and display the result. Text longer than 1000 characters is split into
    chunks with ``split_text``; each chunk is summarized separately and the
    partial summaries are joined with spaces.

    If no text can be extracted from the PDF, a warning is shown and no
    summarization is offered, so the model is never called on empty input.
    """
    chunk_size = 1000  # Character threshold above which the text is summarized chunk by chunk.

    st.title("PDF Summarizer")  # Sets the title of the web application.
    st.write("Upload a PDF and get a summary of its contents!")  # Writes a short description.

    # File uploader widget that accepts only PDF files; the upload is stored in uploaded_file.
    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
    if uploaded_file is None:  # Nothing uploaded yet, so there is nothing to process.
        return

    # Show a spinner while the text is being extracted from the uploaded PDF.
    with st.spinner("Extracting text from PDF..."):
        text = extract_text_from_pdf(uploaded_file)

    # Stop here when the extraction produced no usable text, instead of
    # sending an empty string to the summarizer.
    if not text.strip():
        st.warning("No extractable text was found in this PDF.")
        return

    st.success("Text extracted!")
    st.write(f"Extracted {len(text)} characters.")

    if not st.button("Summarize"):  # Wait until the "Summarize" button is pressed.
        return

    summarizer = load_summarizer()  # Cached summarization pipeline.
    with st.spinner("Summarizing text..."):
        if len(text) > chunk_size:
            # Long text: summarize chunk by chunk, skipping any empty chunk.
            chunks = [chunk for chunk in split_text(text, max_length=chunk_size) if chunk]
        else:
            # Short text: the summarizer runs on the entire text at once.
            chunks = [text]

        # truncation=True asks the pipeline to cut inputs that are too long for
        # the model, which protects against chunks that exceed chunk_size
        # (e.g. text without sentence punctuation).
        summaries = [
            summarizer(
                chunk,
                max_length=130,
                min_length=30,
                do_sample=False,
                truncation=True,
            )[0]['summary_text']
            for chunk in chunks
        ]
        final_summary = " ".join(summaries)  # Joins all the individual summaries.

    st.write("### Summary")
    st.write(final_summary)

# Entry point: build the page only when this file is executed as the main module.
if __name__ == "__main__":
    main()

In [None]:
# NOTE(review): this points `streamlit run` at a file named colab_kernel_launcher.py, not at a file containing the app code defined above - confirm the intended script path.
!streamlit run /usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py