In [3]:
# ! pip install PyPDF2
# ! pip install transformers
# ! pip install torch
# ! pip install nltk
# ! pip install gtts

In [4]:
import PyPDF2
import nltk
from transformers import pipeline
import re
import os
from gtts import gTTS

# Download the NLTK 'punkt' and 'stopwords' resources at import time.
# NOTE(review): nothing in the visible code uses these resources - confirm
# they are still needed before keeping the downloads.
nltk.download('punkt')
nltk.download('stopwords')

# Build the Hugging Face summarization pipeline once, at module level, so that
# generate_summary() can reuse it on every call. This cell loads the
# facebook/bart-large-cnn checkpoint specifically.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Text cleaning function
def clean_text(text):
    """Normalize whitespace and strip punctuation from ``text``.

    Every run of whitespace (spaces, tabs, newlines) is first collapsed into a
    single space; afterwards every character that is neither a word character
    nor whitespace is removed. Leading/trailing spaces are not trimmed.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    return re.sub(r'[^\w\s]', '', collapsed)

# PDF Text Extraction Function
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    Pages are joined with a newline so that the last word of one page does not
    fuse with the first word of the next.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text of all pages, in page order, separated by newlines.
        Pages without extractable text contribute an empty string.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Extraction must happen while the file is still open. `or ""` guards
        # against a falsy result (e.g. None) for pages that hold no text,
        # which would otherwise break the join below.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    return "\n".join(page_texts)

# Function to generate summary from text
def generate_summary(text, max_length=200, min_length=50):
    """Summarize ``text`` with the module-level ``summarizer`` pipeline.

    Args:
        text: The text to summarize.
        max_length: Upper bound passed to the pipeline for the summary length.
        min_length: Lower bound passed to the pipeline for the summary length.

    Returns:
        The ``summary_text`` of the first result returned by the pipeline.
    """
    results = summarizer(
        text,
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
        # A whole PDF can easily exceed the model's maximum input length;
        # truncate the input instead of letting the model fail on it.
        truncation=True,
    )
    return results[0]['summary_text']

# Function to create text-to-speech for the summary
def text_to_speech(summary, language='en', filename="summary_audio.mp3"):
    """Convert ``summary`` to speech with gTTS and save it as an audio file.

    Args:
        summary: The text to be spoken.
        language: Language code passed to gTTS as ``lang``.
        filename: Path of the audio file to write. Defaults to
            ``"summary_audio.mp3"``, the name that was previously hard-coded,
            so existing callers behave as before.

    Returns:
        The path the audio was saved to (``filename``).
    """
    tts = gTTS(text=summary, lang=language, slow=False)
    tts.save(filename)
    print(f"Audio summary saved as {filename}")
    return filename

# Main function for PDF summary generation
def main(pdf_path):
    """Run the whole pipeline on one PDF: extract, clean, summarize, speak.

    Progress messages and the generated summary are printed to stdout.

    Args:
        pdf_path: Path of the PDF file to summarize.

    Returns:
        A ``(summary, audio_file)`` tuple: the summary text and the path of
        the audio file produced from it.
    """
    # Extraction and cleaning: the raw page text is fed straight into the cleaner.
    print("Extracting text from PDF...")
    cleaned_text = clean_text(extract_text_from_pdf(pdf_path))

    # Summarization of the cleaned text.
    print("Generating summary...")
    summary = generate_summary(cleaned_text)

    # Show the summary under its heading (heading and summary on separate lines).
    print("\nGenerated Summary:", summary, sep="\n")

    # Audio rendering of the summary; this step always runs.
    print("\nConverting summary to audio...")
    return summary, text_to_speech(summary)

# Example: Process a PDF file
if __name__ == "__main__":
    # Runs only when this file is executed directly (or as a notebook cell),
    # not when it is imported as a module.
    # Replace with your own PDF file path; a relative path like this one is
    # resolved against the current working directory when the file is opened.
    pdf_path = 'Automatic Summary Generator for Study Materials.pdf'
    main(pdf_path)

[nltk_data] Downloading package punkt to /Users/anuragm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anuragm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Device set to use mps:0


Extracting text from PDF...
Generating summary...

Generated Summary:
Automatic Summary Generator for Study Materials uses Natural Language Processing NLP techniques. System allows users to upload study materials in the form of PDF files or raw text processes the content through a pretrained NLP model such as BAR T or T5.

Converting summary to audio...
Audio summary saved as summary_audio.mp3


# Detailed Explanation:
## PDF Text Extraction:

We use PyPDF2 to read the content of a PDF file. The function extract_text_from_pdf(pdf_path) reads all pages of the PDF and extracts the text.

## Text Preprocessing:

The clean_text(text) function cleans the extracted text by removing unwanted characters, such as extra spaces and special symbols, using regular expressions.

## Text Summarization:

The generate_summary(text) function uses the Hugging Face transformers pipeline with a pre-trained BART model to generate a concise summary of the text. You can also use models like T5 (t5-small) or other summarization models.

## Text-to-Speech Conversion:

The text_to_speech(summary) function uses the Google Text-to-Speech (gTTS) library to convert the summary into an audio file (.mp3 format), making it easier for students to listen to the summary for revision.

## Main Program:

The main(pdf_path) function ties everything together: it reads the PDF, cleans the text, generates a summary, prints it, and converts the summary to speech.

4. Optional Feature Enhancements:
Keyword Extraction: You can use libraries like RAKE or spaCy to extract keywords from the text. This would help students identify important concepts for deeper revision.

Quiz Generation: To automatically generate quizzes, you could use NLP techniques like Named Entity Recognition (NER) or keyword extraction to form questions like "What is the definition of X?" or "Who discovered Y?". Libraries like transformers can be used for generating contextual questions.

Multiple File Formats: Extend the tool to accept other formats like .txt, .docx, etc., by using appropriate libraries like python-docx or textract.

5. Example Output:
Assuming you upload a PDF file, the system will extract the text, generate a summary, and create an audio file. The result might look like this:

```plaintext
Generated Summary:
This paper discusses the advancements in AI, machine learning, and neural networks. It highlights recent breakthroughs in natural language processing and computer vision technologies. Key areas of focus include self-driving cars, intelligent assistants, and AI-powered medical diagnostics. These innovations are expected to revolutionize multiple industries in the coming years.

Converting summary to audio...
Audio summary saved as summary_audio.mp3
```
6. Running the Code:
Ensure that the pdf_path variable is set to the location of the PDF you want to summarize.

The code will print a text summary and save a spoken version of it as an audio file (summary_audio.mp3) in the current working directory.

This implementation provides the basic structure for an Automatic Summary Generator for Study Materials. You can further expand it by adding additional features like keyword extraction, automatic quiz generation, and integration with more file formats.