<a href="https://colab.research.google.com/github/krishnamalani1164/opiniion_analyzer/blob/main/Griplex_digital.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
#Approach

In [9]:
import nltk
nltk.download('punkt_tab', quiet=True)

True

In [10]:
# Install the required libraries for PDF processing and NLP
!pip install PyPDF2 pdfplumber nltk



In [11]:
import os
import shutil
import zipfile
import re
import PyPDF2
import pdfplumber
import nltk
from PyPDF2 import PdfWriter, PdfReader
from google.colab import files

# Download the necessary NLTK resource
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)  # This is the crucial fix

class OpinionExtractor:
    def __init__(self):
        # Create a working directory
        self.work_dir = "/content/work"
        self.extracted_dir = os.path.join(self.work_dir, "extracted")
        if not os.path.exists(self.work_dir):
            os.makedirs(self.work_dir)
        if not os.path.exists(self.extracted_dir):
            os.makedirs(self.extracted_dir)

    def upload_zip_file(self):
        """Prompts the user to upload a ZIP file and returns its name and content."""
        print("Please upload the ZIP file containing newspapers...")
        uploaded = files.upload()
        zip_filename = list(uploaded.keys())[0]
        zip_content = uploaded[zip_filename]
        # Save the uploaded file to the working directory
        zip_path = os.path.join(self.work_dir, zip_filename)
        with open(zip_path, "wb") as f:
            f.write(zip_content)
        return zip_path, zip_content

    def _extract_text_from_page(self, pdf_path, page_num):
        """Extracts text from a specific page of a PDF."""
        try:
            with pdfplumber.open(pdf_path) as pdf:
                page = pdf.pages[page_num]
                text = page.extract_text()
                return text if text else ""
        except Exception as e:
            print(f"Error extracting text from page {page_num} of {pdf_path}: {e}")
            return ""

    def _is_opinion_page(self, text):
        """Checks if the page text contains keywords commonly found on opinion/editorial pages."""
        opinion_keywords = [
            "opinion", "editorial", "editorials", "op-ed", "letters to the editor", "views"
        ]
        text_lower = text.lower()
        if any(re.search(r'\b' + re.escape(keyword) + r'\b', text_lower) for keyword in opinion_keywords):
            return True
        return False

    def process_newspapers(self, zip_path):
        """
        Unzips files, processes each PDF, and extracts opinion pages.
        Returns a list of paths to the extracted opinion PDFs and a count of processed files.
        """
        extracted_pdfs = []
        processed_count = 0

        # Unzip the uploaded file
        print(f"Extracted to {self.extracted_dir}")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(self.extracted_dir)

        newspaper_files = [f for f in os.listdir(self.extracted_dir) if f.endswith('.pdf')]

        print("\nProcessing newspapers...")
        for filename in newspaper_files:
            processed_count += 1
            input_path = os.path.join(self.extracted_dir, filename)

            try:
                reader = PdfReader(input_path)
                num_pages = len(reader.pages)

                print(f"\nProcessing: {filename}")
                found_pages_in_file = 0

                for i in range(num_pages):
                    page_text = self._extract_text_from_page(input_path, i)
                    if self._is_opinion_page(page_text):
                        writer = PdfWriter()
                        writer.add_page(reader.pages[i])
                        output_filename = f"{os.path.basename(filename)}_page_{i+1}.pdf"
                        output_path = os.path.join(self.work_dir, output_filename)

                        with open(output_path, "wb") as output_pdf:
                            writer.write(output_pdf)

                        extracted_pdfs.append(output_path)
                        found_pages_in_file += 1

                if found_pages_in_file == 0:
                    print(f"  ❌ No opinion pages found in {filename}")

            except Exception as e:
                print(f"Error processing {input_path}: \n{e}")
                print(f"  ❌ No opinion pages found in {filename}")

        return extracted_pdfs, processed_count

    def merge_pdfs(self, pdf_list, output_path):
        """Merges a list of PDF files into a single consolidated PDF."""
        merger = PyPDF2.PdfMerger()
        for pdf in pdf_list:
            merger.append(pdf)

        with open(output_path, "wb") as output_file:
            merger.write(output_file)

    def cleanup(self):
        """Removes the temporary working directory."""
        if os.path.exists(self.work_dir):
            shutil.rmtree(self.work_dir)
            print("\n✅ Temporary files cleaned up.")

In [12]:
# Cell 3: Main execution function
def main():
    extractor = OpinionExtractor()

    try:
        # Upload ZIP file
        zip_filename, zip_content = extractor.upload_zip_file()
        print(f"Uploaded: {zip_filename} ({len(zip_content)} bytes)")

        # Process the newspapers
        extracted_pdfs, processed_count = extractor.process_newspapers(zip_filename)

        # Merge all extracted opinion PDFs
        if extracted_pdfs:
            final_output = "/content/work/consolidated_opinions.pdf"
            extractor.merge_pdfs(extracted_pdfs, final_output)

            print(f"\n🎉 Successfully processed {processed_count} files!")
            print(f"📄 Created consolidated PDF: {final_output}")

            # Download the final file
            print("\n📥 Downloading final PDF...")
            files.download(final_output)

            # Also show some stats
            print(f"\n📊 Statistics:")
            print(f"   - Total opinion PDFs extracted: {len(extracted_pdfs)}")
            print(f"   - Total files processed: {processed_count}")

        else:
            print("❌ No opinion pages found in any newspaper")

    except Exception as e:
        print(f"❌ Error: {e}")
        print("Please make sure you uploaded a valid ZIP file containing PDF newspapers.")

# Run the main function
if __name__ == "__main__":
    main()

Please upload the ZIP file containing newspapers...


Saving newspapers.zip to newspapers (3).zip
Uploaded: /content/work/newspapers (3).zip (67473285 bytes)
Extracted to /content/work/extracted

Processing newspapers...

Processing: The Hindu HD-19.pdf

Processing: IE-Mumbai 19-09.pdf
  ❌ No opinion pages found in IE-Mumbai 19-09.pdf

Processing: FE - Mumbai  - 19-09-2025.pdf




  ❌ No opinion pages found in FE - Mumbai  - 19-09-2025.pdf

Processing: Tribune_TheTribune_19-09-2025.pdf
  ❌ No opinion pages found in Tribune_TheTribune_19-09-2025.pdf

Processing: Bengaluru_Mint_19-09-2025.pdf





🎉 Successfully processed 5 files!
📄 Created consolidated PDF: /content/work/consolidated_opinions.pdf

📥 Downloading final PDF...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


📊 Statistics:
   - Total opinion PDFs extracted: 10
   - Total files processed: 5
