# PDF License Analyzer
This notebook extracts URLs from PDF pages and analyzes their licenses.

## Install and import required libraries

In [1]:
!pip install pdfplumber requests beautifulsoup4 pandas fpdf2



Collecting fpdf2


  Downloading fpdf2-2.8.2-py2.py3-none-any.whl.metadata (67 kB)


Downloading fpdf2-2.8.2-py2.py3-none-any.whl (236 kB)


Installing collected packages: fpdf2


Successfully installed fpdf2-2.8.2


In [2]:
import pdfplumber
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

## Function to extract URLs from text

In [3]:
def extract_urls(text):
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    return re.findall(url_pattern, text)

## Function to extract text from webpage

In [4]:
def get_webpage_text(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.get_text()
    except:
        return ""

## Main function to analyze PDF

In [5]:
def analyze_pdf_licenses(pdf_path, prompt):
    results = []
    
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Extract text and URLs from page
            text = page.extract_text()
            urls = extract_urls(text)
            
            # Analyze each URL
            page_licenses = []
            for url in urls:
                webpage_text = get_webpage_text(url)
                if webpage_text:
                    license = prompt(webpage_text)
                    page_licenses.append({"url": url, "license": license})
            
            results.append(page_licenses)
    
    return results

## Example usage

In [6]:
def mock_prompt(text):
    # Replace this with actual LLM prompt function
    return "MIT License"

# Create a sample PDF file for testing
from fpdf import FPDF
pdf = FPDF()
pdf.add_page()
pdf.set_font("Arial", size=12)
pdf.cell(200, 10, txt="Sample text with URL: https://github.com/haesleinhuepf/napari-tools-menu", ln=1, align="L")
sample_pdf_path = "sample.pdf"
pdf.output(sample_pdf_path)

# Analyze the sample PDF
results = analyze_pdf_licenses(sample_pdf_path, mock_prompt)

# Save results
df = pd.DataFrame([(i, item["url"], item["license"]) 
                   for i, page in enumerate(results) 
                   for item in page],
                  columns=["Page", "URL", "License"])
df.to_csv("license_analysis.csv", index=False)

  pdf.set_font("Arial", size=12)
  pdf.cell(200, 10, txt="Sample text with URL: https://github.com/haesleinhuepf/napari-tools-menu", ln=1, align="L")
  pdf.cell(200, 10, txt="Sample text with URL: https://github.com/haesleinhuepf/napari-tools-menu", ln=1, align="L")
