# PDF License Analyzer
This notebook extracts URLs from PDF pages and analyzes their licenses.

## Install and import required libraries

In [1]:
!pip install pdfplumber requests beautifulsoup4 pandas

Collecting pdfplumber


  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)


Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)


Downloading pdfplumber-0.11.5-py3-none-any.whl (59 kB)
Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/5.6 MB[0m [31m?[0m eta [36m-:--:--[0m

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m132.1 MB/s[0m eta [36m0:00:00[0m
[?25h

Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.9 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m152.8 MB/s[0m eta [36m0:00:00[0m
[?25h

Installing collected packages: pypdfium2, pdfminer.six, pdfplumber


Successfully installed pdfminer.six-20231228 pdfplumber-0.11.5 pypdfium2-4.30.1


In [2]:
import pdfplumber
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

## Function to extract URLs from text

In [3]:
def extract_urls(text):
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    return re.findall(url_pattern, text)

## Function to extract text from webpage

In [4]:
def get_webpage_text(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.get_text()
    except:
        return ""

## Main function to analyze PDF

In [5]:
def analyze_pdf_licenses(pdf_path, prompt):
    results = []
    
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Extract text and URLs from page
            text = page.extract_text()
            urls = extract_urls(text)
            
            # Analyze each URL
            page_licenses = []
            for url in urls:
                webpage_text = get_webpage_text(url)
                if webpage_text:
                    license = prompt(webpage_text)
                    page_licenses.append({"url": url, "license": license})
            
            results.append(page_licenses)
    
    return results

## Example usage

In [6]:
def mock_prompt(text):
    # Replace this with actual LLM prompt function
    return "MIT License"

# Example analysis
results = analyze_pdf_licenses("example.pdf", mock_prompt)

# Save results
df = pd.DataFrame([(i, item["url"], item["license"]) 
                   for i, page in enumerate(results) 
                   for item in page],
                  columns=["Page", "URL", "License"])
df.to_csv("license_analysis.csv", index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'example.pdf'