# PDF License Analyzer
This notebook extracts the URLs found on each page of a PDF, downloads the linked web pages, and determines the license each page declares.

## Import required libraries

In [1]:
import pdfplumber
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

ModuleNotFoundError: No module named 'pdfplumber'

## Function to extract URLs from text

In [None]:
def extract_urls(text):
    """Return every http(s) URL found in *text*, in order of appearance.

    Args:
        text: String to scan. ``None`` or an empty string is treated as
            containing no URLs.

    Returns:
        A list of URL strings; duplicates are kept. Sentence punctuation
        and unbalanced closing brackets captured at the end of a match are
        removed, so ``"(see https://example.com/a)."`` yields
        ``["https://example.com/a"]``.
    """
    if not text:
        return []

    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    # The pattern accepts '.', ',', ')' and similar characters anywhere in a
    # URL, so a match that sits at the end of a sentence or inside brackets
    # drags the surrounding punctuation along with it.
    trailing_punctuation = ".,;:!?'"
    closing_brackets = {")": "(", "]": "[", ">": "<"}

    urls = []
    for url in re.findall(url_pattern, text):
        while url:
            last = url[-1]
            opener = closing_brackets.get(last)
            if last in trailing_punctuation:
                url = url[:-1]
            elif opener is not None and url.count(last) > url.count(opener):
                # Only drop a closing bracket that has no partner inside the
                # URL; balanced ones (e.g. ".../MIT_(license)") belong to it.
                url = url[:-1]
            else:
                break
        urls.append(url)
    return urls

## Function to extract text from webpage

In [None]:
def get_webpage_text(url, timeout=10):
    """Download *url* and return the text content of the page.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server, passed to ``requests.get``.

    Returns:
        The text extracted from the response by BeautifulSoup, or ``""``
        when the page cannot be fetched, answers with an HTTP error status,
        or cannot be parsed.
    """
    try:
        # requests waits indefinitely unless a timeout is given, so a single
        # unresponsive server would otherwise stall the whole analysis.
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures so that error pages are not
        # handed on as if they were the page's real content.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.get_text()
    except Exception:
        # Fetching is deliberately best-effort: any failure yields "".
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit through, so a long run can still be interrupted.
        return ""

## Main function to analyze PDF

In [None]:
def analyze_pdf_licenses(pdf_path, prompt):
    """Find the URLs on each page of a PDF and classify each linked page.

    Args:
        pdf_path: Path of the PDF, passed to ``pdfplumber.open``.
        prompt: Callable that receives the text of a fetched web page and
            returns the value stored under ``"license"`` for that URL.

    Returns:
        A list with one entry per PDF page, in page order. Each entry is a
        list of ``{"url": ..., "license": ...}`` dicts, one for every URL
        on that page whose web page text could be retrieved. URLs whose
        fetch produced no text are left out.
    """
    results = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # A page without a text layer (e.g. a scanned image) may give
            # None instead of a string on some pdfplumber versions; fall
            # back to "" so such a page simply contributes no URLs.
            text = page.extract_text() or ""
            urls = extract_urls(text)

            # Analyze each URL
            page_licenses = []
            for url in urls:
                webpage_text = get_webpage_text(url)
                if webpage_text:
                    # Named license_label to avoid shadowing the builtin
                    # `license` object.
                    license_label = prompt(webpage_text)
                    page_licenses.append({"url": url, "license": license_label})

            results.append(page_licenses)

    return results

## Example usage

In [None]:
def mock_prompt(text):
    """Placeholder prompt function that ignores *text*.

    Replace this with the actual LLM prompt function.
    """
    placeholder_license = "MIT License"
    return placeholder_license

# Example analysis
results = analyze_pdf_licenses("example.pdf", mock_prompt)

# Flatten the per-page results into one row per analysed URL. Pages are
# counted from 1 so the "Page" column matches the page numbers of the PDF
# rather than the 0-based list index.
rows = [
    (page_number, item["url"], item["license"])
    for page_number, page in enumerate(results, start=1)
    for item in page
]

# Save results
df = pd.DataFrame(rows, columns=["Page", "URL", "License"])
df.to_csv("license_analysis.csv", index=False)