# In [None]:
import os
import requests
from urllib.parse import urljoin, urlparse
from typing import List, Dict, Any
import time
from pathlib import Path
import json
from dataclasses import dataclass

from langchain.tools import BaseTool
from langchain.schema import BaseMessage, HumanMessage, AIMessage
from langchain_openai import ChatOpenAI
from langgraph.graph import StateGraph, END
from langgraph.graph.message import add_messages
from typing_extensions import TypedDict, Annotated
from bs4 import BeautifulSoup
import re

# State definition for the agent
class AgentState(TypedDict):
    """Shared state dictionary handed from node to node in the workflow."""
    # Message log; the annotation attaches the `add_messages` reducer.
    messages: Annotated[List[BaseMessage], add_messages]
    # Page URL the workflow validates and then scrapes.
    url: str
    # Scraped image records (dicts with 'url', 'filename', 'alt_text',
    # 'photographer' as produced by the scraper tool).
    images: List[Dict[str, str]]
    # Directory the downloader writes into.
    download_path: str
    # Progress marker; values set in this file are "started", "validated",
    # "scraped", "downloaded" and "error".
    status: str
    # Human-readable error messages accumulated by the nodes.
    errors: List[str]

@dataclass
class ImageInfo:
    """Record describing a single image.

    NOTE(review): not referenced by the other definitions visible in this
    file, which pass plain dicts with the same keys instead -- confirm
    whether it is still needed.
    """
    # Address of the image.
    url: str
    # File name associated with the image (presumably the local save name).
    filename: str
    # Alt text; defaults to an empty string.
    alt_text: str = ""
    # Photographer name; defaults to an empty string.
    photographer: str = ""

class PexelsScraperTool(BaseTool):
    """Tool that fetches a Pexels page and collects the images it references."""

    # Annotated so that Pydantic-based BaseTool versions accept these field
    # overrides (un-annotated overrides are rejected at class creation).
    name: str = "pexels_scraper"
    description: str = "Scrapes Pexels search page to extract image URLs and metadata"

    def _run(self, url: str) -> Dict[str, Any]:
        """Scrape Pexels page and extract image information.

        Args:
            url: Address of the page to fetch.

        Returns:
            A dict with 'success' (bool), 'images' (list of dicts holding
            'url', 'filename', 'alt_text' and 'photographer'), 'count' and a
            human-readable 'message'. Any exception raised while fetching or
            parsing is caught and reported as ``success=False`` with an
            empty image list.
        """
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }

            # Bounded wait so a stalled connection cannot hang the caller;
            # matches the timeout used by the downloader tool.
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            images = []

            # Preferred source: <article> elements whose class mentions "photo".
            image_containers = soup.find_all('article', class_=re.compile(r'.*photo.*'))

            if image_containers:
                for container in image_containers:
                    img_tag = container.find('img')
                    if not img_tag or not img_tag.get('src'):
                        continue
                    # An anchor whose href contains "/@" is taken as the
                    # photographer link; its text becomes the name.
                    photographer_elem = container.find('a', href=re.compile(r'/@.*'))
                    photographer = photographer_elem.text.strip() if photographer_elem else ""
                    images.append(self._image_record(url, img_tag, photographer))
            else:
                # Fallback: any <img> whose src mentions "pexels".
                for img in soup.find_all('img', src=re.compile(r'.*pexels.*')):
                    if img.get('src'):
                        images.append(self._image_record(url, img, ''))

            return {
                'success': True,
                'images': images,
                'count': len(images),
                'message': f"Found {len(images)} images"
            }

        except Exception as e:
            # Tool boundary: report the failure instead of raising.
            return {
                'success': False,
                'images': [],
                'count': 0,
                'message': f"Error scraping page: {str(e)}"
            }

    def _image_record(self, page_url: str, img_tag: Any, photographer: str) -> Dict[str, str]:
        """Build the metadata dict for one <img> tag found on ``page_url``."""
        # Resolve relative or protocol-relative src values against the page
        # URL so the downloader always receives an absolute address.
        src = urljoin(page_url, img_tag['src'])
        return {
            'url': src,
            'filename': self._generate_filename(src),
            'alt_text': img_tag.get('alt', ''),
            'photographer': photographer
        }

    def _generate_filename(self, url: str) -> str:
        """Generate a filename from the image URL.

        Uses the last path component when it ends in a known image
        extension; otherwise falls back to ``pexels_<id>.jpg`` built from a
        numeric path segment, or to a timestamp-based name.
        """
        parsed = urlparse(url)
        filename = os.path.basename(parsed.path)
        # Case-insensitive so names such as "PHOTO.JPG" are kept as they are.
        if not filename.lower().endswith(('.jpg', '.jpeg', '.png', '.webp')):
            # Extract ID from Pexels URL pattern
            match = re.search(r'/(\d+)/', url)
            if match:
                filename = f"pexels_{match.group(1)}.jpg"
            else:
                filename = f"pexels_image_{int(time.time())}.jpg"
        return filename

class ImageDownloaderTool(BaseTool):
    """Tool that saves a list of scraped images into a local directory."""

    # Annotated so that Pydantic-based BaseTool versions accept these field
    # overrides (un-annotated overrides are rejected at class creation).
    name: str = "image_downloader"
    description: str = "Downloads images from URLs to local directory"

    def _run(self, images: List[Dict], download_path: str) -> Dict[str, Any]:
        """Download images to specified directory.

        Args:
            images: Records with at least 'url' and 'filename'; optional
                'photographer' and 'alt_text' are copied into the metadata.
            download_path: Target directory, created if missing.

        Returns:
            A dict with 'success', 'downloaded' (count), 'failed' (count),
            'metadata_file' (path of the JSON summary, or None when the run
            aborted) and 'message'. Per-image failures are recorded and do
            not abort the run.
        """
        try:
            # Create download directory if it doesn't exist
            Path(download_path).mkdir(parents=True, exist_ok=True)

            downloaded = []
            failed = []

            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }

            for img_info in images:
                try:
                    # Clean up the URL - remove query parameters for better quality
                    clean_url = img_info['url'].split('?')[0]

                    # Keep only the final path component so a crafted
                    # filename cannot escape the download directory.
                    safe_name = os.path.basename(img_info['filename'])
                    if not safe_name:
                        raise ValueError("Image record has an empty filename")

                    response = requests.get(clean_url, headers=headers, timeout=30)
                    response.raise_for_status()

                    file_path = os.path.join(download_path, safe_name)

                    # Avoid overwriting existing files by appending _1, _2, ...
                    counter = 1
                    name, ext = os.path.splitext(file_path)
                    while os.path.exists(file_path):
                        file_path = f"{name}_{counter}{ext}"
                        counter += 1

                    with open(file_path, 'wb') as f:
                        f.write(response.content)

                    downloaded.append({
                        'filename': os.path.basename(file_path),
                        'path': file_path,
                        'photographer': img_info.get('photographer', ''),
                        'alt_text': img_info.get('alt_text', '')
                    })

                    # Small delay to be respectful to the server
                    time.sleep(0.5)

                except Exception as e:
                    # Best effort per image. `.get` keeps this handler from
                    # raising again when the record itself lacks a key,
                    # which would otherwise abort the whole batch.
                    failed.append({
                        'url': img_info.get('url', ''),
                        'filename': img_info.get('filename', ''),
                        'error': str(e)
                    })
                    continue

            # Save metadata
            metadata_file = os.path.join(download_path, 'download_metadata.json')
            metadata = {
                'downloaded': downloaded,
                'failed': failed,
                'total_downloaded': len(downloaded),
                'total_failed': len(failed),
                'download_date': time.strftime('%Y-%m-%d %H:%M:%S')
            }

            with open(metadata_file, 'w') as f:
                json.dump(metadata, f, indent=2)

            return {
                'success': True,
                'downloaded': len(downloaded),
                'failed': len(failed),
                'metadata_file': metadata_file,
                'message': f"Downloaded {len(downloaded)} images, {len(failed)} failed"
            }

        except Exception as e:
            # Tool boundary: report the failure instead of raising. The
            # result keeps the same keys as the success case.
            return {
                'success': False,
                'downloaded': 0,
                'failed': len(images),
                'metadata_file': None,
                'message': f"Error downloading images: {str(e)}"
            }

class PexelsDownloaderAgent:
    """Runs a validate -> scrape -> download -> finalize workflow for a Pexels page."""

    def __init__(self, openai_api_key: str = None):
        """Create the tools, the optional LLM client and the compiled workflow.

        Args:
            openai_api_key: When given, a ChatOpenAI client is created and
                stored on ``self.llm``; otherwise ``self.llm`` is None.
        """
        self.scraper_tool = PexelsScraperTool()
        self.downloader_tool = ImageDownloaderTool()

        # Initialize LLM (optional - for more intelligent decision making)
        if openai_api_key:
            self.llm = ChatOpenAI(api_key=openai_api_key, model="gpt-3.5-turbo")
        else:
            self.llm = None

        # Build the graph
        self.workflow = self._build_workflow()
        self.app = self.workflow.compile()

    def _build_workflow(self) -> StateGraph:
        """Build the LangGraph workflow as a fixed linear chain of four nodes."""
        workflow = StateGraph(AgentState)

        # Add nodes
        workflow.add_node("validate_url", self._validate_url)
        workflow.add_node("scrape_images", self._scrape_images)
        workflow.add_node("download_images", self._download_images)
        workflow.add_node("finalize", self._finalize)

        # Add edges. There is no conditional routing: after a failure the
        # later nodes still run and return early on status == "error".
        workflow.add_edge("validate_url", "scrape_images")
        workflow.add_edge("scrape_images", "download_images")
        workflow.add_edge("download_images", "finalize")
        workflow.add_edge("finalize", END)

        # Set entry point
        workflow.set_entry_point("validate_url")

        return workflow

    def _validate_url(self, state: AgentState) -> AgentState:
        """Validate that the URL is an https URL on pexels.com.

        The host is parsed out of the URL rather than prefix-matched, so
        look-alike hosts such as ``www.pexels.com.example.org`` or
        ``www.pexels.com@example.org`` are rejected.
        """
        url = state["url"]

        try:
            parsed = urlparse(url)
            host = (parsed.hostname or "").lower()
            is_pexels = parsed.scheme == "https" and (
                host == "pexels.com" or host.endswith(".pexels.com")
            )
        except ValueError:
            # urlparse/hostname raise ValueError on malformed netlocs.
            is_pexels = False

        if not is_pexels:
            state["status"] = "error"
            state["errors"].append("URL must be from pexels.com")
            return state

        state["status"] = "validated"
        state["messages"].append(AIMessage(content=f"Validated URL: {url}"))
        return state

    def _scrape_images(self, state: AgentState) -> AgentState:
        """Scrape images from the Pexels page and store them in the state."""
        if state["status"] == "error":
            return state

        result = self.scraper_tool._run(state["url"])

        if result["success"]:
            state["images"] = result["images"]
            state["status"] = "scraped"
            state["messages"].append(AIMessage(
                content=f"Successfully scraped {result['count']} images from the page"
            ))
        else:
            state["status"] = "error"
            state["errors"].append(result["message"])
            state["messages"].append(AIMessage(
                content=f"Failed to scrape images: {result['message']}"
            ))

        return state

    def _download_images(self, state: AgentState) -> AgentState:
        """Download the scraped images into ``state['download_path']``."""
        if state["status"] == "error":
            return state

        if not state["images"]:
            # Record why nothing was downloaded; otherwise the final summary
            # reports a failure with an empty error list.
            state["errors"].append("No images found to download")
            state["messages"].append(AIMessage(
                content="No images found to download"
            ))
            return state

        result = self.downloader_tool._run(state["images"], state["download_path"])

        if result["success"]:
            state["status"] = "downloaded"
            state["messages"].append(AIMessage(
                content=f"Downloaded {result['downloaded']} images to {state['download_path']}"
            ))
            if result["failed"] > 0:
                state["messages"].append(AIMessage(
                    content=f"Warning: {result['failed']} images failed to download"
                ))
        else:
            state["status"] = "error"
            state["errors"].append(result["message"])

        return state

    def _finalize(self, state: AgentState) -> AgentState:
        """Finalize the process and append a summary message to the state."""
        if state["status"] == "downloaded":
            summary = f"""
Download completed successfully!
- URL: {state['url']}
- Images found: {len(state['images'])}
- Download location: {state['download_path']}
- Status: {state['status']}
"""
        else:
            summary = f"""
Download failed:
- URL: {state['url']}
- Status: {state['status']}
- Errors: {', '.join(state['errors'])}
"""

        state["messages"].append(AIMessage(content=summary))
        return state

    def download_images(self, url: str, download_path: str = "./downloads") -> Dict[str, Any]:
        """Main method to download images from Pexels URL.

        Args:
            url: Pexels page to scrape.
            download_path: Directory the images are saved into.

        Returns:
            A dict with the final 'status', 'images_count' (number of images
            found by the scraper), 'download_path', the collected 'errors'
            and the text of every workflow message in 'messages'.
        """
        initial_state = {
            "messages": [HumanMessage(content=f"Download images from: {url}")],
            "url": url,
            "images": [],
            "download_path": download_path,
            "status": "started",
            "errors": []
        }

        # Run the workflow
        final_state = self.app.invoke(initial_state)

        return {
            "status": final_state["status"],
            "images_count": len(final_state["images"]),
            "download_path": final_state["download_path"],
            "errors": final_state["errors"],
            "messages": [msg.content for msg in final_state["messages"]]
        }

# Usage example
def main():
    """Run a sample download against a Pexels search page and print a report."""
    downloader = PexelsDownloaderAgent()

    # Sample inputs for the demonstration run.
    search_url = "https://www.pexels.com/search/astrophotography/"
    target_dir = "./astrophotography_images"

    print("Starting Pexels Image Download Agent...")
    outcome = downloader.download_images(search_url, target_dir)

    # Report header framed by two identical rules.
    rule = "=" * 50
    print("\n" + rule)
    print("DOWNLOAD RESULTS")
    print(rule)

    print(f"Status: {outcome['status']}")
    print(f"Images found: {outcome['images_count']}")
    print(f"Download path: {outcome['download_path']}")

    problems = outcome['errors']
    if problems:
        print(f"Errors: {problems}")

    print("\nAgent Messages:")
    for note in outcome['messages']:
        print(f"- {note}")

# Run the usage example only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()