In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import re
import os
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin, urlparse
import time
from datetime import datetime
import logging

# Configuration
INPUT_CSV = "new-websites.csv"  # rows are read by column name: domain, privacy_policy_url
OUTPUT_DIR = "scraped-policies-2"
LOG_FILE = "scraping.log"
RESULTS_JSON = "scraping_results.json"
MAX_RETRIES = 3
TIMEOUT = 15
MAX_THREADS = 3  # Reduced for better stability
MIN_CONTENT_LENGTH = 500  # Minimum chars for valid policy
DELAY_BETWEEN_REQUESTS = 1  # Seconds

# Setup logging: every record goes both to LOG_FILE and to the console.
# The file handler is opened as UTF-8 explicitly because the progress
# messages contain non-ASCII symbols; with the platform default encoding
# (e.g. cp1252 on Windows) those records cannot be written.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE, encoding='utf-8'),
        logging.StreamHandler(),
    ]
)

# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Rotating user agents for better success rates
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
]

def get_headers(index=0):
    """Build browser-like request headers, choosing the User-Agent by *index*.

    The index wraps around ``USER_AGENTS``, so callers can pass any integer
    (for example a row counter) and still get a valid entry.
    """
    chosen_agent = USER_AGENTS[index % len(USER_AGENTS)]

    # User-Agent goes in first so the key order matches what was sent before.
    headers = {"User-Agent": chosen_agent}
    headers.update({
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    })
    return headers

def clean_text(text):
    """Collapse whitespace and drop characters outside an allow-list.

    Every run of whitespace (spaces, tabs, newlines) becomes one space and
    the ends are trimmed. After that, any character that is not a word
    character, whitespace, or one of the punctuation symbols listed in the
    pattern is deleted. Deletion happens after the collapse, so removing a
    symbol that sat between two spaces leaves both spaces in place.
    """
    disallowed = re.compile(
        r'[^\w\s\.\,\;\:\!\?\-\(\)\[\]\{\}\"\'\/\@\#\$\%\&\*\+\=\<\>\|\~\`]'
    )
    single_spaced = re.sub(r'\s+', ' ', text).strip()
    return disallowed.sub('', single_spaced)

def extract_gdpr_keywords(text):
    """Return the GDPR-related keywords that occur in *text*.

    Matching is case-insensitive. A keyword only counts when it starts at a
    word boundary, so short entries such as ``'dpo'`` are not reported for
    unrelated words that merely contain them (e.g. "endpoint"). The end of
    the keyword is left open so plural forms ("data subjects") still match.

    Returns:
        The matching keywords, in the order they appear in the keyword list
        (not in order of appearance in the text). Empty list if none match.
    """
    gdpr_keywords = [
        'gdpr', 'general data protection regulation', 'data protection',
        'personal data', 'data subject', 'data controller', 'data processor',
        'consent', 'legitimate interest', 'data breach', 'privacy rights',
        'right to erasure', 'right to rectification', 'data portability',
        'privacy by design', 'data protection officer', 'dpo'
    ]

    text_lower = text.lower()
    return [
        kw for kw in gdpr_keywords
        # (?<!\w): the keyword must not be glued to a preceding word character.
        if re.search(r'(?<!\w)' + re.escape(kw), text_lower)
    ]

def is_valid_policy_content(text):
    """Decide whether *text* looks like a privacy policy.

    Three checks run in order and the first failure wins: minimum length
    (``MIN_CONTENT_LENGTH``), at least three GDPR keywords, and at least
    four of six generic policy words present.

    Returns:
        A ``(is_valid, reason)`` tuple; *reason* is a short human-readable
        explanation in both the passing and the failing case.
    """
    if len(text) < MIN_CONTENT_LENGTH:
        return False, "Content too short"

    # GDPR-specific vocabulary must be reasonably well represented.
    if len(extract_gdpr_keywords(text)) < 3:
        return False, "Insufficient GDPR-related content"

    # Generic privacy-policy vocabulary, matched as plain substrings.
    lowered = text.lower()
    indicator_hits = 0
    for word in ('privacy', 'data', 'information', 'personal', 'collect', 'process'):
        if word in lowered:
            indicator_hits += 1

    if indicator_hits >= 4:
        return True, "Valid policy content"
    return False, "Doesn't appear to be privacy policy content"

# Tag names whose whole subtree is discarded before text extraction.
_UNWANTED_TAGS = [
    'script', 'style', 'nav', 'footer', 'header', 'head',
    'iframe', 'button', 'form', 'input', 'select', 'textarea',
    'noscript', 'aside', 'menu',
]

# Class-based selectors for ad and overlay containers. These must go through
# CSS matching: handed to find_all() they are compared against tag names and
# never match anything.
_UNWANTED_CSS = '.advertisement, .ads, .cookie-banner, .popup'

# Candidate containers for the policy body, tried in this order.
_CONTENT_SELECTORS = [
    'main', 'article', '[role="main"]',
    'div.content', 'div.policy', 'div.privacy', 'div.datenschutz',
    '.privacy-policy', '.data-protection', '.legal-content',
    '#privacy', '#policy', '#datenschutz', '#data-protection',
    '.policy-content', '.legal-text', '.privacy-content',
]

# Elements treated as paragraph-like units when grouping text fragments.
# Inline wrappers such as <span> are deliberately absent so their text stays
# with the surrounding paragraph.
_TEXT_BLOCK_TAGS = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                    'li', 'section', 'div', 'td']


def _strip_unwanted(soup):
    """Remove non-content elements (scripts, navigation, ads, ...) in place."""
    for element in soup(_UNWANTED_TAGS):
        element.decompose()
    for element in soup.select(_UNWANTED_CSS):
        element.decompose()


def _select_main_content(soup, domain):
    """Return the element most likely to hold the policy text."""
    for selector in _CONTENT_SELECTORS:
        candidate = soup.select_one(selector)
        if candidate is not None and len(candidate.get_text(strip=True)) > MIN_CONTENT_LENGTH:
            logging.info(f"{domain}: Found content using selector: {selector}")
            return candidate

    # No selector produced a large enough element (a short match must not
    # win here): fall back to the div holding the most text, then the body.
    all_divs = soup.find_all('div')
    if all_divs:
        largest = max(all_divs, key=lambda div: len(div.get_text(strip=True)))
        if len(largest.get_text(strip=True)) >= MIN_CONTENT_LENGTH:
            return largest
    return soup.body or soup


def _extract_text(main_content):
    """Return the text of *main_content* with every text node used once."""
    # Group each text node under its nearest paragraph-like ancestor.
    # Collecting get_text() from every nested element instead would repeat
    # the same text once per ancestor level. Keys are id() values because
    # Tag equality compares markup, so identical-looking tags would merge.
    groups = {}
    for node in main_content.strings:
        fragment = node.strip()
        if not fragment:
            continue
        owner = node.find_parent(_TEXT_BLOCK_TAGS) or main_content
        groups.setdefault(id(owner), []).append(fragment)

    # Fragments are joined with a space so inline markup (<a>, <b>, ...)
    # does not glue neighbouring words together.
    paragraphs = (' '.join(parts) for parts in groups.values())
    meaningful_texts = [p for p in paragraphs if len(p) > 20]  # drop short labels

    if not meaningful_texts:
        # Fallback to all text
        return main_content.get_text(separator='\n\n', strip=True)
    return '\n\n'.join(meaningful_texts)


def scrape_policy(url, domain, attempt_index=0):
    """Download a privacy policy page and return its cleaned text.

    Up to ``MAX_RETRIES`` attempts are made. Retries are delayed by
    ``DELAY_BETWEEN_REQUESTS * (attempt + 1)`` seconds and each attempt uses
    the next User-Agent from the rotation. HTTP 403/404/429 responses stop
    the retry loop immediately.

    Args:
        url: Address of the policy page.
        domain: Label used in log messages and stored in the metadata.
        attempt_index: Starting offset into the User-Agent rotation.

    Returns:
        ``(cleaned_text, metadata)`` on success, where *metadata* is a dict
        describing the fetch; ``(None, error_message)`` on failure.
    """
    for attempt in range(MAX_RETRIES):
        # Pick the headers per attempt so a retry presents a different
        # User-Agent than the request that just failed.
        headers = get_headers(attempt_index + attempt)
        try:
            logging.info(f"Attempting to scrape {domain} (attempt {attempt + 1})")

            # Adding delay between requests
            if attempt > 0:
                time.sleep(DELAY_BETWEEN_REQUESTS * (attempt + 1))

            response = requests.get(url, headers=headers, timeout=TIMEOUT, allow_redirects=True)
            response.raise_for_status()

            # Checking content type
            content_type = response.headers.get('content-type', '').lower()
            if 'text/html' not in content_type:
                logging.warning(f"{domain}: Non-HTML content type: {content_type}")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')
            _strip_unwanted(soup)
            main_content = _select_main_content(soup, domain)
            cleaned_text = clean_text(_extract_text(main_content))

            # Validate content quality
            is_valid, reason = is_valid_policy_content(cleaned_text)
            if not is_valid:
                logging.warning(f"{domain}: Content validation failed: {reason}")
                if attempt < MAX_RETRIES - 1:
                    continue
                return None, f"Content validation failed: {reason}"

            # Extract metadata
            metadata = {
                'url': url,
                'domain': domain,
                'scraped_at': datetime.now().isoformat(),
                'content_length': len(cleaned_text),
                'gdpr_keywords_found': extract_gdpr_keywords(cleaned_text),
                'final_url': response.url,  # In case of redirects
                'status_code': response.status_code
            }

            return cleaned_text, metadata

        except requests.exceptions.Timeout:
            logging.warning(f"{domain}: Timeout on attempt {attempt + 1}")
        except requests.exceptions.ConnectionError:
            logging.warning(f"{domain}: Connection error on attempt {attempt + 1}")
        except requests.exceptions.HTTPError as e:
            logging.warning(f"{domain}: HTTP error {e.response.status_code} on attempt {attempt + 1}")
            if e.response.status_code in [403, 404, 429]:
                break  # Don't retry for these errors
        except Exception as e:
            # Last-resort boundary: one malformed page must not kill the worker.
            logging.error(f"{domain}: Unexpected error on attempt {attempt + 1}: {str(e)}")

    return None, "All attempts failed"

def process_row(row, index):
    """Scrape one CSV row and persist the outcome.

    On success the policy text and a JSON metadata file are written to
    ``OUTPUT_DIR`` (dots in the domain become underscores in the file
    names). Either way a summary dict with ``domain``, ``status``,
    ``content_length`` and ``gdpr_keywords`` is returned; success adds
    ``file_path`` and failure adds ``error``.
    """
    domain, url = row['domain'], row['privacy_policy_url']

    # Stagger the start of each job by its position within a thread batch.
    time.sleep(DELAY_BETWEEN_REQUESTS * (index % MAX_THREADS))

    policy_text, detail = scrape_policy(url, domain, index)

    if not policy_text:
        # On failure the second element is the error message.
        logging.error(f"{domain}: Failed - {detail}")
        return {
            'domain': domain,
            'status': 'Failed',
            'error': detail,
            'content_length': 0,
            'gdpr_keywords': 0
        }

    # On success the second element is the metadata dict.
    output_path = os.path.join(OUTPUT_DIR, f"{domain.replace('.', '_')}_policy.txt")
    with open(output_path, 'w', encoding='utf-8') as text_file:
        text_file.write(policy_text)

    metadata_path = os.path.join(OUTPUT_DIR, f"{domain.replace('.', '_')}_metadata.json")
    with open(metadata_path, 'w', encoding='utf-8') as meta_file:
        json.dump(detail, meta_file, indent=2)

    return {
        'domain': domain,
        'status': 'Success',
        'content_length': len(policy_text),
        'gdpr_keywords': len(detail['gdpr_keywords_found']),
        'file_path': output_path
    }

def main():
    """Scrape every site listed in ``INPUT_CSV`` and write a JSON summary.

    Rows are processed concurrently by ``MAX_THREADS`` workers. A row whose
    worker raises is recorded as failed rather than aborting the run, so the
    summary in ``RESULTS_JSON`` is always written. Returns early (after
    logging an error) if the input file does not exist.
    """
    logging.info("Starting GDPR policy scraping...")

    # Read CSV
    try:
        with open(INPUT_CSV, mode='r', encoding='utf-8') as file:
            rows = list(csv.DictReader(file))
    except FileNotFoundError:
        logging.error(f"Input file {INPUT_CSV} not found!")
        return

    total = len(rows)
    logging.info(f"Found {total} websites to scrape")

    # Parallel scraping with results tracking
    results = []
    successes = 0

    with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
        # Remember which row each future belongs to so a crashed worker can
        # still be reported under its domain.
        future_to_row = {
            executor.submit(process_row, row, i): row
            for i, row in enumerate(rows)
        }

        for future in as_completed(future_to_row):
            try:
                result = future.result()
            except Exception as exc:
                # Top-level boundary: e.g. a missing CSV column or a file
                # write error in one row must not discard all other results.
                result = {
                    'domain': future_to_row[future].get('domain') or 'unknown',
                    'status': 'Failed',
                    'error': f"Unhandled error: {exc}",
                    'content_length': 0,
                    'gdpr_keywords': 0
                }
            results.append(result)

            if result['status'] == 'Success':
                successes += 1
                logging.info(f"✓ {result['domain']}: {result['content_length']} chars, "
                             f"{result['gdpr_keywords']} GDPR keywords")
            else:
                logging.error(f"✗ {result['domain']}: {result.get('error', 'Unknown error')}")

    # A CSV with a header but no data rows yields total == 0; avoid dividing by it.
    success_rate = successes / total * 100 if total else 0.0

    # Save results
    summary = {
        'total_websites': total,
        'successful_scrapes': successes,
        'failed_scrapes': total - successes,
        'success_rate': f"{success_rate:.1f}%",
        'scraped_at': datetime.now().isoformat(),
        'results': results
    }

    with open(RESULTS_JSON, 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2)

    logging.info(f"\n{'='*50}")
    logging.info("SCRAPING COMPLETE")
    logging.info(f"{'='*50}")
    logging.info(f"Total websites: {total}")
    logging.info(f"Successful: {successes}")
    logging.info(f"Failed: {total - successes}")
    logging.info(f"Success rate: {success_rate:.1f}%")
    logging.info(f"Results saved to: {RESULTS_JSON}")
    logging.info(f"Logs saved to: {LOG_FILE}")

# Start the scraper only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

2025-05-28 19:37:37,195 - INFO - Starting GDPR policy scraping...
2025-05-28 19:37:37,199 - INFO - Found 160 websites to scrape
2025-05-28 19:37:37,202 - INFO - Attempting to scrape google.de (attempt 1)
2025-05-28 19:37:37,760 - INFO - Attempting to scrape facebook.com (attempt 1)
2025-05-28 19:37:37,760 - INFO - ✓ google.de: 580612 chars, 4 GDPR keywords
2025-05-28 19:37:37,955 - INFO - Attempting to scrape facebook.com (attempt 2)
2025-05-28 19:37:38,208 - INFO - Attempting to scrape youtube.com (attempt 1)
2025-05-28 19:37:38,434 - INFO - Attempting to scrape youtube.com (attempt 2)
2025-05-28 19:37:39,207 - INFO - Attempting to scrape amazon.de (attempt 1)
2025-05-28 19:37:39,717 - INFO - amazon.de: Found content using selector: article
2025-05-28 19:37:39,741 - INFO - Attempting to scrape amazon.de (attempt 2)
2025-05-28 19:37:40,071 - INFO - Attempting to scrape facebook.com (attempt 3)
2025-05-28 19:37:40,661 - INFO - Attempting to scrape youtube.com (attempt 3)
2025-05-28 19:3