In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import re
import os
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin, urlparse
import time
from datetime import datetime
import logging

# --- Run configuration -------------------------------------------------------
INPUT_CSV = "new-websites.csv"          # input list read by main(); rows need 'domain' and 'privacy_policy_url'
OUTPUT_DIR = "scraped-policies-2"       # per-domain policy text and metadata files are written here
LOG_FILE = "scraping.log"
RESULTS_JSON = "scraping_results.json"  # run summary written by main()
MAX_RETRIES = 3                         # attempts per URL in scrape_policy()
TIMEOUT = 15                            # passed to requests.get(timeout=...)
MAX_THREADS = 3                         # ThreadPoolExecutor worker count
MIN_CONTENT_LENGTH = 500                # minimum number of characters for usable content
DELAY_BETWEEN_REQUESTS = 1              # base delay handed to time.sleep()

# Log to both the log file and the console. Note that basicConfig() does
# nothing when the root logger already has handlers (e.g. on a cell re-run).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit UTF-8: the summary lines contain non-ASCII characters
        # ("✓", "✗") that the platform default encoding may not support.
        logging.FileHandler(LOG_FILE, encoding='utf-8'),
        logging.StreamHandler()
    ]
)

os.makedirs(OUTPUT_DIR, exist_ok=True)

# User-Agent strings that get_headers() cycles through by index.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
]

def get_headers(index=0):
    """Build browser-like request headers, choosing the User-Agent by index.

    The index wraps around USER_AGENTS, so any integer selects a valid entry.
    """
    chosen_agent = USER_AGENTS[index % len(USER_AGENTS)]

    request_headers = {"User-Agent": chosen_agent}
    request_headers.update({
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    })
    return request_headers

def clean_text(text):
    """Strip unsupported characters and normalise whitespace.

    Keeps word characters, whitespace and common ASCII punctuation; every run
    of whitespace (including newlines) becomes a single space and the result
    has no leading or trailing whitespace.
    """
    # Remove disallowed characters first: doing it after the whitespace pass
    # would leave double/leading/trailing spaces where a symbol used to be.
    text = re.sub(r'[^\w\s\.\,\;\:\!\?\-\(\)\[\]\{\}\"\'\/\@\#\$\%\&\*\+\=\<\>\|\~\`]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

def extract_gdpr_keywords(text):
    """Return the GDPR-related keywords that occur in ``text``.

    Matching is case-insensitive. A keyword only counts when it starts at a
    word boundary, so 'dpo' is not reported for words such as "endpoint";
    trailing characters are allowed, so plurals ("data subjects") still match.
    The result follows the order of the keyword list, without duplicates.
    """
    gdpr_keywords = [
        'gdpr', 'general data protection regulation', 'data protection',
        'personal data', 'data subject', 'data controller', 'data processor',
        'consent', 'legitimate interest', 'data breach', 'privacy rights',
        'right to erasure', 'right to rectification', 'data portability',
        'privacy by design', 'data protection officer', 'dpo'
    ]

    text_lower = text.lower()
    return [
        kw for kw in gdpr_keywords
        # (?<!\w) rejects matches that begin in the middle of a word.
        if re.search(r'(?<!\w)' + re.escape(kw), text_lower)
    ]

def is_valid_policy_content(text):
    """Decide whether ``text`` looks like a privacy policy.

    Returns a ``(is_valid, reason)`` tuple. The text must be at least
    MIN_CONTENT_LENGTH characters long, contain at least three GDPR keywords
    and mention at least four of the generic policy indicator words.
    """
    if len(text) < MIN_CONTENT_LENGTH:
        return False, "Content too short"

    if len(extract_gdpr_keywords(text)) < 3:
        return False, "Insufficient GDPR-related content"

    # Count how many generic policy words appear anywhere in the text.
    lowered = text.lower()
    indicator_hits = 0
    for word in ('privacy', 'data', 'information', 'personal', 'collect', 'process'):
        if word in lowered:
            indicator_hits += 1

    if indicator_hits >= 4:
        return True, "Valid policy content"
    return False, "Doesn't appear to be privacy policy content"

# Tag names and CSS selectors of page chrome that is removed before extraction.
_NOISE_TAGS = ['script', 'style', 'nav', 'footer', 'header', 'head',
               'iframe', 'button', 'form', 'input', 'select', 'textarea',
               'noscript', 'aside', 'menu']
_NOISE_SELECTORS = '.advertisement, .ads, .cookie-banner, .popup'

# Candidate containers for the policy body, tried in this order.
_CONTENT_SELECTORS = [
    'main', 'article', '[role="main"]',
    'div.content', 'div.policy', 'div.privacy', 'div.datenschutz',
    '.privacy-policy', '.data-protection', '.legal-content',
    '#privacy', '#policy', '#datenschutz', '#data-protection',
    '.policy-content', '.legal-text', '.privacy-content'
]

# Block-level tags; every piece of text is attributed to its nearest such ancestor.
_TEXT_BLOCK_TAGS = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'section', 'div', 'td']


def _strip_noise(soup):
    """Remove scripts, navigation, forms, ads and similar elements in place."""
    for element in soup.find_all(_NOISE_TAGS):
        element.decompose()
    # Class-based rules are CSS selectors and need select(); find_all() would
    # treat '.ads' as a tag name and never match anything.
    for element in soup.select(_NOISE_SELECTORS):
        element.decompose()


def _find_main_content(soup, domain):
    """Return the element most likely to contain the policy text."""
    for selector in _CONTENT_SELECTORS:
        candidate = soup.select_one(selector)
        # Only accept a match with enough text; a short match must not leak
        # out of the loop and suppress the fallback below.
        if candidate is not None and len(candidate.get_text(strip=True)) > MIN_CONTENT_LENGTH:
            logging.info(f"{domain}: Found content using selector: {selector}")
            return candidate

    # Fallback: the div with the most text, unless even that one is too short.
    all_divs = soup.find_all('div')
    if all_divs:
        largest = max(all_divs, key=lambda div: len(div.get_text(strip=True)))
        if len(largest.get_text(strip=True)) >= MIN_CONTENT_LENGTH:
            return largest
    return soup.body or soup


def _extract_text_blocks(root, min_length=20):
    """Return the text blocks under ``root`` that are longer than ``min_length``.

    Each text node is assigned to its nearest block-level ancestor, so text is
    emitted exactly once even when containers are nested, and the fragments of
    one block are joined with spaces so words are not glued together.
    """
    from bs4 import Comment

    fragments_by_owner = {}  # keyed by id(): hashing a Tag serialises its markup
    for node in root.find_all(string=True):
        if isinstance(node, Comment):
            continue
        fragment = node.strip()
        if not fragment:
            continue
        owner = node.find_parent(_TEXT_BLOCK_TAGS)
        fragments_by_owner.setdefault(id(owner), []).append(fragment)

    # Dict order is first-seen order, i.e. document order of each block's first text.
    blocks = (' '.join(fragments) for fragments in fragments_by_owner.values())
    return [block for block in blocks if len(block) > min_length]


def scrape_policy(url, domain, attempt_index=0):
    """Fetch a privacy-policy page and return its cleaned text plus metadata.

    Args:
        url: Address of the policy page.
        domain: Label used in log messages and stored in the metadata.
        attempt_index: Offset into the User-Agent rotation; every retry moves
            one step further so repeated attempts send a different header.

    Returns:
        ``(cleaned_text, metadata)`` on success, where ``metadata`` is a dict
        with the keys ``url``, ``domain``, ``scraped_at``, ``content_length``,
        ``gdpr_keywords_found``, ``final_url`` and ``status_code``;
        ``(None, error_message)`` when no attempt produced valid content.
    """
    last_error = None

    for attempt in range(MAX_RETRIES):
        try:
            logging.info(f"Attempting to scrape {domain} (attempt {attempt + 1})")

            # Back off a little longer before each retry.
            if attempt > 0:
                time.sleep(DELAY_BETWEEN_REQUESTS * (attempt + 1))

            headers = get_headers(attempt_index + attempt)
            response = requests.get(url, headers=headers, timeout=TIMEOUT, allow_redirects=True)
            response.raise_for_status()

            content_type = response.headers.get('content-type', '').lower()
            if 'text/html' not in content_type:
                # Fetching the same URL again will not change its content type.
                logging.warning(f"{domain}: Non-HTML content type: {content_type}")
                return None, f"Non-HTML content type: {content_type}"

            # Without a declared charset requests assumes ISO-8859-1 for text
            # responses, which garbles UTF-8 pages; detect the encoding instead.
            if 'charset' not in content_type:
                response.encoding = response.apparent_encoding

            soup = BeautifulSoup(response.text, 'html.parser')
            _strip_noise(soup)
            main_content = _find_main_content(soup, domain)

            meaningful_texts = _extract_text_blocks(main_content)
            if meaningful_texts:
                text = '\n\n'.join(meaningful_texts)
            else:
                # Fallback to all text
                text = main_content.get_text(separator='\n\n', strip=True)

            cleaned_text = clean_text(text)

            # Validate content quality
            is_valid, reason = is_valid_policy_content(cleaned_text)
            if not is_valid:
                logging.warning(f"{domain}: Content validation failed: {reason}")
                if attempt < MAX_RETRIES - 1:
                    last_error = f"Content validation failed: {reason}"
                    continue
                return None, f"Content validation failed: {reason}"

            metadata = {
                'url': url,
                'domain': domain,
                'scraped_at': datetime.now().isoformat(),
                'content_length': len(cleaned_text),
                'gdpr_keywords_found': extract_gdpr_keywords(cleaned_text),
                'final_url': response.url,  # In case of redirects
                'status_code': response.status_code
            }

            return cleaned_text, metadata

        except requests.exceptions.Timeout:
            logging.warning(f"{domain}: Timeout on attempt {attempt + 1}")
            last_error = "Timeout"
        except requests.exceptions.ConnectionError:
            logging.warning(f"{domain}: Connection error on attempt {attempt + 1}")
            last_error = "Connection error"
        except requests.exceptions.HTTPError as e:
            logging.warning(f"{domain}: HTTP error {e.response.status_code} on attempt {attempt + 1}")
            last_error = f"HTTP error {e.response.status_code}"
            if e.response.status_code in [403, 404, 429]:
                break  # Don't retry for these errors
        except Exception as e:
            logging.error(f"{domain}: Unexpected error on attempt {attempt + 1}: {str(e)}")
            last_error = f"Unexpected error: {e}"

    if last_error:
        return None, f"All attempts failed (last error: {last_error})"
    return None, "All attempts failed"

def process_row(row, index):
    """Scrape one CSV row and store the result on disk.

    Args:
        row: Mapping with the keys 'domain' and 'privacy_policy_url'.
        index: Position of the row; used to stagger start times and passed to
            scrape_policy() as the header rotation offset.

    Returns:
        A result dict with 'domain', 'status' ('Success' or 'Failed'),
        'content_length' and 'gdpr_keywords', plus 'file_path' on success or
        'error' on failure. Failures are reported in the dict, not raised.
    """
    def failure(message):
        logging.error(f"{domain}: Failed - {message}")
        return {
            'domain': domain,
            'status': 'Failed',
            'error': message,
            'content_length': 0,
            'gdpr_keywords': 0
        }

    # Tolerate missing or empty cells instead of raising KeyError/TypeError.
    domain = (row.get('domain') or '').strip()
    url = (row.get('privacy_policy_url') or '').strip()
    if not domain or not url:
        return failure("Missing domain or privacy_policy_url in input row")

    # Add delay between requests
    time.sleep(DELAY_BETWEEN_REQUESTS * (index % MAX_THREADS))

    policy_text, details = scrape_policy(url, domain, index)
    if not policy_text:
        return failure(details)

    metadata = details
    # Keep only characters that are safe in a file name so a value such as
    # "../x" or "a/b" cannot leave OUTPUT_DIR; "example.com" -> "example_com".
    safe_name = re.sub(r'[^\w\-]', '_', domain)
    output_path = os.path.join(OUTPUT_DIR, f"{safe_name}_policy.txt")
    metadata_path = os.path.join(OUTPUT_DIR, f"{safe_name}_metadata.json")

    try:
        # Save policy text
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(policy_text)

        # Save metadata
        with open(metadata_path, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=2)
    except OSError as exc:
        return failure(f"Could not write output files: {exc}")

    return {
        'domain': domain,
        'status': 'Success',
        'content_length': len(policy_text),
        'gdpr_keywords': len(metadata['gdpr_keywords_found']),
        'file_path': output_path
    }

def main():
    """Scrape every site listed in INPUT_CSV and write a JSON run summary.

    Rows are processed in parallel by process_row(). The summary (totals,
    success rate and the per-site results) is written to RESULTS_JSON.
    Returns None; a missing input file is logged and ends the run early.
    """
    logging.info("Starting GDPR policy scraping...")

    # Read CSV. 'utf-8-sig' also accepts plain UTF-8 but drops a leading BOM,
    # which would otherwise become part of the first column name.
    try:
        with open(INPUT_CSV, mode='r', encoding='utf-8-sig') as file:
            rows = list(csv.DictReader(file))
    except FileNotFoundError:
        logging.error(f"Input file {INPUT_CSV} not found!")
        return

    total = len(rows)
    logging.info(f"Found {total} websites to scrape")

    # Parallel scraping with results tracking
    results = []
    successes = 0

    with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
        future_to_row = {
            executor.submit(process_row, row, i): row
            for i, row in enumerate(rows)
        }

        for future in as_completed(future_to_row):
            try:
                result = future.result()
            except Exception as exc:
                # One failing worker must not abort the whole run; record it.
                result = {
                    'domain': future_to_row[future].get('domain'),
                    'status': 'Failed',
                    'error': f"Unhandled error: {exc}",
                    'content_length': 0,
                    'gdpr_keywords': 0
                }
            results.append(result)

            if result['status'] == 'Success':
                successes += 1
                logging.info(f"✓ {result['domain']}: {result['content_length']} chars, "
                           f"{result['gdpr_keywords']} GDPR keywords")
            else:
                logging.error(f"✗ {result['domain']}: {result.get('error', 'Unknown error')}")

    # An empty input file would otherwise divide by zero.
    success_rate = successes / total * 100 if total else 0.0

    # Save results
    summary = {
        'total_websites': total,
        'successful_scrapes': successes,
        'failed_scrapes': total - successes,
        'success_rate': f"{success_rate:.1f}%",
        'scraped_at': datetime.now().isoformat(),
        'results': results
    }

    with open(RESULTS_JSON, 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2)

    logging.info(f"\n{'='*50}")
    logging.info(f"SCRAPING COMPLETE")
    logging.info(f"{'='*50}")
    logging.info(f"Total websites: {total}")
    logging.info(f"Successful: {successes}")
    logging.info(f"Failed: {total - successes}")
    logging.info(f"Success rate: {success_rate:.1f}%")
    logging.info(f"Results saved to: {RESULTS_JSON}")
    logging.info(f"Logs saved to: {LOG_FILE}")

# Start the scraping run when this code is executed directly rather than imported.
if __name__ == "__main__":
    main()

2025-05-28 19:37:37,195 - INFO - Starting GDPR policy scraping...
2025-05-28 19:37:37,199 - INFO - Found 160 websites to scrape
2025-05-28 19:37:37,202 - INFO - Attempting to scrape google.de (attempt 1)
2025-05-28 19:37:37,760 - INFO - Attempting to scrape facebook.com (attempt 1)
2025-05-28 19:37:37,760 - INFO - ✓ google.de: 580612 chars, 4 GDPR keywords
2025-05-28 19:37:37,955 - INFO - Attempting to scrape facebook.com (attempt 2)
2025-05-28 19:37:38,208 - INFO - Attempting to scrape youtube.com (attempt 1)
2025-05-28 19:37:38,434 - INFO - Attempting to scrape youtube.com (attempt 2)
2025-05-28 19:37:39,207 - INFO - Attempting to scrape amazon.de (attempt 1)
2025-05-28 19:37:39,717 - INFO - amazon.de: Found content using selector: article
2025-05-28 19:37:39,741 - INFO - Attempting to scrape amazon.de (attempt 2)
2025-05-28 19:37:40,071 - INFO - Attempting to scrape facebook.com (attempt 3)
2025-05-28 19:37:40,661 - INFO - Attempting to scrape youtube.com (attempt 3)
2025-05-28 19:3

Another approach (the one above is also correct); below is a different approach.

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import re
import os
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin, urlparse
import time
from datetime import datetime
import logging
from collections import defaultdict

# --- Run configuration -------------------------------------------------------
INPUT_CSV = "new-websites.csv"          # input list of sites (presumably read by main(); not shown here)
OUTPUT_DIR = "scraped-policies-3"       # policy text, metadata and classification files are written here
LOG_FILE = "scraping.log"
RESULTS_JSON = "scraping_results.json"  # presumably the run summary file; confirm against main()
MAX_RETRIES = 3                         # attempts per URL in scrape_policy()
TIMEOUT = 15                            # passed to requests.get(timeout=...)
MAX_THREADS = 3
MIN_CONTENT_LENGTH = 500                # minimum number of characters for usable content
DELAY_BETWEEN_REQUESTS = 1              # base delay handed to time.sleep()

# Log to both the log file and the console. Note that basicConfig() does
# nothing when the root logger already has handlers, e.g. when an earlier
# cell in the same kernel has already configured logging.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit UTF-8 so non-ASCII log messages do not depend on the
        # platform default encoding.
        logging.FileHandler(LOG_FILE, encoding='utf-8'),
        logging.StreamHandler()
    ]
)

os.makedirs(OUTPUT_DIR, exist_ok=True)

# User-Agent strings that get_headers() cycles through by index.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15"
]

# Lookup table used by classify_gdpr_content() to assign sentences to GDPR topics.
# Each key is a topic label (also used as a file name by save_classified_content())
# and maps to two lists of lower-case search strings:
#   "keywords" - each one found in a sentence adds 1 to that topic's score
#   "phrases"  - each one found in a sentence adds 3 to that topic's score
# Matching is by plain substring on the lower-cased sentence. Some strings occur
# in both lists of a topic (or in several topics) and are then counted each time.
GDPR_ARTICLE_PATTERNS = {
    "Article_6_Lawful_Basis": {
        "keywords": [
            "lawful basis", "legal basis", "consent", "legitimate interest", "vital interests",
            "public task", "legal obligation", "contract", "contractual necessity",
            "processing is necessary", "lawful ground", "legal ground"
        ],
        "phrases": [
            "we process your data based on",
            "legal basis for processing",
            "we rely on the following legal bases",
            "processing is justified",
            "lawful basis under article 6"
        ]
    },
    "Article_7_Consent": {
        "keywords": [
            "consent", "withdraw consent", "consent management", "opt-in", "opt-out",
            "consent preferences", "consent settings", "consent banner", "cookie consent",
            "freely given", "specific consent", "informed consent", "unambiguous consent"
        ],
        "phrases": [
            "you have the right to withdraw consent",
            "consent can be withdrawn",
            "manage your consent",
            "consent preferences",
            "you can change your mind"
        ]
    },
    "Article_12_Transparent_Information": {
        "keywords": [
            "transparent", "clear information", "plain language", "understandable",
            "accessible format", "free of charge", "without delay", "transparent information"
        ],
        "phrases": [
            "we provide clear information",
            "transparent about how we use",
            "information provided free of charge",
            "in a clear and plain manner"
        ]
    },
    "Article_13_Information_Collection": {
        "keywords": [
            "data collection", "information we collect", "what data we collect",
            "types of data", "categories of data", "personal information collected",
            "data obtained", "information gathering"
        ],
        "phrases": [
            "information we collect from you",
            "data we collect includes",
            "types of personal data we collect",
            "we collect the following information"
        ]
    },
    "Article_14_Information_Third_Parties": {
        "keywords": [
            "third party", "external sources", "data from other sources",
            "information from partners", "third party data", "data brokers"
        ],
        "phrases": [
            "data obtained from third parties",
            "information from external sources",
            "data from other companies",
            "third party sources"
        ]
    },
    "Article_15_Right_of_Access": {
        "keywords": [
            "right of access", "access your data", "request your data", "data access",
            "view your information", "obtain a copy", "access request", "data portability"
        ],
        "phrases": [
            "you have the right to access",
            "request access to your personal data",
            "obtain a copy of your data",
            "access your personal information",
            "right to know what data we hold"
        ]
    },
    "Article_16_Right_to_Rectification": {
        "keywords": [
            "rectification", "correct your data", "update information", "amend data",
            "data correction", "inaccurate data", "incomplete data", "modify information"
        ],
        "phrases": [
            "right to rectification",
            "correct inaccurate data",
            "update your information",
            "have your data corrected",
            "amend personal data"
        ]
    },
    "Article_17_Right_to_Erasure": {
        "keywords": [
            "right to erasure", "right to be forgotten", "delete your data", "data deletion",
            "remove personal data", "erase information", "account deletion", "data removal"
        ],
        "phrases": [
            "right to be forgotten",
            "delete your personal data",
            "erase your information",
            "remove your data",
            "have your data deleted"
        ]
    },
    "Article_18_Right_to_Restriction": {
        "keywords": [
            "restriction of processing", "limit processing", "restrict data use",
            "suspend processing", "processing restriction", "data restriction"
        ],
        "phrases": [
            "restrict the processing",
            "limit how we use your data",
            "suspend data processing",
            "restriction of processing"
        ]
    },
    "Article_20_Data_Portability": {
        "keywords": [
            "data portability", "export your data", "transfer data", "data export",
            "portable format", "machine readable", "structured format", "data transfer"
        ],
        "phrases": [
            "right to data portability",
            "export your data",
            "transfer your data",
            "portable format",
            "machine-readable format"
        ]
    },
    "Article_21_Right_to_Object": {
        "keywords": [
            "right to object", "object to processing", "opt out", "object to use",
            "stop processing", "object to marketing", "direct marketing"
        ],
        "phrases": [
            "right to object",
            "object to the processing",
            "opt out of processing",
            "object to direct marketing"
        ]
    },
    "Article_25_Data_Protection_by_Design": {
        "keywords": [
            "privacy by design", "data protection by design", "privacy by default",
            "built-in privacy", "privacy safeguards", "technical measures",
            "organizational measures", "data minimization"
        ],
        "phrases": [
            "privacy by design",
            "data protection by design",
            "built-in privacy protections",
            "privacy by default"
        ]
    },
    "Article_32_Security_of_Processing": {
        "keywords": [
            "security measures", "data security", "technical safeguards", "encryption",
            "access controls", "security breach", "data breach", "cybersecurity",
            "secure processing", "security protocols", "data protection measures"
        ],
        "phrases": [
            "appropriate security measures",
            "protect your data",
            "security safeguards",
            "encryption and security",
            "secure data processing"
        ]
    },
    "Article_33_34_Data_Breach": {
        "keywords": [
            "data breach", "security breach", "breach notification", "incident response",
            "breach reporting", "security incident", "data incident", "unauthorized access"
        ],
        "phrases": [
            "in case of a data breach",
            "security breach notification",
            "data breach procedures",
            "incident response"
        ]
    },
    "Article_35_Data_Protection_Impact_Assessment": {
        "keywords": [
            "data protection impact assessment", "dpia", "privacy impact assessment",
            "impact assessment", "privacy assessment", "risk assessment"
        ],
        "phrases": [
            "data protection impact assessment",
            "privacy impact assessment",
            "conduct impact assessments"
        ]
    },
    "Article_37_Data_Protection_Officer": {
        "keywords": [
            "data protection officer", "dpo", "privacy officer", "data protection contact",
            "privacy contact", "data protection authority"
        ],
        "phrases": [
            "data protection officer",
            "contact our dpo",
            "privacy officer",
            "data protection contact"
        ]
    },
    "Retention_and_Storage": {
        "keywords": [
            "data retention", "retention period", "storage period", "how long we keep",
            "retention policy", "data storage", "delete after", "retention schedule"
        ],
        "phrases": [
            "how long we keep your data",
            "data retention period",
            "retain your information",
            "storage and retention"
        ]
    },
    "Third_Party_Sharing": {
        "keywords": [
            "data sharing", "third party sharing", "share with partners", "data recipients",
            "service providers", "processors", "vendors", "partners", "affiliates"
        ],
        "phrases": [
            "share your data with",
            "third party recipients",
            "data sharing practices",
            "service providers and partners"
        ]
    },
    "International_Transfers": {
        "keywords": [
            "international transfer", "cross-border transfer", "adequacy decision",
            "standard contractual clauses", "binding corporate rules", "third countries"
        ],
        "phrases": [
            "transfer data internationally",
            "cross-border data transfers",
            "international data transfers",
            "transfer to third countries"
        ]
    },
    "Cookies_and_Tracking": {
        "keywords": [
            "cookies", "tracking", "analytics", "advertising", "targeting",
            "web beacons", "pixels", "tracking technologies", "cookie policy"
        ],
        "phrases": [
            "use of cookies",
            "tracking technologies",
            "cookie preferences",
            "advertising cookies"
        ]
    }
}

def get_headers(index=0):
    """Build browser-like request headers, choosing the User-Agent by index.

    The index wraps around USER_AGENTS, so any integer selects a valid entry.
    """
    agent_position = index % len(USER_AGENTS)

    request_headers = {"User-Agent": USER_AGENTS[agent_position]}
    request_headers.update({
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    })
    return request_headers

def clean_text(text):
    """Strip unsupported characters and normalise whitespace.

    Keeps word characters, whitespace and common ASCII punctuation; every run
    of whitespace (including newlines) becomes a single space and the result
    has no leading or trailing whitespace.
    """
    # Remove disallowed characters first: doing it after the whitespace pass
    # would leave double/leading/trailing spaces where a symbol used to be.
    text = re.sub(r'[^\w\s\.\,\;\:\!\?\-\(\)\[\]\{\}\"\'\/\@\#\$\%\&\*\+\=\<\>\|\~\`]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

def classify_gdpr_content(text):
    """Assign the sentences of ``text`` to the topics in GDPR_ARTICLE_PATTERNS.

    The text is split on '.', '!' and '?'; fragments shorter than 20
    characters are ignored. For each topic a sentence scores 1 per matching
    keyword and 3 per matching phrase (case-insensitive substring match).

    Returns:
        ``(classifications, article_scores)`` as plain dicts.
        ``classifications`` maps a topic to a list of dicts with 'sentence',
        'sentence_index' and 'confidence_score'; a sentence is listed only
        under its top-scoring topic(s), ties included.
        ``article_scores`` maps a topic to the sum of all its sentence scores,
        including sentences where it was not the top topic.
    """
    classifications = defaultdict(list)
    article_scores = defaultdict(int)

    sentences = re.split(r'[.!?]+', text)

    for sentence_idx, raw_sentence in enumerate(sentences):
        sentence = raw_sentence.strip()
        if len(sentence) < 20:
            continue

        sentence_lower = sentence.lower()

        # Score of every topic that matched this sentence, in table order.
        sentence_scores = {}
        for article, patterns in GDPR_ARTICLE_PATTERNS.items():
            score = sum(1 for keyword in patterns["keywords"] if keyword in sentence_lower)
            score += sum(3 for phrase in patterns["phrases"] if phrase in sentence_lower)
            if score > 0:
                sentence_scores[article] = score
                article_scores[article] += score

        if not sentence_scores:
            continue

        # File the sentence under the best-scoring topic(s) only.
        top_score = max(sentence_scores.values())
        for article, score in sentence_scores.items():
            if score == top_score:
                classifications[article].append({
                    'sentence': sentence,
                    'sentence_index': sentence_idx,
                    'confidence_score': score
                })

    return dict(classifications), dict(article_scores)

def extract_gdpr_keywords(text):
    """Return the GDPR-related keywords that occur in ``text``.

    Matching is case-insensitive. A keyword only counts when it starts at a
    word boundary, so 'dpo' is not reported for words such as "endpoint";
    trailing characters are allowed, so plurals ("data subjects") still match.
    The result follows the order of the keyword list, without duplicates.
    """
    gdpr_keywords = [
        'gdpr', 'general data protection regulation', 'data protection',
        'personal data', 'data subject', 'data controller', 'data processor',
        'consent', 'legitimate interest', 'data breach', 'privacy rights',
        'right to erasure', 'right to rectification', 'data portability',
        'privacy by design', 'data protection officer', 'dpo'
    ]

    text_lower = text.lower()
    return [
        kw for kw in gdpr_keywords
        # (?<!\w) rejects matches that begin in the middle of a word.
        if re.search(r'(?<!\w)' + re.escape(kw), text_lower)
    ]

def is_valid_policy_content(text):
    """Decide whether ``text`` looks like a privacy policy.

    Returns a ``(is_valid, reason)`` tuple. The text must be at least
    MIN_CONTENT_LENGTH characters long, contain at least three GDPR keywords
    and mention at least four of the generic policy indicator words.
    """
    if len(text) < MIN_CONTENT_LENGTH:
        return False, "Content too short"

    if len(extract_gdpr_keywords(text)) < 3:
        return False, "Insufficient GDPR-related content"

    # Count how many generic policy words appear anywhere in the text.
    lowered = text.lower()
    indicator_hits = 0
    for word in ('privacy', 'data', 'information', 'personal', 'collect', 'process'):
        if word in lowered:
            indicator_hits += 1

    if indicator_hits >= 4:
        return True, "Valid policy content"
    return False, "Doesn't appear to be privacy policy content"

# Tag names and CSS selectors of page chrome that is removed before extraction.
_NOISE_TAGS = ['script', 'style', 'nav', 'footer', 'header', 'head',
               'iframe', 'button', 'form', 'input', 'select', 'textarea',
               'noscript', 'aside', 'menu']
_NOISE_SELECTORS = '.advertisement, .ads, .cookie-banner, .popup'

# Candidate containers for the policy body, tried in this order.
_CONTENT_SELECTORS = [
    'main', 'article', '[role="main"]',
    'div.content', 'div.policy', 'div.privacy', 'div.datenschutz',
    '.privacy-policy', '.data-protection', '.legal-content',
    '#privacy', '#policy', '#datenschutz', '#data-protection',
    '.policy-content', '.legal-text', '.privacy-content'
]

# Block-level tags; every piece of text is attributed to its nearest such ancestor.
_TEXT_BLOCK_TAGS = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'section', 'div', 'td']


def _strip_noise(soup):
    """Remove scripts, navigation, forms, ads and similar elements in place."""
    for element in soup.find_all(_NOISE_TAGS):
        element.decompose()
    # Class-based rules are CSS selectors and need select(); find_all() would
    # treat '.ads' as a tag name and never match anything.
    for element in soup.select(_NOISE_SELECTORS):
        element.decompose()


def _find_main_content(soup, domain):
    """Return the element most likely to contain the policy text."""
    for selector in _CONTENT_SELECTORS:
        candidate = soup.select_one(selector)
        # Only accept a match with enough text; a short match must not leak
        # out of the loop and suppress the fallback below.
        if candidate is not None and len(candidate.get_text(strip=True)) > MIN_CONTENT_LENGTH:
            logging.info(f"{domain}: Found content using selector: {selector}")
            return candidate

    # Fallback: the div with the most text, unless even that one is too short.
    all_divs = soup.find_all('div')
    if all_divs:
        largest = max(all_divs, key=lambda div: len(div.get_text(strip=True)))
        if len(largest.get_text(strip=True)) >= MIN_CONTENT_LENGTH:
            return largest
    return soup.body or soup


def _extract_text_blocks(root, min_length=20):
    """Return the text blocks under ``root`` that are longer than ``min_length``.

    Each text node is assigned to its nearest block-level ancestor, so text is
    emitted exactly once even when containers are nested, and the fragments of
    one block are joined with spaces so words are not glued together.
    """
    from bs4 import Comment

    fragments_by_owner = {}  # keyed by id(): hashing a Tag serialises its markup
    for node in root.find_all(string=True):
        if isinstance(node, Comment):
            continue
        fragment = node.strip()
        if not fragment:
            continue
        owner = node.find_parent(_TEXT_BLOCK_TAGS)
        fragments_by_owner.setdefault(id(owner), []).append(fragment)

    # Dict order is first-seen order, i.e. document order of each block's first text.
    blocks = (' '.join(fragments) for fragments in fragments_by_owner.values())
    return [block for block in blocks if len(block) > min_length]


def scrape_policy(url, domain, attempt_index=0):
    """Fetch a privacy-policy page, clean it and classify it by GDPR topic.

    Args:
        url: Address of the policy page.
        domain: Label used in log messages and stored in the metadata.
        attempt_index: Offset into the User-Agent rotation; every retry moves
            one step further so repeated attempts send a different header.

    Returns:
        ``(cleaned_text, metadata)`` on success, where ``metadata`` is a dict
        with the keys ``url``, ``domain``, ``scraped_at``, ``content_length``,
        ``gdpr_keywords_found``, ``final_url``, ``status_code``,
        ``gdpr_article_classifications``, ``article_scores`` and
        ``total_classified_sentences``;
        ``(None, error_message)`` when no attempt produced valid content.
    """
    last_error = None

    for attempt in range(MAX_RETRIES):
        try:
            logging.info(f"Attempting to scrape {domain} (attempt {attempt + 1})")

            # Back off a little longer before each retry.
            if attempt > 0:
                time.sleep(DELAY_BETWEEN_REQUESTS * (attempt + 1))

            headers = get_headers(attempt_index + attempt)
            response = requests.get(url, headers=headers, timeout=TIMEOUT, allow_redirects=True)
            response.raise_for_status()

            content_type = response.headers.get('content-type', '').lower()
            if 'text/html' not in content_type:
                # Fetching the same URL again will not change its content type.
                logging.warning(f"{domain}: Non-HTML content type: {content_type}")
                return None, f"Non-HTML content type: {content_type}"

            # Without a declared charset requests assumes ISO-8859-1 for text
            # responses, which garbles UTF-8 pages; detect the encoding instead.
            if 'charset' not in content_type:
                response.encoding = response.apparent_encoding

            soup = BeautifulSoup(response.text, 'html.parser')
            _strip_noise(soup)
            main_content = _find_main_content(soup, domain)

            meaningful_texts = _extract_text_blocks(main_content)
            if meaningful_texts:
                text = '\n\n'.join(meaningful_texts)
            else:
                # Fall back to all text of the chosen container.
                text = main_content.get_text(separator='\n\n', strip=True)

            cleaned_text = clean_text(text)

            is_valid, reason = is_valid_policy_content(cleaned_text)
            if not is_valid:
                logging.warning(f"{domain}: Content validation failed: {reason}")
                if attempt < MAX_RETRIES - 1:
                    last_error = f"Content validation failed: {reason}"
                    continue
                return None, f"Content validation failed: {reason}"

            gdpr_classifications, article_scores = classify_gdpr_content(cleaned_text)

            metadata = {
                'url': url,
                'domain': domain,
                'scraped_at': datetime.now().isoformat(),
                'content_length': len(cleaned_text),
                'gdpr_keywords_found': extract_gdpr_keywords(cleaned_text),
                'final_url': response.url,  # differs from url after redirects
                'status_code': response.status_code,
                'gdpr_article_classifications': gdpr_classifications,
                'article_scores': article_scores,
                'total_classified_sentences': sum(len(sentences) for sentences in gdpr_classifications.values())
            }

            return cleaned_text, metadata

        except requests.exceptions.Timeout:
            logging.warning(f"{domain}: Timeout on attempt {attempt + 1}")
            last_error = "Timeout"
        except requests.exceptions.ConnectionError:
            logging.warning(f"{domain}: Connection error on attempt {attempt + 1}")
            last_error = "Connection error"
        except requests.exceptions.HTTPError as e:
            logging.warning(f"{domain}: HTTP error {e.response.status_code} on attempt {attempt + 1}")
            last_error = f"HTTP error {e.response.status_code}"
            if e.response.status_code in [403, 404, 429]:
                break  # retrying will not help for these status codes
        except Exception as e:
            logging.error(f"{domain}: Unexpected error on attempt {attempt + 1}: {str(e)}")
            last_error = f"Unexpected error: {e}"

    if last_error:
        return None, f"All attempts failed (last error: {last_error})"
    return None, "All attempts failed"

def save_classified_content(domain, policy_text, metadata):
    """Write one domain's scrape results to files under OUTPUT_DIR.

    Files produced (``<base>`` is ``domain`` with dots replaced by underscores):

    * ``<base>_policy.txt`` - the policy text as given.
    * ``<base>_metadata.json`` - ``metadata`` dumped as indented JSON.
    * ``<base>_classifications/<article>.txt`` - one file per article that
      has at least one classified sentence.
    * ``<base>_classifications/classification_summary.txt`` - articles with
      a positive score, highest score first, followed by totals.

    Args:
        domain: Domain name; used for file naming and in the summary title.
        policy_text: Text written to the policy file.
        metadata: Dict read for the keys 'gdpr_article_classifications',
            'article_scores', 'total_classified_sentences' and
            'content_length'.
    """
    stem = domain.replace('.', '_')

    def _in_output(*parts):
        # Resolve a path relative to the shared output directory.
        return os.path.join(OUTPUT_DIR, *parts)

    def _heading(article_key):
        # "Article_6_Lawful_Basis" -> "Article 6 Lawful Basis"
        return article_key.replace('_', ' ').title()

    # Policy text.
    with open(_in_output(f"{stem}_policy.txt"), 'w', encoding='utf-8') as policy_file:
        policy_file.write(policy_text)

    # Complete metadata record.
    with open(_in_output(f"{stem}_metadata.json"), 'w', encoding='utf-8') as metadata_file:
        json.dump(metadata, metadata_file, indent=2)

    # Per-domain folder for the article-level breakdown.
    folder_name = f"{stem}_classifications"
    os.makedirs(_in_output(folder_name), exist_ok=True)

    for article, sentences in metadata['gdpr_article_classifications'].items():
        if not sentences:
            # Articles without any classified sentence get no file.
            continue
        with open(_in_output(folder_name, f"{article}.txt"), 'w', encoding='utf-8') as article_file:
            article_file.write(f"=== {_heading(article)} ===\n\n")
            for number, entry in enumerate(sentences, 1):
                article_file.write(f"Sentence {number} (Confidence: {entry['confidence_score']}):\n")
                article_file.write(f"{entry['sentence']}\n\n")

    with open(_in_output(folder_name, "classification_summary.txt"), 'w', encoding='utf-8') as summary_file:
        summary_file.write(f"GDPR Article Classification Summary for {domain}\n")
        summary_file.write("=" * 60 + "\n\n")

        ranked_scores = sorted(
            metadata['article_scores'].items(),
            key=lambda pair: pair[1],
            reverse=True,
        )
        for article, score in ranked_scores:
            if not score > 0:
                continue
            matched = metadata['gdpr_article_classifications'].get(article, [])
            summary_file.write(f"{_heading(article)}: {score} points, {len(matched)} sentences\n")

        summary_file.write(f"\nTotal classified sentences: {metadata['total_classified_sentences']}\n")
        summary_file.write(f"Total content length: {metadata['content_length']} characters\n")

def process_row(row, index):
    """Scrape, persist and summarise the privacy policy for one CSV row.

    Args:
        row: Mapping with the keys 'domain' and 'privacy_policy_url'.
        index: Position of the row in the input. Used to stagger the start of
            the request and passed on to ``scrape_policy``.

    Returns:
        dict: Result record with the keys 'domain', 'status' ('Success' or
        'Failed'), 'content_length', 'gdpr_keywords', 'classified_articles',
        'total_classified_sentences' and 'top_articles'. Failed records also
        carry an 'error' message.
    """
    domain = row['domain']
    url = row['privacy_policy_url']

    # Offset the start of each task by a multiple of the base delay so the
    # workers do not all fire their first request at the same moment.
    time.sleep(DELAY_BETWEEN_REQUESTS * (index % MAX_THREADS))

    result = scrape_policy(url, domain, index)

    if result[0]:
        policy_text, metadata = result

        try:
            save_classified_content(domain, policy_text, metadata)
        except Exception as e:
            # A write problem for one domain must not escape the worker: the
            # exception would otherwise resurface at future.result() and
            # abort the whole batch. Report it as a failure for this domain.
            error_msg = f"Could not save scraped content: {e}"
        else:
            return {
                'domain': domain,
                'status': 'Success',
                'content_length': len(policy_text),
                'gdpr_keywords': len(metadata['gdpr_keywords_found']),
                'classified_articles': list(metadata['gdpr_article_classifications'].keys()),
                'total_classified_sentences': metadata['total_classified_sentences'],
                'top_articles': sorted(metadata['article_scores'].items(), key=lambda x: x[1], reverse=True)[:5]
            }
    else:
        error_msg = result[1]

    logging.error(f"{domain}: Failed - {error_msg}")
    return {
        'domain': domain,
        'status': 'Failed',
        'error': error_msg,
        'content_length': 0,
        'gdpr_keywords': 0,
        'classified_articles': [],
        'total_classified_sentences': 0,
        # Same keys as a success record so consumers can treat both uniformly.
        'top_articles': []
    }

def main():
    """Scrape every site listed in INPUT_CSV and write a JSON run summary.

    Reads the input CSV into a list of rows, runs ``process_row`` for each row
    on a thread pool of ``MAX_THREADS`` workers, logs one line per finished
    site, and dumps the aggregated summary to ``RESULTS_JSON``.

    Returns:
        None. Returns early, after logging an error, when INPUT_CSV does not
        exist.
    """
    logging.info("Starting GDPR policy scraping with article classification...")

    try:
        # newline='' is what the csv module asks for, so that newlines inside
        # quoted fields are interpreted by the reader and not by open().
        with open(INPUT_CSV, mode='r', encoding='utf-8', newline='') as file:
            reader = csv.DictReader(file)
            rows = list(reader)
    except FileNotFoundError:
        logging.error(f"Input file {INPUT_CSV} not found!")
        return

    total = len(rows)
    logging.info(f"Found {total} websites to scrape")

    results = []
    successes = 0

    with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
        # Remember which row each future belongs to so that a crashed worker
        # can still be reported against its domain.
        future_to_row = {
            executor.submit(process_row, row, i): row
            for i, row in enumerate(rows)
        }

        for future in as_completed(future_to_row):
            try:
                result = future.result()
            except Exception as e:
                # future.result() re-raises whatever the worker raised. Turn
                # it into a failure record instead of losing every result
                # collected so far.
                failed_row = future_to_row[future]
                result = {
                    'domain': failed_row.get('domain') or 'unknown',
                    'status': 'Failed',
                    'error': f"Unhandled {type(e).__name__}: {e}",
                    'content_length': 0,
                    'gdpr_keywords': 0,
                    'classified_articles': [],
                    'total_classified_sentences': 0,
                    'top_articles': []
                }
            results.append(result)

            if result['status'] == 'Success':
                successes += 1
                top_articles = [f"{art}: {score}" for art, score in result['top_articles']]
                logging.info(f"✓ {result['domain']}: {result['content_length']} chars, "
                           f"{result['total_classified_sentences']} classified sentences, "
                           f"Top articles: {', '.join(top_articles[:3])}")
            else:
                logging.error(f"✗ {result['domain']}: {result.get('error', 'Unknown error')}")

    # A CSV with a header but no data rows would otherwise divide by zero.
    success_rate = (successes / total * 100) if total else 0.0

    summary = {
        'total_websites': total,
        'successful_scrapes': successes,
        'failed_scrapes': total - successes,
        'success_rate': f"{success_rate:.1f}%",
        'scraped_at': datetime.now().isoformat(),
        'results': results,
        'gdpr_articles_tracked': list(GDPR_ARTICLE_PATTERNS.keys())
    }

    with open(RESULTS_JSON, 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2)

# Entry-point guard: start the scrape only when this code runs as the main
# program, not when it is imported as a module.
if __name__ == "__main__":
    main()

2025-06-21 18:13:39,200 - INFO - Starting GDPR policy scraping with article classification...
2025-06-21 18:13:39,203 - INFO - Found 160 websites to scrape
2025-06-21 18:13:39,205 - INFO - Attempting to scrape google.de (attempt 1)
2025-06-21 18:13:40,215 - INFO - Attempting to scrape youtube.com (attempt 1)
2025-06-21 18:13:40,319 - INFO - Attempting to scrape facebook.com (attempt 1)
2025-06-21 18:13:40,319 - INFO - ✓ google.de: 580612 chars, 908 classified sentences, Top articles: Cookies_and_Tracking: 327, Article_6_Lawful_Basis: 268, Third_Party_Sharing: 149
2025-06-21 18:13:40,569 - INFO - Attempting to scrape youtube.com (attempt 2)
2025-06-21 18:13:40,590 - INFO - Attempting to scrape facebook.com (attempt 2)
2025-06-21 18:13:41,207 - INFO - Attempting to scrape amazon.de (attempt 1)
2025-06-21 18:13:41,664 - INFO - amazon.de: Found content using selector: article
2025-06-21 18:13:41,687 - INFO - Attempting to scrape amazon.de (attempt 2)
2025-06-21 18:13:42,807 - INFO - Attemp

The code below filters the baseline GDPR articles down to the essential ones and saves them to `gdpr_articles_filtered.json`.

In [None]:
import json

# Load the baseline GDPR articles. The encoding is given explicitly (as in the
# rest of the notebook's file I/O) so the read does not depend on the
# platform's locale default.
with open("gdpr_articles_baseline.json", "r", encoding="utf-8") as f:
    all_articles = json.load(f)

# Article numbers to keep. They are stored as strings because the comparison
# below converts each article_number with str() before the membership test.
essential_articles = {'6','7','8','9', '12', '13','14','15','16', '17', '18', '19', '20', '21','22','23','24','25','26','27','28','29','30','31','32','37'}

# Keep only the articles whose number is in the essential set.
filtered_articles = [
    article for article in all_articles
    if str(article["article_number"]) in essential_articles
]

with open("gdpr_articles_filtered.json", "w", encoding="utf-8") as f:
    json.dump(filtered_articles, f, indent=2)

print(f"Saved {len(filtered_articles)} filtered articles to 'gdpr_articles_filtered.json'")

✅ Saved 26 filtered articles to 'gdpr_articles_filtered.json'


Next, flatten each article's `sections` into a single `content` string and save the result to `gdpr_articles_flattened.json`.

In [None]:
import json

# Read the filtered articles. The encoding is given explicitly (as in the rest
# of the notebook's file I/O) so the read does not depend on the platform's
# locale default.
with open("gdpr_articles_filtered.json", "r", encoding="utf-8") as f:
    articles = json.load(f)

for article in articles:
    # Join every key/value pair of every section into one space-separated
    # string; an article without "sections" gets an empty string.
    combined = " ".join(f"{k} {v}" for sec in article.get("sections", []) for k, v in sec.items())
    article["content"] = combined
    article.pop("sections", None)  # Remove the original "sections" key

with open("gdpr_articles_flattened.json", "w", encoding="utf-8") as f:
    json.dump(articles, f, indent=2)

print(f"Converted {len(articles)} articles to use 'content' instead of 'sections'")


✅ Converted 26 articles to use 'content' instead of 'sections'
