In [2]:
import requests
import json
import pandas as pd
from urllib.parse import quote_plus
from warcio.archiveiterator import ArchiveIterator
import re
import concurrent.futures
import time
from bs4 import BeautifulSoup
import tqdm


SERVER = 'http://index.commoncrawl.org/'
INDEX_NAME = 'CC-MAIN-2024-33' 
USER_AGENT = 'australian-website-extractor/1.0 (Data research project)'


AU_DOMAIN_SUFFIXES = [
    '.com.au',
    '.net.au',
    '.gov.au',
    '.org.au',
    '.edu.au'
]

def search_cc_index(domain_suffix, limit=100):
    """
    Search the Common Crawl index for URLs with the specified domain suffix.
    Returns a list of records containing URLs and their locations in WARC files.
    """
    encoded_url = quote_plus(f'*{domain_suffix}')
    index_url = f'{SERVER}{INDEX_NAME}-index?url={encoded_url}&output=json&limit={limit}'
    
    try:
        response = requests.get(index_url, headers={'user-agent': USER_AGENT})
        if response.status_code == 200:
            records = response.text.strip().split('\n')
            return [json.loads(record) for record in records if record.strip()]
        else:
            print(f"Failed to search index for {domain_suffix}: HTTP {response.status_code}")
            return []
    except Exception as e:
        print(f"Error searching index for {domain_suffix}: {e}")
        return []

def fetch_warc_record(record):
    """
    Fetch a specific WARC record using the information from an index record.
    Returns the HTML content of the record.
    """
    try:
        offset, length = int(record['offset']), int(record['length'])
        s3_url = f'https://data.commoncrawl.org/{record["filename"]}'
        byte_range = f'bytes={offset}-{offset+length-1}'
        
        response = requests.get(
            s3_url, 
            headers={'user-agent': USER_AGENT, 'Range': byte_range},
            stream=True,
            timeout=10
        )
        
        if response.status_code == 206:  # Partial Content
            stream = ArchiveIterator(response.raw)
            for warc_record in stream:
                if warc_record.rec_type == 'response':
                    content = warc_record.content_stream().read()
                    try:
                        return content.decode('utf-8', errors='ignore')
                    except UnicodeDecodeError:
                        return content.decode('latin-1', errors='ignore')
        else:
            print(f"Failed to fetch WARC record: HTTP {response.status_code}")
            return None
    except Exception as e:
        print(f"Error fetching WARC record: {e}")
        return None

def extract_company_info(html, url, timestamp=''):
    """
    Extract company information from HTML content.
    Returns a dictionary with company name, industry, and other metadata.
    
    Args:
        html: The HTML content of the page
        url: The URL of the page
        timestamp: The timestamp of when the page was crawled
    """
    try:
        soup = BeautifulSoup(html, 'html.parser')
        
        title = soup.title.text.strip() if soup.title else ""
        company_name = title
        
        for suffix in [' - Home', ' | Home', ' - Official Site', ' - Australia']:
            if suffix in company_name:
                company_name = company_name.split(suffix)[0].strip()
                break
        
        description = ""
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        if meta_desc and meta_desc.get('content'):
            description = meta_desc['content'].strip()
        
        industry_keywords = {
            'finance': ['bank', 'finance', 'investment', 'insurance', 'wealth', 'mortgage'],
            'retail': ['shop', 'store', 'retail', 'ecommerce', 'products', 'buy'],
            'technology': ['technology', 'software', 'IT', 'computing', 'digital', 'tech'],
            'healthcare': ['health', 'medical', 'hospital', 'clinic', 'care', 'patient'],
            'education': ['education', 'university', 'school', 'college', 'learn', 'training'],
            'government': ['government', 'council', 'department', 'agency', 'public'],
            'media': ['media', 'news', 'magazine', 'broadcast', 'publishing'],
            'tourism': ['travel', 'tourism', 'hotel', 'accommodation', 'vacation'],
        }
        
        all_text = soup.get_text().lower()
        detected_industry = None
        
        for ind, keywords in industry_keywords.items():
            if any(keyword.lower() in all_text for keyword in keywords):
                detected_industry = ind
                break
        
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
        phone_pattern = r'\b(?:\+?61|0)[2-478](?:[ -]?[0-9]){8}\b' 
        
        emails = re.findall(email_pattern, html)
        phones = re.findall(phone_pattern, html)
        
        social_links = []
        social_patterns = ['facebook.com', 'twitter.com', 'linkedin.com', 'instagram.com', 'youtube.com']
        
        for link in soup.find_all('a', href=True):
            href = link['href']
            if any(pattern in href for pattern in social_patterns):
                social_links.append(href)
        
        return {
            'url': url,
            'company_name': company_name,
            'title': title,
            'description': description,
            'industry': detected_industry,
            'emails': ';'.join(set(emails)),
            'phones': ';'.join(set(phones)),
            'social_links': ';'.join(set(social_links)),
            'snapshot_date': timestamp
        }
    except Exception as e:
        print(f"Error extracting company info from {url}: {e}")
        return {
            'url': url,
            'company_name': None,
            'title': None,
            'description': None,
            'industry': None,
            'emails': None,
            'phones': None,
            'social_links': None,
            'snapshot_date': timestamp
        }

def process_record(record):
    """Process a single record from the Common Crawl index"""
    url = record.get('url', '')
    timestamp = record.get('timestamp', '')
    html = fetch_warc_record(record)
    
    if html:
        return extract_company_info(html, url, timestamp)
    else:
        return None

def collect_australian_websites(total_limit=1000, per_domain_limit=100):
    """
    Collect information about Australian websites from Common Crawl.
    Returns a pandas DataFrame with the collected data.
    """
    all_results = []
    
    for domain_suffix in AU_DOMAIN_SUFFIXES:
        print(f"Searching for websites with domain suffix: {domain_suffix}")
        records = search_cc_index(domain_suffix, limit=per_domain_limit)
        
        if records:
            print(f"Found {len(records)} records for {domain_suffix}")
            
            with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
                results = list(tqdm.tqdm(
                    executor.map(process_record, records),
                    total=len(records),
                    desc=f"Processing {domain_suffix}"
                ))
                
                valid_results = [r for r in results if r and r.get('company_name')]
                all_results.extend(valid_results)
                
                print(f"Extracted information from {len(valid_results)} websites with {domain_suffix}")
                
                if len(all_results) >= total_limit:
                    break
        else:
            print(f"No records found for {domain_suffix}")
    
    if all_results:
        df = pd.DataFrame(all_results)
        df = df.drop_duplicates(subset=['url'])
        return df.head(total_limit)
    else:
        return pd.DataFrame()

def main():
    """Main function to run the Australian website data collection"""
    print(f"Starting Australian website data collection using Common Crawl index: {INDEX_NAME}")
    
    
    df = collect_australian_websites(total_limit=1000, per_domain_limit=200)
    
    if not df.empty:
        output_file = 'australian_websites.csv'
        df.to_csv(output_file, index=False)
        print(f"Successfully collected data for {len(df)} Australian websites")
        print(f"Data saved to {output_file}")
        
        if 'industry' in df.columns:
            industry_counts = df['industry'].value_counts()
            print("\nIndustry distribution:")
            for industry, count in industry_counts.items():
                if industry:
                    print(f"  {industry}: {count}")
    else:
        print("Failed to collect any data for Australian websites")

if __name__ == "__main__":
    main()

Starting Australian website data collection using Common Crawl index: CC-MAIN-2024-33
Searching for websites with domain suffix: .com.au
Found 200 records for .com.au


Processing .com.au: 100%|██████████| 200/200 [00:26<00:00,  7.69it/s]


Extracted information from 180 websites with .com.au
Searching for websites with domain suffix: .net.au
Found 200 records for .net.au


Processing .net.au: 100%|██████████| 200/200 [00:24<00:00,  8.16it/s]


Extracted information from 180 websites with .net.au
Searching for websites with domain suffix: .gov.au
Found 200 records for .gov.au


Processing .gov.au: 100%|██████████| 200/200 [00:29<00:00,  6.75it/s]


Extracted information from 199 websites with .gov.au
Searching for websites with domain suffix: .org.au
Found 200 records for .org.au


Processing .org.au: 100%|██████████| 200/200 [00:56<00:00,  3.55it/s]


Extracted information from 169 websites with .org.au
Searching for websites with domain suffix: .edu.au
Found 200 records for .edu.au


Processing .edu.au: 100%|██████████| 200/200 [00:57<00:00,  3.47it/s]

Extracted information from 187 websites with .edu.au
Successfully collected data for 911 Australian websites
Data saved to australian_websites.csv

Industry distribution:
  finance: 292
  technology: 290
  retail: 189
  education: 24
  healthcare: 1



