In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urlparse
import time

def scrape_jobs(urls, keywords, delay=2, timeout=10):
    """Count keyword occurrences in the text of each page in *urls*.

    Each URL is fetched with ``requests``, parsed with BeautifulSoup, and the
    lower-cased text from ``soup.get_text()`` is searched for every keyword
    using case-insensitive substring matching (so ``'python'`` also matches
    inside ``'pythonic'``).

    Args:
        urls: Iterable of page URLs to fetch.
        keywords: Iterable of search terms. Items are converted with ``str``
            and lower-cased once up front, so a one-shot iterable works for
            every URL. ``None`` is treated as "no keywords".
        delay: Seconds to pause between consecutive requests. No pause is
            made before the first request.
        timeout: Seconds passed to ``requests.get`` as ``timeout`` so a
            stalled server cannot block the run indefinitely.

    Returns:
        A ``pandas.DataFrame`` with columns ``Website``, ``Keyword``,
        ``Count`` and ``URL`` (one row per keyword found on a page), or the
        string ``"No matches found"`` when nothing matched. The string
        return is kept for backward compatibility; callers must check the
        type before using DataFrame methods.

    Any exception raised while processing a single URL is printed and that
    URL is skipped; the remaining URLs are still processed.
    """
    # Normalise keywords once; materialising also protects against iterators
    # that would be exhausted after the first URL.
    lowered_keywords = [
        str(keyword).lower()
        for keyword in (() if keywords is None else keywords)
    ]
    # Loop-invariant request headers (browser-like User-Agent).
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
    results = []

    for index, url in enumerate(urls):
        try:
            # Be respectful to servers: pause between requests, but there is
            # nothing to wait for before the very first one.
            if index and delay > 0:
                time.sleep(delay)

            # Domain name is used for reporting in the result rows.
            domain = urlparse(url).netloc

            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            text_content = soup.get_text().lower()
            print(f"Scraping {url} and {domain}...")
            print(soup.prettify()[:500])  # Print first 500 characters of the prettified HTML

            for keyword in lowered_keywords:
                # A single count both detects and tallies the keyword.
                count = text_content.count(keyword)
                if count:
                    results.append({
                        'Website': domain,
                        'Keyword': keyword,
                        'Count': count,
                        'URL': url,
                    })

        except Exception as e:
            # Best-effort scraping: report the failure and move on.
            print(f"Error processing {url}: {e}")

    df = pd.DataFrame(results)
    return df if not df.empty else "No matches found"

# Example usage: scan career-search result pages for a set of skills.
urls = [
    # Currently disabled:
    # 'https://www.jobs.ch/en/vacancies/?term=bioinformatics',
    'https://www.novartis.com/careers/career-search?search_api_fulltext=computational+biology&country%5B%5D=LOC_CH&field_job_posted_date=All&op=Submit',
    'https://careers.roche.com/global/en/search-results?keywords=%22computational%20biology%22',
]

# Terms to look for in each page's text.
keywords = [
    'python',
    'data science',
    'bioinformatics',
    'machine learning',
    'computational biology',
]

# scrape_jobs returns a DataFrame of matches, or the string
# "No matches found" when no keyword was found on any page.
results = scrape_jobs(urls, keywords)
print(results)

Scraping https://www.jobs.ch/en/vacancies/?term=bioinformatics and www.jobs.ch...
<!DOCTYPE html>
<html class="legacy" lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport">
   <!-- No cache for index.html -->
   <meta content="no-cache" http-equiv="Pragma"/>
   <meta content="no-cache, must-revalidate" http-equiv="cache-control"/>
   <title>
    34 Bioinformatics jobs - jobs.ch
   </title>
   <!-- Preconnects : only first party origin and used everywhere! -->
   <link href="//c.jobs.ch" rel="preconnect"/>
   <!--
Scraping https://www.novartis.com/careers/career-search?search_api_fulltext=computational+biology&country%5B%5D=LOC_CH&field_job_posted_date=All&op=Submit and www.novartis.com...
<!DOCTYPE html>
<html dir="ltr" lang="en" prefix="content: http://purl.org/rss/1.0/modules/content/  dc: http://purl.org/dc/terms/  foaf: http://xmlns.com/foaf/0.1/  og: http://ogp.me/ns#  rdfs: http://www.w3.org/2000/01/rdf-schema#  sche