In [1]:
import time
from collections import deque
from urllib.parse import urlparse, urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Function to fetch and parse robots.txt
def parse_robots_txt(base_url, timeout=10):
    """Download ``/robots.txt`` for the site of ``base_url`` and extract its rules.

    Args:
        base_url: Any URL on the target site; ``/robots.txt`` is resolved
            against it.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A ``(crawl_delay, disallowed_paths)`` tuple.

        * ``crawl_delay`` maps a lower-cased user-agent token to its
          ``Crawl-delay`` in seconds (``int`` for whole numbers, otherwise
          ``float``). Malformed, negative or non-finite values are skipped.
        * ``disallowed_paths`` maps a lower-cased user-agent token to the list
          of its non-empty ``Disallow`` values.

        Both dicts are empty when the server does not answer with HTTP 200.

    Raises:
        Exceptions raised by ``requests.get`` (connection errors, timeouts)
        are not caught here.
    """
    robots_url = urljoin(base_url, '/robots.txt')
    # Without a timeout an unresponsive server would block the notebook forever.
    response = requests.get(robots_url, timeout=timeout)
    crawl_delay = {}
    disallowed_paths = {}

    if response.status_code != 200:
        return crawl_delay, disallowed_paths

    group_agents = []        # user-agents the current rule group applies to
    group_has_rules = False  # True once a rule line was seen for that group

    # A leading byte-order mark would otherwise hide the first directive.
    for raw_line in response.text.lstrip('\ufeff').splitlines():
        # '#' starts a comment; drop it before looking at the directive.
        line = raw_line.split('#', 1)[0].strip()
        if ':' not in line:
            continue
        field, value = (part.strip() for part in line.split(':', 1))
        field = field.lower()  # directive names are case-insensitive

        if field == 'user-agent':
            # Consecutive User-agent lines share one group; a User-agent line
            # that follows a rule starts a new group.
            if group_has_rules:
                group_agents = []
                group_has_rules = False
            agent = value.lower()
            if agent:
                group_agents.append(agent)
                disallowed_paths.setdefault(agent, [])
        elif field == 'disallow':
            group_has_rules = True
            if value:  # an empty Disallow forbids nothing
                for agent in group_agents:
                    disallowed_paths[agent].append(value)
        elif field == 'allow':
            # Allow rules are not stored, but they still close the agent list.
            group_has_rules = True
        elif field == 'crawl-delay':
            group_has_rules = True
            try:
                delay = float(value)
            except ValueError:
                continue  # ignore malformed delays instead of crashing
            if not 0 <= delay < float('inf'):
                continue  # rejects negatives, inf and NaN
            if delay.is_integer():
                delay = int(delay)
            for agent in group_agents:
                crawl_delay[agent] = delay

    return crawl_delay, disallowed_paths

In [3]:
# Function to check if URL is allowed to be crawled
def is_url_allowed(url, user_agent, disallowed_paths):
    """Return whether ``url`` may be fetched under the given Disallow rules.

    Args:
        url: Absolute URL to check; only its path component is compared.
        user_agent: Name the crawler identifies itself with. Compared
            case-insensitively.
        disallowed_paths: Mapping of user-agent token to a list of disallowed
            path prefixes.

    Returns:
        ``False`` if the URL path starts with a disallowed prefix of the ``*``
        group or of any group whose token occurs in ``user_agent``; ``True``
        otherwise.
    """
    # An empty path (e.g. "https://example.com") addresses the site root, so
    # it must be covered by "Disallow: /".
    path = urlparse(url).path or '/'
    crawler = user_agent.lower()

    for agent, prefixes in disallowed_paths.items():
        # A group applies when its token is the wildcard or is contained in
        # the crawler's name (e.g. token "mycrawler" vs "MyCrawler/1.0").
        # Empty tokens are skipped because '' is a substring of everything.
        applies = agent == '*' or (agent and agent.lower() in crawler)
        if not applies:
            continue
        # Empty prefixes are skipped: an empty Disallow forbids nothing.
        if any(prefix and path.startswith(prefix) for prefix in prefixes):
            return False
    return True

In [4]:
# Function to fetch and parse a page
def fetch_page(url, user_agent, disallowed_paths, timeout=10):
    """Download ``url`` if the robots rules permit it.

    Args:
        url: Absolute URL to download.
        user_agent: Crawler name; used for the robots check and sent as the
            ``User-Agent`` request header so the server sees the same
            identity the rules were evaluated for.
        disallowed_paths: Mapping of user-agent token to disallowed path
            prefixes, passed through to ``is_url_allowed``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The raw response body (``response.content``) on HTTP 200, otherwise
        ``None``. A message is printed when the URL is disallowed, the
        request fails, or the status is not 200.
    """
    if not is_url_allowed(url, user_agent, disallowed_paths):
        print(f"Access to {url} is disallowed by robots.txt")
        return None

    try:
        response = requests.get(
            url,
            headers={'User-Agent': user_agent},
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # One unreachable page should not abort the whole crawl; report it
        # the same way as a non-200 answer.
        print(f"Failed to retrieve {url}: {exc}")
        return None

    if response.status_code == 200:
        return response.content

    print(f"Failed to retrieve {url}")
    return None

In [5]:
# Main crawling function
def crawl_website(base_url, user_agent='my-crawler'):
    """Breadth-first crawl of every page reachable under ``base_url``.

    Args:
        base_url: Starting URL. Only links whose absolute form starts with
            this string are followed and reported.
        user_agent: Crawler name used to select robots.txt rules and passed
            to ``fetch_page``.

    Returns:
        List of discovered URLs under ``base_url`` in discovery order, each
        listed once and without a ``#fragment``. ``base_url`` itself is not
        included.
    """
    crawl_delay, disallowed_paths = parse_robots_txt(base_url)
    # robots.txt agent keys are stored lower-cased, so look up the same way.
    delay = crawl_delay.get(user_agent.lower(), crawl_delay.get('*', 0))

    pending = deque([base_url])
    seen = {base_url}      # everything ever queued; prevents repeat visits
    found_links = []

    while pending:
        url = pending.popleft()
        page_content = fetch_page(url, user_agent, disallowed_paths)

        if page_content:
            soup = BeautifulSoup(page_content, 'html.parser')
            for link in soup.find_all('a', href=True):
                try:
                    # Relative links are relative to the page they appear on,
                    # not to the crawl's starting URL.
                    target = urljoin(url, link['href'])
                    # "#section" anchors point at the same document.
                    target = urlparse(target)._replace(fragment='').geturl()
                except ValueError:
                    continue  # malformed href; nothing sensible to follow

                # A prefix test keeps the crawl on the site; a substring test
                # would accept external URLs that merely mention base_url.
                if target.startswith(base_url) and target not in seen:
                    seen.add(target)
                    pending.append(target)
                    found_links.append(target)

        # Pause between requests only; no need to wait after the last page.
        if delay and pending:
            time.sleep(delay)

    return found_links

In [6]:
# Starting URL handed to the crawler.
base_url = 'https://labs.utsouthwestern.edu/ansir-lab'

# Run the crawl, then hold the discovered links in a one-column frame
# with repeated rows removed.
crawled_links = crawl_website(base_url, user_agent='my-crawler')
df_new = pd.DataFrame()
df_new['links'] = crawled_links
df_new = df_new.drop_duplicates()

In [7]:
# Merge the freshly crawled links into the CSV that persists across runs.
LINKS_CSV = 'web_crawler_labweb_test_ShuaipengMa.csv'

try:
    df_saved = pd.read_csv(LINKS_CSV)
except FileNotFoundError:
    # First run: nothing has been saved yet, so only the new links are kept.
    frames = [df_new]
else:
    # to_csv below writes the index as an unnamed first column, which
    # read_csv brings back as 'Unnamed: 0'. errors='ignore' tolerates files
    # that were saved without that column.
    frames = [df_saved.drop(columns='Unnamed: 0', errors='ignore'), df_new]

df = pd.concat(frames, ignore_index=True)
# Renumber after de-duplication so the saved index column has no gaps.
df = df.drop_duplicates().reset_index(drop=True)
df.to_csv(LINKS_CSV)