In [None]:
# https://github.com/adbar/trafilatura
# https://adrien.barbaresi.eu/blog/using-sitemaps-crawl-websites.html
# https://trafilatura.readthedocs.io/en/latest/

In [1]:
import trafilatura
import json

In [None]:
# find the links using the sitemap
# https://1001suns.com/robots.txt
# https://1001suns.com/wp-sitemap.xml
# https://1001suns.com/sitemap.xml
# https://trafilatura.readthedocs.io/en/latest/usage-python.html
from trafilatura import sitemaps

# Start URL for the sitemap search. Other values that were tried here:
#   'https://1001suns.com'
#   'https://karlsruhe.digital'
url = 'https://1001suns.com/sitemap_post'

# Optional second pass: feed every discovered link back into sitemap_search.
# Disabled by default; the original note on this step says the result
# "stays the same".
EXPAND_LINKS = False

print("url", url)
links = sitemaps.sitemap_search(url)
if EXPAND_LINKS:
    # Loop over a snapshot so that extending `links` does not change the
    # sequence that is being iterated.
    for link in list(links):
        print(len(links), "sitemap_search:", link)
        links.extend(sitemaps.sitemap_search(link))
    links = sorted(set(links))  # make unique
    print("links:", len(links))

# Output file name is derived from the URL: scheme removed, "/" replaced by "_".
outfile = "trafilatura_links_long_" + url.replace("https://", "").replace("http://", "").replace("/", "_") + ".txt"
# Explicit encoding so the result does not depend on the platform default.
with open(outfile, 'w', encoding='utf-8') as f:
    for link in links:
        f.write(link + "\n")
        print(link)


In [None]:
# Build the trafilatura configuration object from a settings file that is
# expected next to this notebook (relative path).
settings_file = 'trafilatura-settings.cfg'
myconfig = trafilatura.settings.use_config(settings_file)
# print(myconfig)  # uncomment to inspect the loaded configuration object

In [None]:
# https://programtalk.com/python-more-examples/trafilatura.sitemaps.handle_link/
# Bare expression: nothing is called here. As the last line of the cell it only
# makes the notebook display the object's repr (a quick existence check).
trafilatura.sitemaps.handle_link

In [None]:
# `import trafilatura` on its own is not guaranteed to load the `spider`
# submodule, so `trafilatura.spider.<...>` can fail with AttributeError on a
# fresh kernel. Import the crawler entry point explicitly instead.
from trafilatura.spider import focused_crawler

url = 'https://1001suns.com/'

# Crawl starting at `url`; the two limits bound how many pages are visited
# and how many URLs are remembered.
to_visit, known_urls = focused_crawler(
    url,
    max_seen_urls=10, max_known_urls=100000,
    config=myconfig,
)
print("to_visit:", to_visit)
print("known_urls:", sorted(known_urls))

# To resume the crawl later, pass the returned state back in:
# to_visit, known_urls = focused_crawler(url, max_seen_urls=10, max_known_urls=100000, todo=to_visit, known_links=known_urls)

In [None]:
# Download the page at `url` (set in an earlier cell) and extract its content.
downloaded = trafilatura.fetch_url(url)
# print(downloaded) # html

# Both steps can yield None (failed download / nothing extractable), and
# json.loads(None) would raise TypeError, so guard each step.
j = None
if downloaded is not None:
    j = trafilatura.extract(
        downloaded,
        output_format='json',
        # include_formatting=True,
        # include_links=True,
        include_comments=False, include_tables=False, no_fallback=True,
        #favor_precision=True
    ) # text only

if downloaded is None:
    print("download failed:", url)
elif j is None:
    print("no content extracted:", url)
else:
    # Re-serialise the JSON string with indentation for readable output.
    print(json.dumps(json.loads(j), indent=4))

# Last expression of the cell: shown as the cell output (nothing if no download).
trafilatura.bare_extraction(downloaded) if downloaded is not None else None



In [None]:
# https://www.thepythoncode.com/article/extract-all-website-links-python
# https://www.thepythoncode.com/code/extract-all-website-links-python
# pip3 install requests bs4 colorama

import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama

# Initialise colorama before any of its colour codes are used.
colorama.init()

# Colour codes used in the crawler's console messages.
GREEN, GRAY, RESET, YELLOW = (
    colorama.Fore.GREEN,
    colorama.Fore.LIGHTBLACK_EX,
    colorama.Fore.RESET,
    colorama.Fore.YELLOW,
)

# Shared crawl state: unique links collected so far, split into links of the
# crawled site and links pointing elsewhere.
internal_urls, external_urls = set(), set()

# Counter of pages crawled so far.
total_urls_visited = 0

# Request headers (browser-like User-Agent) and the default start URL.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/70.0.3538.77 Safari/537.36"
    ),
}
main_url = "https://1001suns.com/sitemap_post/"

def is_valid(url):
    """
    Checks whether `url` is a valid URL.

    A URL counts as valid when it has both a scheme (e.g. ``https``) and a
    network location (host part). Input that `urlparse` rejects outright by
    raising ``ValueError`` (for example an unbalanced ``[`` in the host) is
    reported as invalid instead of letting the exception escape.

    Returns:
        bool: True if `url` has a scheme and a network location, else False.
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        # Malformed input such as "http://[invalid" is simply not valid.
        return False
    return bool(parsed.netloc) and bool(parsed.scheme)


def get_all_website_links(url, timeout=30):
    """
    Returns the not-yet-seen URLs found on `url` that belong to the same website.

    Every ``<a href>`` on the page is resolved against `url`, stripped of its
    query string and fragment, and classified:

    * internal (same host as `url`, or a subdomain of it): added to the global
      `internal_urls` set and to the returned set;
    * external: added to the global `external_urls` set only.

    Links already present in `internal_urls` are skipped, so the returned set
    contains only internal links discovered for the first time by this call.

    params:
        url (str): page to download and scan.
        timeout (float): seconds to wait for the server before giving up.

    Returns:
        set: newly discovered internal URLs; empty if the page could not be
        downloaded.
    """
    # all new internal URLs of `url`
    urls = set()
    # host name of the page, without scheme, port or credentials
    site_host = urlparse(url).hostname or ""
    try:
        # A timeout keeps one unresponsive server from hanging the whole crawl.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # One unreachable page should not abort the crawl: report and move on.
        print(f"{GRAY}[!] Request failed: {url} ({exc}){RESET}")
        return urls
    soup = BeautifulSoup(response.content, "html.parser")
    for a_tag in soup.find_all("a"):
        href = a_tag.attrs.get("href")
        if not href:
            # missing or empty href
            continue
        try:
            # join the URL if it's relative (not absolute link)
            parsed_href = urlparse(urljoin(url, href))
        except ValueError:
            # href that cannot be parsed as a URL at all
            continue
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_valid(href):
            # not a valid URL
            continue
        if href in internal_urls:
            # already in the set
            continue
        # Compare host names rather than searching the whole URL, so that an
        # external link that merely mentions the domain in its path is not
        # mistaken for an internal one. Subdomains still count as internal.
        link_host = parsed_href.hostname or ""
        same_site = link_host == site_host or link_host.endswith("." + site_host)
        if not same_site:
            # external link
            if href not in external_urls:
                print(f"{GRAY}[!] External link: {href}{RESET}")
                external_urls.add(href)
            continue
        print(f"{GREEN}[*] Internal link: {href}{RESET}")
        urls.add(href)
        internal_urls.add(href)
    return urls


def crawl(url, max_urls=30):
    """
    Crawls a web page and extracts all links, following internal links.
    You'll find all links in `external_urls` and `internal_urls` global set variables.

    The traversal is depth-first, implemented with an explicit stack instead
    of recursion so that large `max_urls` values cannot exceed Python's
    recursion limit. The starting `url` is always crawled; after that the
    crawl stops as soon as the global `total_urls_visited` counter reaches
    `max_urls`.

    params:
        url (str): page to start crawling from.
        max_urls (int): number of max urls to crawl, default is 30.
    """
    global total_urls_visited
    pending = [url]
    while pending:
        current = pending.pop()
        total_urls_visited += 1
        print(f"{YELLOW}[*] Crawling: {current}{RESET}")
        links = get_all_website_links(current)
        if total_urls_visited >= max_urls:
            break
        # Push in reverse so links are popped in their iteration order and the
        # first link of a page is followed (together with its own links)
        # before the page's remaining links.
        pending.extend(reversed(list(links)))


# Entry point: crawl the start URL, print a summary and save the collected
# links to two text files named after the crawled domain.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Link Extractor Tool with Python")
    parser.add_argument("--url",        help="The URL to extract links from.",  default=main_url,   type=str)
    parser.add_argument("--max-urls",   help="Number of max URLs to crawl.",    default=1000,        type=int)

    # parse_known_args (not parse_args) so that unrelated command-line
    # arguments of the hosting process do not cause an argparse error.
    args, _unknown_args = parser.parse_known_args()
    url = args.url
    max_urls = args.max_urls
    print("url     :", url)
    print("max_urls:", max_urls)

    crawl(url, max_urls=max_urls)

    print("[+] Total Internal links:", len(internal_urls))
    print("[+] Total External links:", len(external_urls))
    print("[+] Total URLs:", len(external_urls) + len(internal_urls))
    # Report the pages actually crawled, not the configured upper limit.
    print("[+] Total crawled URLs:", total_urls_visited)

    domain_name = urlparse(url).netloc
    print("domain_name:", domain_name)

    # save the internal links to a file (sorted, so reruns give stable files)
    print("save the internal links to a file...")
    with open(f"{domain_name}_internal_links.txt", "w", encoding="utf-8") as f:
        for internal_link in sorted(internal_urls):
            print(internal_link.strip(), file=f)

    # save the external links to a file (sorted, so reruns give stable files)
    print("save the external links to a file...")
    with open(f"{domain_name}_external_links.txt", "w", encoding="utf-8") as f:
        for external_link in sorted(external_urls):
            print(external_link.strip(), file=f)

    print("all done.")