# In [3]:
from bs4 import BeautifulSoup
import requests
from urllib.parse import urlparse, quote
import urllib.request
import os
import pypdf
from collections import deque
import io
import re
from html.parser import HTMLParser


# In [2]:


# Regex pattern to match an absolute HTTP(S) URL.
# `https?` accepts exactly "http" or "https"; a character class with `*`
# (`http[s]*`) would also accept malformed schemes such as "httpss://".
HTTP_URL_PATTERN = r'^https?://.+'

# Root domain to crawl and the URL the crawl starts from
domain = "ashoka.edu.in"
full_url = "https://ashoka.edu.in/"

# HTML parser that collects the href of every anchor tag it sees
class HyperlinkParser(HTMLParser):
    """Collect the ``href`` values of ``<a>`` tags fed to the parser.

    After calling ``feed(html)``, ``self.hyperlinks`` holds the raw href
    strings in document order (duplicates included, no normalisation).
    """

    def __init__(self):
        super().__init__()
        # Raw href values, in the order the anchors were encountered
        self.hyperlinks = []

    # Override the HTMLParser's handle_starttag method to get the hyperlinks
    def handle_starttag(self, tag, attrs):
        """Record the href of an anchor start tag, if it has one."""
        if tag != "a":
            return

        href = dict(attrs).get("href")

        # HTMLParser reports a valueless attribute (``<a href>``) as None;
        # skip it so callers only ever receive strings.
        if href is not None:
            self.hyperlinks.append(href)

# Function to get the hyperlinks from a URL
def get_hyperlinks(url):
    """Return the raw href values of all anchor tags on the page at *url*.

    The page is fetched with ``urllib.request.urlopen``. An empty list is
    returned when the response is not ``text/html`` (or carries no
    Content-Type header), or when fetching/reading fails; in the failure
    case the exception is printed and the crawl is expected to carry on.
    """
    try:
        with urllib.request.urlopen(url) as response:
            headers = response.info()

            # If the response is not HTML (or does not say what it is),
            # there is nothing to parse.
            content_type = headers.get('Content-Type')
            if content_type is None or not content_type.lower().startswith("text/html"):
                return []

            # Honour the charset declared by the server and fall back to
            # UTF-8. Undecodable bytes are replaced rather than discarding
            # the whole page and all of its links.
            charset = headers.get_content_charset() or 'utf-8'
            raw = response.read()
            try:
                html = raw.decode(charset, errors='replace')
            except LookupError:
                # The server declared a charset Python does not know
                html = raw.decode('utf-8', errors='replace')
    except Exception as e:
        # Best-effort: a page that cannot be fetched simply yields no links
        print(e)
        return []

    # Create the HTML Parser and then Parse the HTML to get hyperlinks
    parser = HyperlinkParser()
    parser.feed(html)

    return parser.hyperlinks

# Function to get the hyperlinks from a URL that are within the same domain
def get_domain_hyperlinks(local_domain, url):
    """Return the unique same-domain HTTP(S) links found on the page at *url*.

    Each href returned by ``get_hyperlinks(url)`` is resolved against *url*
    (so relative and protocol-relative links are interpreted relative to the
    page they appear on), stripped of its ``#fragment``, and kept only when
    its scheme is http/https and its network location equals *local_domain*.
    A single trailing "/" is removed so that "…/page" and "…/page/" are
    treated as the same URL.

    Args:
        local_domain: Network location (host) links must match exactly.
        url: Address of the page whose links are collected.

    Returns:
        A list of absolute URLs with no duplicates, in no particular order.
    """
    # Imported here so the function carries its own dependencies.
    from urllib.parse import urldefrag, urljoin

    clean_links = set()
    for link in get_hyperlinks(url):
        link = link.strip() if link else ""

        # Empty hrefs and pure fragments point back at the current page
        if not link or link.startswith("#"):
            continue

        try:
            # urljoin leaves absolute links untouched and resolves relative
            # ones against the page URL; urldefrag drops any "#fragment".
            absolute = urldefrag(urljoin(url, link)).url
            url_obj = urlparse(absolute)
        except ValueError:
            # Malformed link (e.g. a broken IPv6 literal); ignore it
            continue

        # Drops mailto:, javascript:, tel: and similar non-web links
        if url_obj.scheme not in ("http", "https"):
            continue

        if url_obj.netloc != local_domain:
            continue

        if absolute.endswith("/"):
            absolute = absolute[:-1]
        clean_links.add(absolute)

    # Return the list of hyperlinks that are within the same domain
    return list(clean_links)

def pdf_to_text(pdf_content):
    """Extract the text of every page of a PDF given as raw bytes.

    The page count is printed, then the text of each page is extracted in
    page order and concatenated with no separator between pages.
    """
    # Wrap the bytes in a file-like object so pypdf can read from memory
    reader = pypdf.PdfReader(io.BytesIO(pdf_content))
    pages = reader.pages

    print(len(pages))

    # Concatenate the extracted text of each page, in order
    return "".join(pages[index].extract_text() for index in range(len(pages)))

def _extract_html_text(html):
    """Return the visible text of an HTML document, minus headings and link text."""
    soup = BeautifulSoup(html, "html.parser")

    # Remove header tags and anchor tags with titles from the soup object
    for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        tag.extract()
    for tag in soup.find_all('a', {'title': True}):
        tag.extract()

    # Remove the text content within anchor tags but keep the anchor tags
    for tag in soup.find_all('a'):
        tag.string = ''

    # Get the text but remove the remaining tags
    return soup.get_text()


def crawl(url, timeout=30):
    """Crawl every reachable page on the domain of *url* and save its text.

    Starting from *url*, each page is downloaded once with ``requests``;
    PDFs are converted with ``pdf_to_text`` and everything else is treated
    as HTML. The text is written to ``text/<domain>/<percent-encoded-url>.txt``.
    Same-domain links found by ``get_domain_hyperlinks`` are then scheduled,
    each URL at most once. A ``processed`` directory is also created.

    Args:
        url: Absolute URL at which the crawl starts; its network location
            defines the domain that is crawled.
        timeout: Seconds to wait for each HTTP request before giving up on
            that page.
    """
    # Parse the URL and get the domain
    local_domain = urlparse(url).netloc

    # URLs waiting to be crawled, and every URL ever scheduled (no duplicates)
    queue = deque([url])
    seen = {url}

    # Create the output directories (text files and, later, csv files)
    output_dir = os.path.join("text", local_domain)
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs("processed", exist_ok=True)

    # While the queue is not empty, continue crawling
    while queue:
        # pop() takes the most recently added URL, so pages are visited in
        # LIFO order
        url = queue.pop()
        print(url)  # for debugging and to see the progress

        # A single unreachable page must not abort the whole crawl
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as e:
            print("Unable to fetch " + url + ": " + str(e))
            continue

        # Compare only the media type: servers may append parameters such as
        # "; charset=binary" and may vary the letter case.
        content_type = response.headers.get('Content-Type') or ''
        media_type = content_type.split(';', 1)[0].strip().lower()

        # Build the text before opening the output file, so a failure while
        # parsing does not leave an empty file behind.
        if media_type == "application/pdf":
            text = pdf_to_text(response.content)
        else:
            # Reuse the response already downloaded instead of fetching again
            text = _extract_html_text(response.text)

            # Pages that need JavaScript only yield this notice; warn about
            # it, but still save the text and keep crawling.
            if "You need to enable JavaScript to run this app." in text:
                print("Unable to parse page " + url + " due to JavaScript being required")

        # Sanitize the URL to create a valid filename
        sanitized_url = quote(url, safe='')

        # Save text from the URL to a <url>.txt file
        with open(os.path.join(output_dir, sanitized_url + ".txt"), "w", encoding="utf-8") as f:
            f.write(text)

        # Get the hyperlinks from the URL and add them to the queue
        for link in get_domain_hyperlinks(local_domain, url):
            if link not in seen:
                queue.append(link)
                seen.add(link)

# Start the crawl from the configured root URL. This call sits at module
# level, so it runs as soon as this cell/script is executed.
crawl(full_url)



# Notebook cell output: SyntaxError: incomplete input (2446128399.py, line 8)