In [7]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from pathlib import Path
import marko
import marko.patterns
import time

In [8]:
# Directory used for files written by this notebook.
OUTPUT_DIR = Path("./data")

# URL prefix prepended to a target link by getWithJina.
JINA_PREFIX = "https://r.jina.ai/"

# Accumulates every link seen so far, split into on-domain and external sets.
LINKS = {"domain": set(), "external": set()}

In [9]:
def getRequest(link: str, timeout: float = 30.0) -> str:
    """Fetch ``link`` with an HTTP GET and return the response body as text.

    Args:
        link: URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single unresponsive
            server would stall the caller forever.

    Returns:
        The decoded response body (``response.text``). The HTTP status code
        is not checked, so error pages are returned like any other body.

    Raises:
        requests.exceptions.Timeout: If the server does not respond in time.
        requests.exceptions.RequestException: On other request failures.
    """
    response = requests.get(link, timeout=timeout)
    return response.text


def getWithJina(link: str) -> str:
    """Request ``link`` via the ``JINA_PREFIX`` endpoint and return the body text.

    The target URL is appended directly to ``JINA_PREFIX`` and the combined
    URL is fetched with ``getRequest``.
    """
    return getRequest(JINA_PREFIX + link)


def getSoup(link: str):
    """Download ``link`` with ``getRequest`` and parse it into a BeautifulSoup tree.

    The body is parsed with the "html.parser" backend.
    """
    return BeautifulSoup(getRequest(link), "html.parser")

In [4]:
# pageName = "Contact"
# response = getWithJina("https://www.ccet.ac.in/degreeCourse.php")
# with open(OUTPUT_DIR / f"{pageName}.md", mode="w") as file:
#     file.write(response)
# print(len(response))

In [11]:
from marko.inline import Link

def getAllPageLinks(response, domain: str = "ccet.ac.in") -> "tuple[set[str], set[str]]":
    """Collect every link destination in a Markdown document, split by host.

    Args:
        response: Markdown text; it is parsed with ``marko.parse``.
        domain: Site whose pages count as "domain" links. A link matches when
            its host is ``domain`` itself or any subdomain of it.

    Returns:
        A ``(domainLinks, externalLinks)`` tuple of sets holding the
        destinations of all ``Link`` elements found in the document.

    Links are classified by their parsed host rather than by substring search,
    so ``mailto:someone@ccet.ac.in`` or an off-site URL that merely mentions
    the domain in its path/query is treated as external.
    """
    from urllib.parse import urlparse

    domain = domain.lower()
    domainLinks = set()
    externalLinks = set()

    def isDomainLink(dest: str) -> bool:
        """Return True when ``dest`` is an http(s) URL hosted on ``domain``."""
        try:
            parsed = urlparse(dest)
            if parsed.scheme not in ("", "http", "https"):
                # mailto:, tel:, javascript:, ftp: ... are never crawlable pages.
                return False
            if not parsed.scheme and not parsed.netloc:
                # Scheme-less destinations such as "www.ccet.ac.in/page" are
                # parsed as a bare path; re-parse them as network locations.
                parsed = urlparse("//" + dest)
            host = (parsed.hostname or "").lower()
        except ValueError:
            # urlparse rejects some malformed destinations (e.g. a bad IPv6
            # literal); such links cannot belong to the domain.
            return False
        return host == domain or host.endswith("." + domain)

    # Iterative depth-first walk over the parsed document tree.
    pending = [marko.parse(response)]
    while pending:
        element = pending.pop()
        if isinstance(element, Link):
            if isDomainLink(element.dest):
                domainLinks.add(element.dest)
            else:
                externalLinks.add(element.dest)
        children = getattr(element, "children", None)
        # Some elements carry their text as a plain string in ``children``;
        # a string has no child elements worth visiting.
        if children is not None and not isinstance(children, str):
            pending.extend(children)

    return domainLinks, externalLinks

In [6]:
# home = getWithJina("https://www.ccet.ac.in")
# domain, external = getAllPageLinks(home)

# doneLinks = {"https://www.ccet.ac.in"}
# LINKS["domain"].update(domain)
# LINKS["external"].update(external)

# QUEUE = set(domain)
# pageCounter = 1
# mediaExtensions = (
#     ".jpg",
#     ".jpeg",
#     ".png",
#     ".gif",
#     ".pdf",
#     ".mp4",
#     ".mp3",
#     ".avi",
#     ".mov",
#     ".wav",
# )

# while len(QUEUE) != 0:
#     currLink = QUEUE.pop()

#     if currLink not in doneLinks:
#         doneLinks.add(currLink)
#         if currLink.lower().endswith(mediaExtensions):
#             print(f"Skipping media link: {currLink}")
#         else:
#             try:
#                 currResponse = getWithJina(currLink)
#                 with open(
#                     OUTPUT_DIR / f"{pageCounter}.md", mode="w", encoding="utf-8"
#                 ) as file:
#                     file.write(currResponse)
#                 newDomain, newExternal = getAllPageLinks(currResponse)
#                 onlyNewLinks = newDomain.difference(LINKS["domain"])
#                 LINKS["domain"].update(onlyNewLinks)
#                 LINKS["external"].update(newExternal)
#                 QUEUE.update(onlyNewLinks)

#                 print(
#                     f"Progress: {pageCounter}/{len(LINKS['domain'])}\t Queue: {len(QUEUE)}"
#                 )
#                 pageCounter += 1
#             except print(0):
#                 pass
#     time.sleep(1.5)

Progress: 1/128	 Queue: 127
Skipping media link: https://www.ccet.ac.in/pdf/notices/exams/ExaminationNoticeforB.E.StudentsJuly-Dec2024.pdf
Skipping media link: https://www.ccet.ac.in/pdf/notices/exams/UIET7th-InstructionsforStudents.pdf
Skipping media link: https://www.ccet.ac.in/pdf/notices/exams/4THSEMDATESHEET.pdf
Progress: 2/130	 Queue: 125
Progress: 3/139	 Queue: 133
Progress: 4/139	 Queue: 132
Progress: 5/148	 Queue: 140
Progress: 6/150	 Queue: 141
Progress: 7/150	 Queue: 140
Progress: 8/171	 Queue: 160
Skipping media link: https://www.ccet.ac.in/pdf/notices/general/BloodDonationCamp2024.pdf
Progress: 9/171	 Queue: 158
Skipping media link: https://www.ccet.ac.in/pdf/notices/admissions/Academic-helpdesk-2021.jpg
Progress: 10/173	 Queue: 158
Skipping media link: https://www.ccet.ac.in/pdf/notices/general/Updateddatesheet3rdsemesterMechanical.pdf
Progress: 11/175	 Queue: 158
Skipping media link: https://www.ccet.ac.in/pdf/ProformaComputational.pdf
Skipping media link: https://www.cc

In [12]:
# Seed the crawl with the links found on the home page.
home = getWithJina("https://www.ccet.ac.in")
domain, external = getAllPageLinks(home)

doneLinks = {"https://www.ccet.ac.in"}
LINKS["domain"].update(domain)
LINKS["external"].update(external)

QUEUE = set(domain)
pageCounter = 1
# Links ending in one of these extensions are recorded but never fetched.
mediaExtensions = (
    ".jpg",
    ".jpeg",
    ".png",
    ".gif",
    ".pdf",
    ".mp4",
    ".mp3",
    ".avi",
    ".mov",
    ".wav",
)

while QUEUE:
    currLink = QUEUE.pop()

    if currLink in doneLinks:
        continue
    doneLinks.add(currLink)

    if currLink.lower().endswith(mediaExtensions):
        print(f"Skipping media link: {currLink}")
        continue

    try:
        currResponse = getWithJina(currLink)
        # with open(
        #     OUTPUT_DIR / f"{pageCounter}.md", mode="w", encoding="utf-8"
        # ) as file:
        #     file.write(currResponse)
        newDomain, newExternal = getAllPageLinks(currResponse)
        # Only links never seen before are queued, so each page is visited once.
        onlyNewLinks = newDomain.difference(LINKS["domain"])
        LINKS["domain"].update(onlyNewLinks)
        LINKS["external"].update(newExternal)
        QUEUE.update(onlyNewLinks)
        print(
            f"Progress: {pageCounter}/{len(LINKS['domain'])}\t Queue: {len(QUEUE)}"
        )
        pageCounter += 1
    except Exception as err:
        # Best-effort crawl: report the failure and move on to the next link.
        # (The previous `except print(0):` raised TypeError whenever an error
        # occurred, because print() returns None, which is not an exception class.)
        print(f"Failed to crawl {currLink}: {err!r}")

    # Pause only after an actual fetch attempt; skipped links made no request.
    time.sleep(1.5)

Progress: 1/128	 Queue: 127
Skipping media link: https://www.ccet.ac.in/pdf/ENewsLetter/NewsletterVolVIssueII.pdf
Progress: 2/130	 Queue: 127
Skipping media link: https://www.ccet.ac.in/pdf/ENewsLetter/Newsletter-VolVIIIIssueI.pdf
Progress: 3/133	 Queue: 128
Progress: 4/136	 Queue: 130
Progress: 5/136	 Queue: 129
Progress: 6/146	 Queue: 138
Skipping media link: https://www.ccet.ac.in/pdf/notices/general/Consonance2024.pdf
Skipping media link: https://www.ccet.ac.in/pdf/notices/general/WebsiteCompetitionReport01-10-2024.pdf
Skipping media link: https://www.ccet.ac.in/pdf/notices/exams/InstructionsforUIETANDCCET.pdf
Skipping media link: https://www.ccet.ac.in/pdf/notices/general/BloodDonationCampReport2024.pdf
Skipping media link: https://www.ccet.ac.in/pdf/notices/exams/6THSEMDATESHEET.pdf
Progress: 7/157	 Queue: 143
Skipping media link: https://www.ccet.ac.in/pdf/notices/general/Updateddatesheet7thsemesterMechanical.pdf
Progress: 8/161	 Queue: 145
Progress: 9/169	 Queue: 152
Progress: 

In [13]:
import pickle

# Serialize the LINKS dict to disk with pickle.
linksPicklePath = Path("./json/pickles/links.pkl")
# open(..., "wb") does not create missing directories, so create them first;
# otherwise this cell fails with FileNotFoundError on a fresh checkout.
linksPicklePath.parent.mkdir(parents=True, exist_ok=True)
with open(linksPicklePath, mode="wb") as pkllinkfile:
    pickle.dump(LINKS, pkllinkfile)

In [20]:
# Display the set of link destinations that getAllPageLinks classified as external.
LINKS["external"]

{'http://192.168.13.19/moodle/',
 'http://besucherzaehler.co/',
 'http://ccet.acm.org/',
 'http://chandigarh.gov.in/',
 'http://crikc.puchd.ac.in/',
 'http://eakadamik.in/ccet/',
 'http://mhrd.gov.in/',
 'http://mhrdnats.gov.in/',
 'http://nkn.in/',
 'http://nptel.ac.in/',
 'http://op.niscair.res.in/',
 'http://phdadmissions.puchd.ac.in/',
 'http://puchd.ac.in/',
 'http://puleet.puchd.ac.in/',
 'http://puleet.puchd.ac.in/importantdates.php',
 'http://www.aicte-india.org/',
 'http://www.dst.gov.in/',
 'http://www.upsc.gov.in/',
 'https://ccet.acm.org/',
 'https://ccetchdlibrary.weebly.com/',
 'https://cdnbbsr.s3waas.gov.in/s3dd28e50635038e9cf3a648c2dd17ad0a/uploads/2022/08/2022082993.pdf',
 'https://drdo.gov.in/',
 'https://edu.google.com/teacher-center/products/classroom/?modal_active=none',
 'https://exams.puchd.ac.in/includes/noticeboard/2021/20210129172048-onlineexams-february2021.pdf?2021',
 'https://fitindia.gov.in/n',
 'https://forms.gle/VvhJ1qLziKtwEgJ57',
 'https://forms.gle/XZ