In [1]:
from bs4 import BeautifulSoup
import import_ipynb
import requests
import os
import re
from tqdm import tqdm
from image_data_extract import call_func as image_extract

def sanitize_folder_name(name):
    """Turn an arbitrary string into a name usable as a folder name.

    Line breaks and any other runs of whitespace are collapsed to a single
    space, leading/trailing whitespace is trimmed, and finally the
    characters ``< > : " / \\ | ? *`` are deleted.

    Args:
        name: Raw text to clean up.

    Returns:
        The cleaned string.
    """
    without_breaks = re.sub(r'[\r\n]+', ' ', name)
    collapsed = re.sub(r'\s+', ' ', without_breaks).strip()
    # Deleting the characters happens last, so spaces on either side of a
    # removed character are left as they are (not re-collapsed).
    forbidden = str.maketrans('', '', '<>:"/\\|?*')
    return collapsed.translate(forbidden)

def creating_required_directories(clg_dir_name, output_dir):
    """Ensure a college's directory exists and return its image path.

    Args:
        clg_dir_name: Name of the college sub-directory.
        output_dir: Parent directory the college directory lives in.

    Returns:
        Path of ``parms.png`` inside ``output_dir/clg_dir_name``. Only the
        directory is created; the file itself is not.
    """
    college_dir = os.path.join(output_dir, clg_dir_name)
    # exist_ok makes repeated calls for the same college harmless.
    os.makedirs(college_dir, exist_ok=True)
    image_path = os.path.join(college_dir, 'parms.png')
    return image_path

def extract_clg_name(text_string):
    """Pull the college name out of the text of a table cell.

    Everything from the first ``|`` onward is dropped, then everything from
    the first occurrence of ``More`` onward, and the remainder is stripped
    of surrounding whitespace.

    Args:
        text_string: Raw text of the cell.

    Returns:
        The leading portion of the text, trimmed.
    """
    before_pipe, _, _ = text_string.partition('|')
    before_more, _, _ = before_pipe.partition('More')
    return before_more.strip()

def get_image_url(html_tag):
    """Return the first image link under ``/graph/`` found in ``html_tag``.

    Scans the ``<a>`` elements returned by ``html_tag.find_all("a")`` and
    picks the first ``href`` that contains ``/graph/`` and ends (case
    insensitively) with ``.jpg``, ``.jpeg`` or ``.png``.

    Args:
        html_tag: Object exposing ``find_all``; its results must expose
            ``get``.

    Returns:
        The matching ``href`` value, or None when nothing matches or any
        error occurs while searching.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png')
    try:
        links = (anchor.get('href') for anchor in html_tag.find_all("a"))
        return next(
            (
                link
                for link in links
                if link
                and '/graph/' in link
                and link.lower().endswith(image_suffixes)
            ),
            None,
        )
    except Exception:
        # Best effort: a malformed tag is treated the same as "no image".
        return None

def download_file(url, file_path, timeout=30):
    """Download ``url`` and save the response body to ``file_path``.

    Args:
        url: Address to fetch. A falsy value (``None`` or ``""``) is
            rejected immediately, without any network request.
        file_path: Destination path; the body is written in binary mode.
        timeout: Seconds handed to ``requests.get`` as its timeout, so a
            server that stops responding cannot hang the caller.

    Returns:
        True if the server answered with HTTP 200 and the body was written
        to ``file_path``; False otherwise (no URL, non-200 status, network
        error, or failure to write the file).
    """
    if not url:
        return False
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            return False
        with open(file_path, 'wb') as file:
            file.write(response.content)
    except (requests.RequestException, OSError):
        # Only network and filesystem failures are treated as "download
        # failed"; KeyboardInterrupt and programming errors propagate.
        return False
    return True

def main(year):
    """Download the parameter image of every college listed for ``year``.

    Fetches the engineering ranking page for ``year``, walks rows of the
    table with id ``tbl_overall`` and, for each college, creates
    ``output{year}/<college>/`` and saves the linked image there as
    ``parms.png``. Colleges whose image file already exists are skipped.
    Problems are reported with ``print``; nothing is raised to the caller.

    Args:
        year: Ranking year, used in both the page URL and the name of the
            output directory.
    """
    output_dir = f'output{year}'
    os.makedirs(output_dir, exist_ok=True)

    url = f'https://www.nirfindia.org/Rankings/{year}/EngineeringRanking.html'
    print(f"\nProcessing year {year}: {url}")

    try:
        # A timeout keeps an unresponsive server from stalling the whole run.
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find(id="tbl_overall")
        if not table:
            print("No ranking table found for this year.")
            return

        # Every third row, starting from the second one. Presumably each
        # college spans three rows after a header row -- confirm against
        # the page markup.
        college_tag_list = table.find_all("tr")[1::3]
        for college in tqdm(college_tag_list, desc=f"Year {year}"):
            # Keep element children only; text nodes have ``name`` None.
            children = [c for c in college.children if c.name is not None]
            if len(children) < 2:
                # Unexpected row layout: skip it instead of aborting the year.
                continue

            name_cell = children[1]
            clg_name = extract_clg_name(name_cell.text)
            clg_dir_name = sanitize_folder_name(clg_name)
            # The college directory is created up front, before it is known
            # whether an image is available for it.
            image_file_path = creating_required_directories(clg_dir_name, output_dir)

            if os.path.exists(image_file_path):
                continue

            # Check for a missing link *before* attempting the download so
            # it is not misreported as a download failure.
            image_url = get_image_url(name_cell)
            if not image_url:
                print(f"Skipping {clg_name}: No image URL found.")
                continue

            if not download_file(image_url, image_file_path):
                print(f"Skipping {clg_name}: Failed to download image.")

    except Exception as e:
        # Top-level boundary for one year: report and let the caller go on.
        print(f"Error occurred for year {year}: {e}")

if __name__ == '__main__':
    # Ranking years to process, in order. NOTE(review): 2020 is not in the
    # list -- confirm whether that is intentional.
    years = [2018, 2019, 2021, 2022, 2023, 2024, 2025]
    for year in years:
        # Download the images for the year first, then hand the same year
        # to the imported image_extract step.
        main(year)
        image_extract(year)



Processing year 2018: https://www.nirfindia.org/Rankings/2018/EngineeringRanking.html


Year 2018: 100%|██████████| 100/100 [00:00<00:00, 1956.12it/s]


Skipping Indian Institute of Engineering Science and Technology, Shibpur: Failed to download image.
Skipping Indian Institute of Technology Mandi: Failed to download image.
⚠️ Skipping Indian Institute of Engineering Science and Technology, Shibpur: No image found.
⚠️ Skipping Indian Institute of Technology Mandi: No image found.
Processed for year 2018


Processing year 2019: https://www.nirfindia.org/Rankings/2019/EngineeringRanking.html


Year 2019: 100%|██████████| 200/200 [00:00<00:00, 1584.51it/s]


❌ OCR failed for Jain university
❌ OCR failed for Karunya Institute of Technology and Sciences
Processed for year 2019


Processing year 2021: https://www.nirfindia.org/Rankings/2021/EngineeringRanking.html


Year 2021: 100%|██████████| 200/200 [00:00<00:00, 3383.02it/s]


Processed for year 2021


Processing year 2022: https://www.nirfindia.org/Rankings/2022/EngineeringRanking.html


Year 2022: 100%|██████████| 200/200 [00:00<00:00, 2277.59it/s]


Processed for year 2022


Processing year 2023: https://www.nirfindia.org/Rankings/2023/EngineeringRanking.html


Year 2023: 100%|██████████| 100/100 [00:00<00:00, 2476.36it/s]


Processed for year 2023


Processing year 2024: https://www.nirfindia.org/Rankings/2024/EngineeringRanking.html


Year 2024: 100%|██████████| 100/100 [00:00<00:00, 3473.08it/s]


Processed for year 2024


Processing year 2025: https://www.nirfindia.org/Rankings/2025/EngineeringRanking.html


Year 2025: 100%|██████████| 100/100 [00:25<00:00,  3.87it/s]


Processed for year 2025

