In [1]:
import re
import sqlite3
from urllib.parse import urljoin

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import tabulate
from bs4 import BeautifulSoup
from scipy.optimize import curve_fit

In [2]:
# Fetch the faculty listing page and build the list of profile URLs that the
# next cell scrapes one by one.
print("Fetching main faculty page...")
# A timeout keeps the notebook from hanging forever on an unresponsive server.
data = requests.get('https://www.iiserkol.ac.in/web/en/people/faculty/dbs/', timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page and silently
# reporting "Found 0 faculty profile URLs".
data.raise_for_status()
soup = BeautifulSoup(data.content, "html.parser")

# Collecting faculty profile URLs
# href=True skips <a> tags that have no href attribute (e.g. named anchors),
# which would otherwise raise KeyError.
urls = [anchor['href'] for anchor in soup.find_all('a', href=True)]
urls = [url for url in urls if '/web/en/people/faculty/dbs/' in url]
link = 'https://www.iiserkol.ac.in'
# urljoin copes with both site-relative and already-absolute hrefs, where plain
# string concatenation would produce a malformed URL for the latter.
# The set removes duplicates; sorting makes the scrape order reproducible
# across kernel restarts.
faculty_websites = sorted({urljoin(link, url) for url in urls})
print(f"Found {len(faculty_websites)} faculty profile URLs.")

Fetching main faculty page...
Found 27 faculty profile URLs.


In [3]:
# Scrape every faculty profile page listed in `faculty_websites` and collect one
# dict per faculty member in `all_data`.

REQUEST_TIMEOUT_S = 30  # seconds; keeps one unresponsive page from stalling the whole loop

# The faculty name is read from the fourth <h3> on the page.
# NOTE(review): presumably the earlier <h3> tags belong to the site layout — confirm
# against the live page if the markup changes.
NAME_H3_INDEX = 3

# Section labels as they appear (followed by ':') in the profile page text.
# They double as the keys of the profile dict. Order matters: the first label
# found in a section decides how that section is stored.
SECTION_LABELS = (
    "Positions",
    "Academic Background",
    "Research Interest",
    "Awards and Honors",
)

# A four-digit year in 1900-2099 that is not part of a longer run of digits.
YEAR_RE = re.compile(r'(?<!\d)(?:19|20)\d{2}(?!\d)')
# Matches "PhD", "phd", "Ph.D", "Ph. D", "PHD", ...
PHD_RE = re.compile(r'ph\.?\s*d', re.IGNORECASE)


def _find_faculty_name(page):
    """Return the stripped text of the name heading, or None if the page has too few <h3> tags."""
    headings = [h3.text for h3 in page.find_all('h3')]
    if len(headings) <= NAME_H3_INDEX:
        return None
    return headings[NAME_H3_INDEX].strip()


def _extract_phd(background):
    """Return ``(phd_text, phd_year)`` parsed from an 'Academic Background' body.

    ``phd_year`` is the first year found in the text (None if there is none).
    ``phd_text`` is everything before that year, but only when the text
    mentions a PhD in some spelling; otherwise it is None.
    """
    year_match = YEAR_RE.search(background)
    if year_match is None:
        return None, None

    phd_text = None
    if PHD_RE.search(background):
        # Keep what precedes the year and drop the trailing separators.
        phd_text = background[:year_match.start()].strip(" ,\n") or None
    return phd_text, year_match.group()


def _count_award_years(awards_text):
    """Count the years mentioned in the awards text.

    This is a heuristic stand-in for the number of awards: each year occurrence
    counts once, so a range such as "2015-2018" counts as two.
    """
    return len(YEAR_RE.findall(awards_text))


def _parse_profile(page, faculty_name):
    """Build the profile dict for one faculty page.

    Every field other than "Name" stays None unless the matching section is
    found on the page.
    """
    profile = {
        "Name": faculty_name,
        "Positions": None,
        "Academic Background": None,
        "PhD": None,
        "PhD Year": None,
        "Research Interest": None,
        "Awards and Honors": None,
        "Number of awards": None,
    }

    for section in page.find_all('div', class_='col-md-12 innerdiv'):
        section_text = section.get_text(separator="\n", strip=True)

        label = next((lbl for lbl in SECTION_LABELS if f"{lbl}:" in section_text), None)
        if label is None:
            continue

        # Store the section without its "<label>:" heading.
        body = re.sub(re.escape(f"{label}:") + r'\s+', '', section_text)
        profile[label] = body

        if label == "Academic Background":
            profile["PhD"], profile["PhD Year"] = _extract_phd(body)
        elif label == "Awards and Honors":
            profile["Number of awards"] = _count_award_years(body)

    return profile


all_data = []
print("Scraping individual faculty pages...")

for faculty_url in faculty_websites:
    # Only the network call is guarded, so parsing bugs are not mistaken for
    # fetch failures.
    try:
        response = requests.get(faculty_url, timeout=REQUEST_TIMEOUT_S)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {faculty_url}: {e}")
        continue

    page = BeautifulSoup(response.content, "html.parser")

    faculty_name = _find_faculty_name(page)
    if not faculty_name:
        print(f"Skipping URL (could not find name): {faculty_url}")
        continue

    print(f"Processing: {faculty_name}")
    all_data.append(_parse_profile(page, faculty_name))

print("Scraping complete.")

Scraping individual faculty pages...
Processing: Partha Pratim Datta
Processing: Anindita Bhadra
Processing: Supratim Datta
Processing: Sunil Kumar Khare
Processing: Punyasloke Bhadury
Processing: Sreeramaiah Gangappa
Processing: Sumit Sen Santara
Processing: Arnab Gupta
Processing: Radhika Venkatesan
Processing: Annagiri Sumana
Processing: Bidisha Sinha
Processing: Robert John Chandran
Processing: Partho Sarothi Ray
Processing: Tapas Kumar Sengupta
Processing: Neelanjana Sengupta
Processing: Amit Kumar Mandal
Processing: Mohit Prasad
Processing: Dipjyoti Das
Processing: Malancha Ta
Processing: Babu Sudhamalla
Processing: Rupak Datta
Processing: Jayasri Das Sarma
Processing: Anuradha Bhat
Processing: Rituparna Sinha Roy
Processing: Rahul Das
Processing: Sankar Maiti
Processing: Amirul Islam Mallick
Scraping complete.
