In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import sqlite3
import re
import numpy as np
import tabulate
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit

In [None]:
# Download the department's faculty listing page and collect the profile URLs it links to.
print("Fetching main faculty page...")
# A timeout keeps the cell from hanging indefinitely if the server stops responding.
data = requests.get('https://www.iiserkol.ac.in/web/en/people/faculty/dbs/', timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
data.raise_for_status()
soup = BeautifulSoup(data.content, "html.parser")

# Collecting faculty profile URLs
# href=True skips anchors without an href attribute; indexing those would raise KeyError.
urls = [anchor['href'] for anchor in soup.find_all('a', href=True)]
urls = [url for url in urls if '/web/en/people/faculty/dbs/' in url]
link = 'https://www.iiserkol.ac.in'
# Site-relative hrefs get the host prefixed; hrefs that are already absolute are kept as-is.
# The set removes duplicates and sorted() makes the visiting order reproducible between runs.
faculty_websites = sorted({
    url if url.startswith(('http://', 'https://')) else link + url
    for url in urls
})
print(f"Found {len(faculty_websites)} faculty profile URLs.")

Fetching main faculty page...
Found 27 faculty profile URLs.


In [None]:
# Visit every faculty profile page and turn it into one dictionary per person.
all_data = []
print("Scraping individual faculty pages...")

# A stand-alone four-digit year in the range 1900-2099 (not part of a longer digit run).
YEAR_PATTERN = re.compile(r"(?<!\d)(?:19|20)\d{2}(?!\d)")
# Accepts the spellings "PhD", "phd", "Ph.D", "Ph. D", ...
PHD_PATTERN = re.compile(r"ph\.?\s*d", re.IGNORECASE)
# Seconds to wait for a profile page before giving up on it.
REQUEST_TIMEOUT = 30

for faculty_url in faculty_websites:
    try:
        data = requests.get(faculty_url, timeout=REQUEST_TIMEOUT)
        data.raise_for_status()
        soup = BeautifulSoup(data.content, "html.parser")

        # Get faculty name: it is read from the fourth <h3> element of the page.
        headings = [heading.text for heading in soup.find_all('h3')]
        faculty_name = headings[3].strip() if len(headings) > 3 else None

        if not faculty_name:
            print(f"Skipping URL (could not find name): {faculty_url}")
            continue

        print(f"Processing: {faculty_name}")

        # Initialize profile dictionary; fields stay None when the page lacks that section.
        profile_data = {
            "Name": faculty_name,
            "Positions": None,
            "Academic Background": None,
            "PhD": None,
            "PhD Year": None,
            "Research Interest": None,
            "Awards and Honors": None,
            "Number of awards": None,
        }

        # Get main profile content
        content = soup.find_all('div', class_='col-md-12 innerdiv')

        for section in content:
            section_text = section.get_text(separator="\n", strip=True)

            if 'Positions:' in section_text:
                profile_data["Positions"] = re.sub(r'Positions:\s+', '', section_text)

            elif 'Academic Background:' in section_text:
                background = re.sub(r'Academic Background:\s+', '', section_text)
                profile_data["Academic Background"] = background

                # The first year mentioned in the section is taken as the PhD year,
                # and the text in front of it as the PhD description.
                year_match = YEAR_PATTERN.search(background)
                if year_match:
                    profile_data["PhD Year"] = int(year_match.group())
                    if PHD_PATTERN.search(background):
                        # Slice the heading-free text rather than relying on a fixed
                        # character offset, then trim the separators left before the year.
                        phd_text = background[:year_match.start()].strip(" ,(\n")
                        profile_data["PhD"] = phd_text

            elif 'Research Interest:' in section_text:
                profile_data["Research Interest"] = re.sub(r'Research Interest:\s+', '', section_text)

            elif 'Awards and Honors:' in section_text:
                awards = re.sub(r'Awards and Honors:\s+', '', section_text)
                profile_data["Awards and Honors"] = awards
                # Heuristic: the number of awards is approximated by the number of
                # year mentions. Matching whole years avoids counting fragments such
                # as the "20" in "120" or counting "1920" twice.
                profile_data["Number of awards"] = len(YEAR_PATTERN.findall(awards))

        all_data.append(profile_data)

    except requests.exceptions.RequestException as e:
        # A page that cannot be fetched is reported and skipped; the rest still run.
        print(f"Error fetching URL {faculty_url}: {e}")

print("Scraping complete.")

Scraping individual faculty pages...
Processing: Partha Pratim Datta
Processing: Anindita Bhadra
Processing: Supratim Datta
Processing: Sunil Kumar Khare
Processing: Punyasloke Bhadury
Processing: Sreeramaiah Gangappa
Processing: Sumit Sen Santara
Processing: Arnab Gupta
Processing: Radhika Venkatesan
Processing: Annagiri Sumana
Processing: Bidisha Sinha
Processing: Robert John Chandran
Processing: Partho Sarothi Ray
Processing: Tapas Kumar Sengupta
Processing: Neelanjana Sengupta
Processing: Amit Kumar Mandal
Processing: Mohit Prasad
Processing: Dipjyoti Das
Processing: Malancha Ta
Processing: Babu Sudhamalla
Processing: Rupak Datta
Processing: Jayasri Das Sarma
Processing: Anuradha Bhat
Processing: Rituparna Sinha Roy
Processing: Rahul Das
Processing: Sankar Maiti
Processing: Amirul Islam Mallick
Scraping complete.


In [None]:
# Persist the scraped profiles into a local SQLite file, rebuilding the table each run.
print("Connecting to database and saving data...")
conn = sqlite3.connect("faculty_data.db")
cursor = conn.cursor()

# Discard any Faculty table left over from an earlier run.
cursor.execute("DROP TABLE IF EXISTS Faculty")

# Schema of the table that is created afresh.
create_table_sql = '''
CREATE TABLE Faculty (
    Name TEXT,
    Positions TEXT,
    Academic_Background TEXT,
    PhD TEXT,
    PhD_Year INTEGER,
    Research_Interest TEXT,
    Awards_and_Honors TEXT,
    Number_of_awards INTEGER
)
'''
cursor.execute(create_table_sql)

# Parameterised insert; one placeholder per column, in column order.
insert_sql = '''
    INSERT INTO Faculty (
        Name, Positions, Academic_Background, PhD, PhD_Year, Research_Interest,
        Awards_and_Honors, Number_of_awards
    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    '''

# Dictionary keys of a scraped profile, listed in the same order as the columns above.
profile_keys = (
    "Name",
    "Positions",
    "Academic Background",
    "PhD",
    "PhD Year",
    "Research Interest",
    "Awards and Honors",
    "Number of awards",
)

# Write one row per scraped profile.
for record in all_data:
    cursor.execute(insert_sql, tuple(record[key] for key in profile_keys))

# Make the inserts durable.
conn.commit()
print("Data saved to faculty_data.db")

Connecting to database and saving data...
Data saved to faculty_data.db


In [None]:
print("Loading data into pandas DataFrame...")
# Pull every row of the Faculty table back out of SQLite as a DataFrame.
faculty_query = "SELECT * FROM Faculty"
df = pd.read_sql_query(faculty_query, conn)

# The database connection is deliberately left open here (conn.close() is not called).

# Place names used to recognise an Indian PhD institution. Besides cities the
# list holds states and territories, a few institute names, and the generic
# tokens "India" and "Indian".
Indian_cities = [
    "Adilabad", "Adoni", "Agartala", "Agra", "Ahmedabad", "Ahmednagar", "Aizawl", "Ajmer", "Akola",
    "Aligarh", "Allahabad", "Alwar", "Amravati", "Ambattur", "Amritsar", "Anand", "Anantapur",
    "Andaman and Nicobar Islands", "Andhra Pradesh", "Arrah", "Arunachal Pradesh", "Asansol",
    "Aurangabad", "Aurangabad (Bihar)", "Avadi", "Bally", "Balasore", "Bareilly", "Baramati",
    "Baramulla", "Barasat", "Bardhaman", "Baripada", "Barmer", "Bathinda", "Begusarai", "Belgaum",
    "Bellary", "Bengaluru", "Berhampur", "Bhagalpur", "Bhavnagar", "Bhilai", "Bhilwara", "Bhiwani",
    "Bhopal", "Bhubaneswar", "Bhuj", "Bihar", "Bihar Sharif", "Bijapur", "Bilaspur", "Bilaspur (HP)",
    "BITS Goa", "BITS Hyderabad", "BITS Pilani", "Bokaro", "Bombay", "Brahmapur", "Burdwan",
    "Calcutta", "Chandigarh", "Chandrapur", "Chapra", "Chennai", "Chhattisgarh", "Chittoor", "Coimbatore",
    "Cuddalore", "Cuttack", "Darbhanga", "Darjeeling", "Davanagere", "Dadra and Nagar Haveli and Daman and Diu",
    "Dehradun", "Deoghar", "Delhi", "Dewas", "Dhanbad", "Dharwad", "Dhule", "Dibrugarh", "Dindigul",
    "Diu", "Dumka", "Durg", "Durgapur", "Eluru", "Erode", "Faridabad", "Farrukhabad", "Firozabad",
    "Gandhinagar", "Gaya", "Ghaziabad", "Giridih", "Goa", "Gopalganj", "Gorakhpur", "Gujarat",
    "Gulbarga", "Guntur", "Guwahati", "Gwalior", "Hamirpur", "Haldia", "Haldwani", "Hapur", "Haridwar",
    "Haryana", "Hazaribagh", "Himachal Pradesh", "Hindupur", "Hoshangabad", "Howrah", "Hubli-Dharwad",
    "Hosur", "Hyderabad", "Ichalkaranji", "IIIT Allahabad", "IIIT Hyderabad", "IISc", "IISER Berhampur",
    "IISER Bhopal", "IISER Kolkata", "IISER Mohali", "IISER Pune", "IISER Tirupati", "Imphal",
    "Indore", "India", "Indian", "Jabalpur", "Jagdalpur", "Jadavpur", "Jaipur", "Jaisalmer", "Jalgaon",
    "Jalna", "Jalpaiguri", "Jammu and Kashmir", "Jamnagar", "Jamui", "Jamshedpur", "Jhansi", "Jharkhand",
    "Jhunjhunu", "JNU", "Jodhpur", "Junagadh", "Kadapa", "Kakinada", "Kalyan-Dombivli", "Kalyani",
    "Kanchipuram", "Kanpur", "Kanyakumari", "Karaikal", "Karimnagar", "Karur", "Karnal", "Karnataka",
    "Kathua", "Khammam", "Kharagpur", "Kochi", "Kolhapur", "Kolkata", "Kollam", "Korba", "Kota",
    "Kothagudem", "Kozhikode", "Kulti", "Kumbakonam", "Kurnool", "Ladakh", "Lakshadweep", "Latur",
    "Loni", "Lovely Professional University", "Lucknow", "Ludhiana", "Machilipatnam", "Madurai", "Madras",
    "Mahbubnagar", "Maheshtala", "Malda", "Malegaon", "Mangalore", "Manipal", "Manipur", "Mandi",
    "Mathura", "Mau", "Meerut", "Meghalaya", "Moradabad", "Morena", "Mumbai", "Muzaffarnagar",
    "Muzaffarpur", "Mysuru", "Nagaland", "Nagpur", "Nainital", "Nanded", "Nashik", "Navi Mumbai",
    "Nellore", "New Delhi", "NIT Calicut", "NIT Surathkal", "NIT Trichy", "NIT Warangal", "Nizamabad",
    "Noida", "Odisha", "Ongole", "Ozhukarai", "Palakkad", "Panihati", "Panipat", "Parbhani", "Patiala",
    "Patna", "Puducherry", "Pune", "Purnia", "Punjab", "Rajasthan", "Raiganj", "Raipur", "Rajahmundry",
    "Rajkot", "Ranchi", "Rampur", "Ratlam", "Ratnagiri", "Rewa", "Rishikesh", "Ropar", "Roorkee",
    "Rourkela", "Sagar", "Salem", "Samastipur", "Sambalpur", "Sangli", "Satara", "Satna", "Saharanpur",
    "Shahjahanpur", "Shimla", "Shillong", "Shimoga", "Sikar", "Silchar", "Siliguri", "Siwan", "Solan",
    "Solapur", "Sonipat", "South Dumdum", "Srikakulam", "Srinagar", "SRM Chennai", "Surat", "Tamil Nadu",
    "Tanjavur", "Telangana", "Thanjavur", "Thiruvananthapuram", "Thrissur", "Tiruchirappalli", "Tirunelveli",
    "Tirupati", "Tiruppur", "Tripura", "Tumkur", "Udaipur", "Ujjain", "Ulhasnagar", "Una", "Uttar Pradesh",
    "Uttarakhand", "Vadodara", "Varanasi", "Vasai-Virar", "Vellore", "Vijayawada", "Visakhapatnam",
    "Vizianagaram", "VIT Vellore", "Warangal", "Wardha", "West Bengal",
]

# Function to classify PhD institution
def institution(phd_string):
    """Classify a PhD description as obtained in "India" or "Abroad".

    Parameters
    ----------
    phd_string : str or None
        Free-text PhD description, e.g. "PhD (Biochemistry), IIT Delhi".

    Returns
    -------
    str or None
        "India" when any entry of ``Indian_cities`` occurs in the text as a
        whole word or phrase, "Abroad" when none does, and None when the
        value is missing (None, NaN) or blank.
    """
    # Missing values must stay unknown rather than be counted as "Abroad".
    if phd_string is None or pd.isna(phd_string):
        return None
    text = str(phd_string).strip()
    if not text:
        return None
    for place in Indian_cities:
        # Whole-word match: a plain substring test would let "Indian" match
        # "Indiana University" and short names match inside longer words.
        # Lookarounds are used instead of \b so entries ending in ")" also work.
        if re.search(r"(?<!\w)" + re.escape(place) + r"(?!\w)", text):
            return "India"
    return "Abroad"

# Apply the function to create a new column
df['PhD_Institution'] = df['PhD'].apply(institution)

# Create filtered DataFrames for further analysis
# Rows whose PhD location could be classified.
df_filtered = df[df['PhD_Institution'].notna()]
# Rows that have both an award count and a PhD year.
df_yrAw = df.dropna(subset=["Number_of_awards", "PhD_Year"])

# Prepare data for potential plotting or curve fitting
years = df_yrAw["PhD_Year"]
Aws = df_yrAw["Number_of_awards"]

print("Analysis complete. DataFrame 'df' is ready.")
print("\nDataFrame Head:")

# Show only the first rows so the output matches the "Head" label above and the
# whole table is not dumped; left as a bare last expression for rich display.
df.head(10)


Loading data into pandas DataFrame...
Analysis complete. DataFrame 'df' is ready.

DataFrame Head:


Unnamed: 0,Name,Positions,Academic_Background,PhD,PhD_Year,Research_Interest,Awards_and_Honors,Number_of_awards,PhD_Institution
0,Partha Pratim Datta,"Professor, IISER Kolkata (current)\nAssoc...","PhD (Molecular Biology), IICB (Jadavpur Univer...","PhD (Molecular Biology), IICB (Jadavpur Univer...",2002,We are studying the molecular basis of Antimic...,Sir JC Bose Memorial Lecture\nfrom BCKV and DB...,3.0,India
1,Anindita Bhadra,"Professor, IISER Kolkata (current)\nEdito...","PhD (Animal Behaviour), Centre for Ecological ...","PhD (Animal Behaviour), Centre for Ecological ...",2008,I work on the behavioural ecology of the India...,Editorial Board member\nfrom Animal Behavior a...,24.0,India
2,Supratim Datta,"Professor, IISER Kolkata (current)\nAssoc...","Ph.D. (Chemistry), Boston University, Boston, ...","Ph.D. (Chemistry), Boston University, Boston",2005,The research in my laboratory focuses on engin...,Ramanujan Fellowship\nfrom Government of India...,2.0,Abroad
3,Sunil Kumar Khare,"Professor, IISER Kolkata (current)\nProfe...","PhD (Biochemistry), IIT Delhi, 1990","PhD (Biochemistry), IIT Delhi",1990,"Extremophiles and their Enzymes, Antimicrobial...","President, Association of Microbiologists of I...",10.0,India
4,Punyasloke Bhadury,"Professor, IISER Kolkata (current)\nScien...","Ph.D. (Biological Sciences), Plymouth Marine L...","Ph.D. (Biological Sciences), Plymouth Marine L...",2005,We study the biological complexity and intrins...,Associate Editor\nfrom Ecological Solutions an...,10.0,Abroad
5,Sreeramaiah Gangappa,"Assistant Professor, IISER Kolkata ( - )\nMari...","PhD (Plant Molecular Biology), NIPGR (JNU, New...","PhD (Plant Molecular Biology), NIPGR (JNU, New...",2009,"The ability to perceive, integrate, and proces...",Ramalingaswami Re-entry Felloship\nfrom Depart...,2.0,India
6,Sumit Sen Santara,"Assistant Professor, IISER Kolkata (curre...","Ph.D. (Biochemistry), Indian Institute of chem...","Ph.D. (Biochemistry), Indian Institute of chem...",2014,Innate immune cells evolve early and are prese...,INSA Young Scientist Award\nfrom Indian Nation...,5.0,India
7,Arnab Gupta,"Associate Professor, IISER Kolkata (curre...","PhD (Human Genetics), CSIR-Indian Inst of Chem...","PhD (Human Genetics), CSIR-Indian Inst of Chem...",2007,"Cell biology, membrane trafficking, eukaryotic...",,,India
8,Radhika Venkatesan,"Associate Professor, IISER Kolkata (curre...","PhD (Chemical Ecology), Max Planck Institute f...","PhD (Chemical Ecology), Max Planck Institute f...",2010,My lab primarily works on plant-insect interac...,Early Career Award\nfrom SERB (2016)\nRamanuja...,2.0,Abroad
9,Annagiri Sumana,"Professor, IISER Kolkata ( - )\nChairman - Dep...","PhD (Dominance Hierarchy in a social wasp), In...","PhD (Dominance Hierarchy in a social wasp), In...",2002,EvolutionEcologyAnimal Behaviour,Emerging nations award\nfrom Animal Behaviour ...,4.0,India
