In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import re
import time

In [4]:
# Site root; relative hrefs scraped from pages are resolved against it.
BASE_URL = "https://en.wikipedia.org"

# Listing article whose wikitables are parsed for airports.
URL = BASE_URL + "/wiki/List_of_airports_in_India"

# HTTP headers sent with every request made by this notebook.
# NOTE(review): this is a browser-style User-Agent; Wikimedia's User-Agent
# policy may prefer a descriptive agent with contact details — confirm.
HEADERS = {
    "User-Agent": " ".join(
        (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/123.0.0.0 Safari/537.36",
        )
    )
}

# Exact, ordered column headers a wikitable must have to be parsed as an
# airport table (kept as a list: it is compared against a list of header texts).
EXPECTED_HEADERS = [
    "Area served", "Airport name",
    "IATA", "ICAO",
    "Airport type", "Operational",
    "Owned/operated by",
]

In [5]:
def resolve_url(href: str) -> "str | None":
    """
    Turn an href scraped from a page into an absolute URL.

    Parameters
    ----------
    href : str or None
        Raw ``href`` attribute value. May be absolute (``https://...``),
        protocol-relative (``//host/path``), site-relative (``/wiki/X``)
        or document-relative (``wiki/X``).

    Returns
    -------
    str or None
        The absolute URL, or None when ``href`` is empty/None.
    """
    # Local import so this cell does not depend on an edit to the import cell.
    from urllib.parse import urljoin

    if not href:
        return None

    href = href.strip()
    if not href:
        return None

    # Already absolute: return untouched. Checking the full scheme prefix
    # (not just "http") avoids treating a relative path such as "httpfoo"
    # as an absolute URL.
    if href.lower().startswith(("http://", "https://")):
        return href

    # Protocol-relative link: it names its own host, so it must not be
    # appended to BASE_URL.
    if href.startswith("//"):
        return "https:" + href

    # Everything else is resolved against the site root. urljoin inserts the
    # separating "/" that plain concatenation would miss for hrefs that do not
    # start with a slash.
    return urljoin(BASE_URL.rstrip("/") + "/", href)

def parse_airport_table(table):
    """
    Parse one <table> with the expected airport schema.

    Handles rowspan: a cell that spans several rows has its text carried down
    into the following rows. When the spanning cell is the "Airport name"
    column, the airport link is carried down with it, so continuation rows
    keep their "Airport URL" instead of getting None.

    Parameters
    ----------
    table : bs4 Tag
        The <table> element to parse.

    Returns
    -------
    list[dict] or None
        One dict per data row, keyed by the header texts plus "Airport URL".
        None if the header row does not match EXPECTED_HEADERS exactly, if the
        table has no rows, or if no non-empty data row was found.
    """

    # ---------- 1. Get headers ----------
    all_trs = table.find_all("tr")

    thead = table.find("thead")
    # A <thead> without any <tr> is treated like a missing <thead> rather than
    # crashing on ``None.find_all``.
    header_tr = thead.find("tr") if thead is not None else None

    if header_tr is not None:
        header_cells = header_tr.find_all("th")
        tbody = table.find("tbody")
        body_rows = tbody.find_all("tr") if tbody is not None else all_trs[1:]
    else:
        # No usable <thead>: use first <tr> as header, skip it in body
        if not all_trs:
            return None
        header_cells = all_trs[0].find_all(["th", "td"])
        body_rows = all_trs[1:]

    headers = [h.get_text(strip=True) for h in header_cells]

    # exact schema check
    if headers != EXPECTED_HEADERS:
        return None

    # ---------- 2. Parse body with rowspan ----------
    results = []

    n_cols = len(headers)
    carried = [None] * n_cols        # text to carry down, per column
    rowspan_left = [0] * n_cols      # how many more rows each carried text fills
    carried_url = None               # link of a row-spanning "Airport name" cell

    for tr in body_rows:
        cells = tr.find_all(["td", "th"])
        if not cells:
            continue

        row = {}
        airport_url = None
        remaining_cells = iter(cells)

        for col_idx, header in enumerate(headers):
            # An earlier rowspan fills this column; no cell is consumed.
            if rowspan_left[col_idx] > 0:
                row[header] = carried[col_idx]
                rowspan_left[col_idx] -= 1
                if header == "Airport name":
                    airport_url = carried_url
                continue

            cell = next(remaining_cells, None)
            if cell is None:
                # no more cells in this row
                row[header] = None
                continue

            text = cell.get_text(strip=True) or None

            if header == "Airport name":
                a_tag = cell.find("a", href=True)
                airport_url = resolve_url(a_tag["href"] if a_tag else None)

            row[header] = text

            # Missing, empty or non-numeric rowspan counts as a single row.
            try:
                span = int(cell.get("rowspan") or 1)
            except ValueError:
                span = 1

            if span > 1:
                carried[col_idx] = text
                rowspan_left[col_idx] = span - 1
                if header == "Airport name":
                    carried_url = airport_url

        # skip empty rows
        if all(v is None for v in row.values()):
            continue

        row["Airport URL"] = airport_url
        results.append(row)

    return results or None

In [6]:
# Fetch the listing page. requests applies no timeout unless one is given, so a
# stalled connection would hang the notebook; 20 s matches get_html().
resp = requests.get(URL, headers=HEADERS, timeout=20)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "lxml")

tables = soup.find_all("table", class_="wikitable")

# Collect rows from every wikitable whose header row matches the expected schema.
all_rows = []
for tbl in tables:
    parsed = parse_airport_table(tbl)
    if parsed:
        all_rows.extend(parsed)

# Fail here, with a clear message, rather than later with an unrelated KeyError:
# parse_airport_table() rejects any table whose headers differ from
# EXPECTED_HEADERS, so a change to the page layout yields zero rows.
if not all_rows:
    raise RuntimeError(
        f"No table on {URL} matched the expected headers {EXPECTED_HEADERS}"
    )

df = pd.DataFrame(all_rows)

# Put Airport URL next to Airport name
cols = df.columns.tolist()
if "Airport name" in cols and "Airport URL" in cols:
    cols.remove("Airport URL")
    idx = cols.index("Airport name") + 1
    cols.insert(idx, "Airport URL")
    df = df[cols]


In [7]:
# Preview the first rows of the parsed listing table.
df.head()

Unnamed: 0,Area served,Airport name,Airport URL,IATA,ICAO,Airport type,Operational,Owned/operated by
0,Donakonda,Donakonda Airport,https://en.wikipedia.org/wiki/Donakonda_Airport,,VODK,Domestic,No,AAI
1,Kadapa,Kadapa Airport,https://en.wikipedia.org/wiki/Kadapa_Airport,CDP,VOCP,Domestic,Yes,AAI
2,Kurnool,Uyyalawada Narasimha Reddy Airport,https://en.wikipedia.org/wiki/Kurnool_Airport,KJB,VOKU,Domestic,Yes,Government of Andhra Pradesh
3,Puttaparthi,Sri Sathya Sai Airport,https://en.wikipedia.org/wiki/Sri_Sathya_Sai_A...,PUT,VOPN,State/Private,Yes,Sri Sathya Sai Central Trust
4,Rajahmundry,Rajahmundry Airport,https://en.wikipedia.org/wiki/Rajahmundry_Airport,RJA,VORY,Domestic,Yes,AAI


In [8]:
def get_html(url: str) -> str:
    """
    Download ``url`` and return the response body as text.

    The request is sent with the notebook-wide HEADERS and a 20 second
    timeout. An HTTP error status raises via ``raise_for_status()``.
    """
    response = requests.get(url, timeout=20, headers=HEADERS)
    response.raise_for_status()
    return response.text


def parse_wgs84_from_infobox(infobox: BeautifulSoup):
    """
    Read decimal latitude/longitude from the "Coordinates" row of an infobox.

    Two representations are tried, in this order:
      1. <span class="geo-dec">, e.g. "19.08861°N 72.86806°E"
         (S and W hemispheres are returned as negative numbers);
      2. <span class="geo">, e.g. "19.08861; 72.86806".

    Returns (lat, lon) as floats, or (None, None) when the row is missing or
    neither representation can be parsed.
    """
    not_found = (None, None)

    # Locate the header cell labelled "Coordinates" and the value cell after it.
    label_cell = infobox.find("th", string=lambda s: s and "Coordinates" in s)
    value_cell = label_cell.find_next("td") if label_cell else None
    if not value_cell:
        return not_found

    # 1) Explicit decimal coordinates with optional hemisphere letters.
    dec_span = value_cell.find("span", class_="geo-dec")
    if dec_span:
        dec_text = dec_span.get_text(" ", strip=True)
        # Each match is (number, hemisphere-or-empty); the first two matches
        # are taken as latitude and longitude respectively.
        pairs = re.findall(r"([0-9.+-]+)\s*°?\s*([NnSsEeWw])?", dec_text)
        if len(pairs) >= 2:
            (lat_raw, lat_dir), (lon_raw, lon_dir) = pairs[0], pairs[1]
            try:
                lat, lon = float(lat_raw), float(lon_raw)
            except ValueError:
                # Unparseable number: fall through to the hidden span.
                pass
            else:
                if lat_dir in ("S", "s"):
                    lat = -lat
                if lon_dir in ("W", "w"):
                    lon = -lon
                return lat, lon

    # 2) Hidden machine-readable span holding "lat; lon".
    hidden_span = value_cell.find("span", class_="geo")
    if hidden_span:
        pieces = [piece.strip() for piece in hidden_span.get_text(strip=True).split(";")]
        if len(pieces) == 2:
            try:
                return float(pieces[0]), float(pieces[1])
            except ValueError:
                pass

    return not_found


def clean_infobox_value(cell: BeautifulSoup) -> str:
    """
    Return the readable text of an infobox <td> with whitespace collapsed.

    <sup class="reference"> elements are removed from ``cell`` itself first
    (the passed-in element is modified in place), so footnote markers do not
    end up in the text. Returns None when the cell has no text.
    """
    # Drop footnote markers before reading the text.
    for footnote in cell.find_all("sup", class_="reference"):
        footnote.decompose()

    raw = cell.get_text(" ", strip=True)
    if not raw:
        return None

    # Collapse every run of whitespace into a single space.
    return re.sub(r"\s+", " ", raw).strip()


def parse_infobox_fields(infobox: BeautifulSoup):
    """
    From the main infobox, extract:
      - owner
      - operator
      - location
      - WGS84 lat/lon

    Rows are matched on the exact label text ("Owner", "Operator",
    "Location"); if a label occurs more than once the last occurrence wins.

    Returns dict with keys:
      'lat', 'lon', 'infobox_owner', 'infobox_operator', 'infobox_location'
    (each value may be None when the infobox does not provide it).
    """
    lat, lon = parse_wgs84_from_infobox(infobox)

    label_to_key = {
        "Owner": "infobox_owner",
        "Operator": "infobox_operator",
        "Location": "infobox_location",
    }

    fields = {
        "lat": lat,
        "lon": lon,
        "infobox_owner": None,
        "infobox_operator": None,
        "infobox_location": None,
    }

    for row in infobox.find_all("tr"):
        th = row.find("th", class_="infobox-label")
        # class_="infobox-data" matches any cell carrying that class, including
        # cells declared as class="infobox-data label", so no separate lookup
        # for the multi-class form is needed.
        td = row.find("td", class_="infobox-data")

        # A row needs both a label and a value. Skipping here also avoids
        # calling get_text() on a missing label cell.
        if th is None or td is None:
            continue

        key = label_to_key.get(th.get_text(" ", strip=True))
        if key is not None:
            fields[key] = clean_infobox_value(td)

    return fields


def parse_statistics_from_infobox(infobox: BeautifulSoup):
    """
    Find infobox-subbox tables inside the infobox and extract:
      - Passengers
      - Aircraft movements
      - Cargo tonnage

    For each matching row, the text before the first "(" is taken and the
    first comma-grouped integer in it is kept as a string (commas preserved),
    e.g. "55,122,422 ( 4.4%)" -> "55,122,422". If a label occurs more than
    once, the last row with non-empty text wins.

    Returns dict with keys:
      'stats_passengers', 'stats_aircraft_movements', 'stats_cargo_tonnage'
    (each value is a string or None).
    """
    label_to_key = {
        "Passengers": "stats_passengers",
        "Aircraft movements": "stats_aircraft_movements",
        "Cargo tonnage": "stats_cargo_tonnage",
    }
    stats = dict.fromkeys(label_to_key.values())

    # An "infobox-subbox" table nested in the infobox
    for sub in infobox.find_all("table", class_="infobox-subbox"):
        for tr in sub.find_all("tr"):
            th = tr.find("th", class_="infobox-label")
            td = tr.find("td", class_="infobox-data")
            if th is None or td is None:
                continue

            key = label_to_key.get(th.get_text(" ", strip=True))
            if key is None:
                continue

            # Remove refs and collapse whitespace.
            value_text = clean_infobox_value(td)
            if not value_text:
                continue

            # Typically: "55,122,422 ( 4.4%)" -> keep "55,122,422".
            # maxsplit is passed by keyword: passing it positionally to
            # re.split is deprecated since Python 3.13.
            main_value = re.split(r"\(", value_text, maxsplit=1)[0].strip()

            # Digits with optional comma-separated groups. Requiring digits on
            # both sides of each comma keeps a stray or trailing "," from being
            # returned as (part of) the number.
            number = re.search(r"[0-9]+(?:,[0-9]+)*", main_value)
            stats[key] = number.group(0) if number else None

    return stats


def _empty_airport_record(error):
    """Return a record with every scraped field set to None and 'error' set to ``error``."""
    record = dict.fromkeys(
        (
            "lat",
            "lon",
            "infobox_owner",
            "infobox_operator",
            "infobox_location",
            "stats_passengers",
            "stats_aircraft_movements",
            "stats_cargo_tonnage",
        )
    )
    record["error"] = error
    return record


def scrape_airport_page(url: str) -> dict:
    """
    Given an airport Wikipedia URL, scrape:
      - lat, lon
      - owner, operator, location
      - passenger / movements / cargo stats (if present)

    Never raises for a failed download: the exception text is returned in the
    'error' field instead, so one bad page does not abort a batch run.

    Returns dict with keys 'lat', 'lon', 'infobox_owner', 'infobox_operator',
    'infobox_location', 'stats_passengers', 'stats_aircraft_movements',
    'stats_cargo_tonnage' and 'error'. Data fields may be None if missing;
    'error' is None on success, otherwise a short message.
    """
    try:
        html = get_html(url)
    except Exception as e:
        # Deliberately broad: any download failure becomes an error record.
        return _empty_airport_record(str(e))

    soup = BeautifulSoup(html, "lxml")

    # Pick the first infobox that is an airport infobox. Looking only at the
    # very first infobox on the page would miss the airport box whenever a
    # different infobox precedes it.
    infobox = next(
        (
            box
            for box in soup.find_all("table", class_="infobox")
            if "ib-airport" in box.get("class", [])
        ),
        None,
    )
    if infobox is None:
        # not an airport infobox, return empty
        return _empty_airport_record("no airport infobox")

    fields = parse_infobox_fields(infobox)
    fields.update(parse_statistics_from_infobox(infobox))
    fields["error"] = None
    return fields

In [9]:
# Example: df has columns ["Area served", "Airport name", "Airport URL", ...]
# We'll enrich it with infobox details scraped from each airport's own page.

# Columns produced for every airport, in output order.
info_columns = [
    "lat",
    "lon",
    "infobox_owner",
    "infobox_operator",
    "infobox_location",
    "stats_passengers",
    "stats_aircraft_movements",
    "stats_cargo_tonnage",
    "error",
]

results = []
# iterrows() is a generator with no length, so tqdm needs an explicit total
# to show real progress instead of a bare iteration counter.
for i, row in tqdm(df.iterrows(), total=len(df), desc="Scraping airport pages"):
    url = row["Airport URL"]
    if not isinstance(url, str) or not url.startswith("http"):
        # No usable link for this row: record an all-None entry.
        results.append({**dict.fromkeys(info_columns), "error": "no url"})
        continue

    info = scrape_airport_page(url)
    results.append(info)

    # Be polite to Wikipedia; small delay
    time.sleep(0.5)

# Passing the columns explicitly keeps the schema stable even when `results`
# is empty, so the lat/lon conversion below cannot fail on missing columns.
info_df = pd.DataFrame(results, columns=info_columns)

# Merge back into original df (rows are aligned by position)
df_enriched = pd.concat([df.reset_index(drop=True), info_df], axis=1)

# Convert lat/lon to float explicitly (they should already be)
df_enriched["lat"] = pd.to_numeric(df_enriched["lat"], errors="coerce")
df_enriched["lon"] = pd.to_numeric(df_enriched["lon"], errors="coerce")

0it [00:00, ?it/s]

In [None]:
# (rows, columns) of the enriched table.
df_enriched.shape

(220, 17)

In [None]:
# Preview the first rows of the enriched table.
df_enriched.head()

Unnamed: 0,Area served,Airport name,Airport URL,IATA,ICAO,Airport type,Operational,Owned/operated by,lat,lon,infobox_owner,infobox_operator,infobox_location,stats_passengers,stats_aircraft_movements,stats_cargo_tonnage,error
0,Donakonda,Donakonda Airport,https://en.wikipedia.org/wiki/Donakonda_Airport,,VODK,Domestic,No,AAI,15.82444,79.48667,,Airports Authority of India,"Donakonda , Andhra Pradesh",,,,
1,Kadapa,Kadapa Airport,https://en.wikipedia.org/wiki/Kadapa_Airport,CDP,VOCP,Domestic,Yes,AAI,14.51,78.77278,Airports Authority of India,Airports Authority of India,"Kadapa, Andhra Pradesh , India",52745.0,1557.0,,
2,Kurnool,Uyyalawada Narasimha Reddy Airport,https://en.wikipedia.org/wiki/Kurnool_Airport,KJB,VOKU,Domestic,Yes,Government of Andhra Pradesh,15.70611,78.16083,Andhra Pradesh Airports Development Corporatio...,,"Orvakal , Kurnool district , Andhra Pradesh , ...",39746.0,1311.0,,
3,Puttaparthi,Sri Sathya Sai Airport,https://en.wikipedia.org/wiki/Sri_Sathya_Sai_A...,PUT,VOPN,State/Private,Yes,Sri Sathya Sai Central Trust,14.14917,77.79111,,,Puttaparthi,,,,
4,Rajahmundry,Rajahmundry Airport,https://en.wikipedia.org/wiki/Rajahmundry_Airport,RJA,VORY,Domestic,Yes,AAI,17.11028,81.81833,,Airports Authority of India,"Madhurapudi, Rajamahendravaram, Andhra Pradesh...",489114.0,8441.0,25.0,


In [None]:
# Forward slashes work on Windows, macOS and Linux. The previous backslash
# path contained the invalid escape sequences "\d" and "\i" and, outside
# Windows, named a single file in the current directory instead of a file
# inside ../data.
df_enriched.to_csv("../data/indian_airports.csv", index=False)