<a href="https://colab.research.google.com/github/kavyajeetbora/foursquare_ai/blob/master/notebooks/15.railways.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Additional details for each railway station (district, state, number of trains passing through) are scraped from the RailYatri station directory, for example:

https://www.railyatri.in/stations?name=L&page=10

In [None]:
import duckdb
import requests
import pandas as pd
from bs4 import BeautifulSoup
from typing import Optional
from tqdm.notebook import tqdm
import time
import string

## Extract The Railway Station Locations

In [None]:
%%shell

# Download the Geofabrik OpenStreetMap extract for India (.osm.pbf) and save it
# as india-latest.osm.pbf in the current working directory.
wget https://download.geofabrik.de/asia/india-latest.osm.pbf -O india-latest.osm.pbf

In [None]:
# Pull railway-station nodes out of the local OSM extract with DuckDB's spatial
# extension. Each requested tag is looked up in the `tags` map and unnested into
# its own column. Only nodes tagged both railway=station and
# public_transport=station are kept.
query = """
SELECT
    id,
    unnest(map_extract(tags, 'name'))              AS name,
    unnest(map_extract(tags, 'railway'))           AS railway,
    unnest(map_extract(tags, 'public_transport'))  AS public_transport,
    unnest(map_extract(tags, 'ref')) AS station_code,
    unnest(map_extract(tags, 'network')) AS network,
    unnest(map_extract(tags, 'internet_access')) AS internet_access,
    lat,
    lon
FROM ST_READOSM('india-latest.osm.pbf')
WHERE kind = 'node'
AND 'station' IN map_extract(tags, 'railway')
AND 'station' IN map_extract(tags, 'public_transport')
"""

con = duckdb.connect()
try:
    con.execute("INSTALL httpfs;")
    con.execute("LOAD httpfs;")
    con.execute("INSTALL spatial;")
    con.execute("LOAD spatial;")

    df = con.execute(query).df()
finally:
    # Always release the connection, even if an extension fails to load or the
    # query raises.
    con.close()

In [None]:
# Preview the first rows of the station nodes extracted from the OSM file.
df.head()

In [None]:
# (row count, column count) of the extracted station table.
df.shape

## Extract more info on Railway Stations

In [None]:
# Initial letters A-Z; each one is later sent as the `name` query parameter.
letters = [*string.ascii_uppercase]


# Listing endpoint and the browser-like User-Agent sent with every request.
BASE_URL = "https://www.railyatri.in/stations"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/120.0 Safari/537.36",
}


def _get_soup(letter: str, page: int) -> BeautifulSoup:
    """Download one listing page and return it parsed.

    Sends a GET to ``BASE_URL`` with ``name=letter`` and ``page=page`` using
    ``HEADERS`` and a 30 second timeout. Raises ``requests.HTTPError`` when the
    response status is an error.
    """
    response = requests.get(
        BASE_URL,
        params={"name": letter, "page": page},
        headers=HEADERS,
        timeout=30,
    )
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")


# Default column layout of the station listing. Used whenever a page has no
# usable table, or its header row does not have exactly these five columns.
_STATION_COLUMNS = [
    "Station Code",
    "Station Name",
    "District",
    "State",
    "Trains passing through",
]


def _empty_station_table() -> pd.DataFrame:
    """Return an empty DataFrame carrying the default station columns."""
    return pd.DataFrame(columns=_STATION_COLUMNS)


def _parse_station_table(soup: BeautifulSoup) -> pd.DataFrame:
    """
    Extract the stations table from an already-parsed listing page.

    Looks for ``div.stationTable > table.stn-dir-list-tbl``. Returns an empty
    DataFrame with the default columns when either element is missing.
    Column names come from the page's own ``<thead>`` when it has exactly five
    header cells; otherwise the default names are used. Rows with fewer than
    five ``<td>`` cells are skipped.
    """
    container = soup.find("div", class_="stationTable")
    if container is None:
        return _empty_station_table()

    table = container.find("table", class_="stn-dir-list-tbl")
    if table is None:
        return _empty_station_table()

    # Headers: tolerate a missing <thead>/<tr> instead of raising AttributeError.
    thead = table.find("thead")
    header_row = thead.find("tr") if thead is not None else None
    headers = (
        [th.get_text(strip=True) for th in header_row.find_all("th")]
        if header_row is not None
        else []
    )
    if len(headers) != len(_STATION_COLUMNS):
        # Every row is built with five values, so the header must match.
        headers = list(_STATION_COLUMNS)

    tbody = table.find("tbody")
    if tbody is None:
        return pd.DataFrame(columns=headers)

    data_rows = []
    for tr in tbody.find_all("tr"):
        cols = tr.find_all("td")
        if len(cols) < len(_STATION_COLUMNS):
            # Header-only, spacer or otherwise incomplete row.
            continue

        # The station name is normally wrapped in a link; fall back to the
        # cell's own text when it is not.
        name_cell = cols[1]
        name_link = name_cell.find("a")
        name_source = name_link if name_link else name_cell

        data_rows.append(
            [
                cols[0].get_text(strip=True),      # station code
                name_source.get_text(strip=True),  # station name
                cols[2].get_text(strip=True),      # district
                cols[3].get_text(strip=True),      # state
                cols[4].get_text(strip=True),      # trains passing through
            ]
        )

    return pd.DataFrame(data_rows, columns=headers)


def fetch_station_table(letter: str, page: int) -> pd.DataFrame:
    """
    Fetch the stations table for a given starting letter and page number.

    Returns a DataFrame with:
    ['Station Code', 'Station Name', 'District', 'State', 'Trains passing through']
    (all values are the stripped cell text). The frame is empty when the page
    has no stations table.
    """
    return _parse_station_table(_get_soup(letter, page))


def _get_last_page_from_soup(soup: BeautifulSoup) -> int:
    """
    Parse the pagination block and return the last page number.

    Returns 1 when there is no ``div.pagination`` or it has no numeric links.
    """
    pag_div = soup.find("div", class_="pagination")
    if pag_div is None:
        # No pagination, likely only 1 page (or no results)
        return 1

    link_texts = (a.get_text(strip=True) for a in pag_div.find_all("a"))
    # Non-numeric links (e.g. next/previous controls) are ignored.
    return max((int(text) for text in link_texts if text.isdigit()), default=1)


def fetch_all_stations_for_letter(letter: str) -> pd.DataFrame:
    """
    Fetch the full stations table for all pages for a given starting letter.

    Page 1 is downloaded once. The same parsed page supplies both the page
    count and the first table, and pages 2..last are then fetched one by one.
    Returns a single DataFrame with a fresh 0..n-1 index, or an empty frame
    with the default columns when the letter has no stations table.
    """
    soup_page1 = _get_soup(letter, 1)

    # If there's no table at all for this letter, return empty
    if soup_page1.find("div", class_="stationTable") is None:
        return _empty_station_table()

    last_page = _get_last_page_from_soup(soup_page1)

    # Reuse the soup we already have instead of requesting page 1 again.
    frames = [_parse_station_table(soup_page1)]
    frames.extend(
        fetch_station_table(letter, page) for page in range(2, last_page + 1)
    )

    return pd.concat(frames, ignore_index=True)

In [None]:
import time
import random
import requests

def get_with_backoff(url, params=None, headers=None,
                     max_retries=5,
                     base_sleep=2.0,
                     max_sleep=30.0):
    """
    GET ``url``, retrying with exponential backoff on failure.

    Two kinds of failure are retried and share one retry budget:

    * HTTP 429: sleeps for the server's ``Retry-After`` value when it is a
      valid non-negative number of seconds, otherwise for the exponential
      backoff delay.
    * Any ``requests.RequestException`` (connection errors, timeouts, other
      HTTP error statuses): sleeps for the exponential backoff delay.

    The backoff delay is ``base_sleep * 2 ** (attempt - 1)`` capped at
    ``max_sleep``; up to one second of random jitter is added to every sleep.

    Args:
        url: Target URL.
        params: Optional query parameters passed to ``requests.get``.
        headers: Optional request headers passed to ``requests.get``.
        max_retries: Number of retries allowed after the first attempt.
        base_sleep: Backoff delay in seconds for the first retry.
        max_sleep: Upper bound in seconds for the backoff delay.

    Returns:
        The successful ``requests.Response``.

    Raises:
        requests.RequestException: The last error once retries are exhausted
            (``requests.HTTPError`` when the final response was an error
            status, including 429).
    """
    attempt = 0
    while True:
        try:
            resp = requests.get(url, params=params, headers=headers, timeout=30)
            if resp.status_code != 429:
                resp.raise_for_status()
                return resp
        except requests.RequestException as e:
            attempt += 1
            if attempt > max_retries:
                raise
            sleep_time = min(max_sleep, base_sleep * (2 ** (attempt - 1)))
            sleep_time += random.uniform(0, 1.0)
            print(f"Request error: {e}. Sleeping {sleep_time:.1f}s before retry {attempt}/{max_retries}...")
            time.sleep(sleep_time)
            continue

        # Only HTTP 429 (too many requests) reaches this point.
        attempt += 1
        if attempt > max_retries:
            # Always raises HTTPError for a 429 response.
            resp.raise_for_status()

        sleep_time = min(max_sleep, base_sleep * (2 ** (attempt - 1)))
        retry_after = resp.headers.get("Retry-After")
        if retry_after is not None:
            # Retry-After may be an HTTP-date rather than seconds; float()
            # rejects that form, so the exponential delay is kept instead.
            try:
                requested = float(retry_after)
            except (TypeError, ValueError):
                requested = None
            # Also rejects negative, NaN and infinite values, which
            # time.sleep() cannot handle.
            if requested is not None and 0 <= requested < float("inf"):
                sleep_time = requested

        # optional jitter
        sleep_time += random.uniform(0, 1.0)
        print(f"429 received. Sleeping {sleep_time:.1f}s before retry {attempt}/{max_retries}...")
        time.sleep(sleep_time)


# Redefine _get_soup below so that every page request goes through get_with_backoff.

# Same endpoint and request headers as declared in the earlier scraping cell.
BASE_URL = "https://www.railyatri.in/stations"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/120.0 Safari/537.36",
}

def _get_soup(letter: str, page: int):
    """Fetch one listing page via ``get_with_backoff`` and return it parsed.

    Requests ``BASE_URL`` with ``name=letter`` and ``page=page`` using
    ``HEADERS``. Errors raised by ``get_with_backoff`` propagate to the caller.
    """
    response = get_with_backoff(
        BASE_URL,
        params={"name": letter, "page": page},
        headers=HEADERS,
    )
    from bs4 import BeautifulSoup

    return BeautifulSoup(response.text, "html.parser")

In [None]:
from tqdm.autonotebook import tqdm
import time
import random

# Scrape the station listing for every initial letter, keeping only the
# letters that returned at least one row.
dfs = []
for letter in tqdm(
    letters,
    desc="Scraping station lists",
    unit="letter",
    dynamic_ncols=True,
    colour="cyan",
    bar_format="{l_bar}{bar} {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
):
    df_all = fetch_all_stations_for_letter(letter)
    if len(df_all) > 0:
        dfs.append(df_all)
    # Polite pause of 1.0-1.5 s between letters.
    time.sleep(random.uniform(1.0, 1.5))

In [None]:
# Stack the per-letter tables into one directory. Each per-letter frame is
# indexed from 0, so renumber the rows to get a unique index.
railway_df = pd.concat(dfs, ignore_index=True)

In [None]:
# Left-join the scraped directory onto the OSM stations by station code, so
# every OSM station is kept even when the directory has no matching code.
df_railway_final = df.merge(
    railway_df,
    how="left",
    left_on="station_code",
    right_on="Station Code",
)
df_railway_final.shape

In [None]:
# Save the merged station table as CSV; the row index is written as the first column.
df_railway_final.to_csv(path_or_buf='railway_stations.csv')

In [None]:
# Scraped counts are text, and stations without a directory match are NaN after
# the left join. Coerce anything non-numeric (e.g. a blank cell) to NaN, then
# count missing values as 0 trains.
df_railway_final['Trains passing through'] = (
    pd.to_numeric(df_railway_final['Trains passing through'], errors="coerce")
    .fillna(0)
    .astype(int)
)

In [None]:
# Show the stations ordered from most to fewest trains passing through
# (returns a sorted copy; df_railway_final itself is not reordered).
df_railway_final.sort_values(by='Trains passing through', ascending=False)

In [None]:
import geopandas as gpd
from shapely.geometry import Point

# Build point geometries from the merged table's own lon/lat columns so that
# geometry and attributes are guaranteed to describe the same rows. Taking the
# coordinates from `df` instead only lines up if the merge kept the row count
# and order.
gdf = gpd.GeoDataFrame(
    data=df_railway_final,
    geometry=gpd.points_from_xy(
        df_railway_final["lon"], df_railway_final["lat"]
    ),
    crs="EPSG:4326",  # longitude/latitude in WGS 84
)
# The coordinates now live in the geometry column.
gdf = gdf.drop(columns=["lon", "lat"])

gdf.head()

In [None]:
# Number of stations per value of the OSM `network` tag (missing values are not counted).
gdf['network'].value_counts()

In [None]:
# Write the stations as GeoJSON. The driver is named explicitly so the output
# format does not depend on file-extension inference.
gdf.to_file("railway_stations.geojson", driver="GeoJSON")