In [5]:
import requests
import re
import json
import time
from datetime import datetime
from bs4 import BeautifulSoup

In [10]:
session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "en-GB,en;q=0.6",
    "Origin": "https://enquiry.indianrail.gov.in",
    "Referer": "https://enquiry.indianrail.gov.in/mntes/",
    "Content-Type": "application/x-www-form-urlencoded",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
})

In [13]:
def init_session():
    """Visit site first to get JSESSIONID and TS cookies"""
    print("Initialising session...")
    session.get("https://enquiry.indianrail.gov.in/mntes/")
    print("Cookies received:", dict(session.cookies))

In [11]:
def get_csrf_token():
    timestamp = int(time.time() * 1000)  # milliseconds like JS Date.getTime()
    url = f"https://enquiry.indianrail.gov.in/mntes/GetCSRFToken?t={timestamp}"
    res = session.get(url)
    print("Raw CSRF response:", res.text)
    
    # Response is an <input> tag like:
    # <input type="hidden" name="-tudpzo3c2txi1771473038" value="-12ecys7yzjk4l29524550">
    match = re.search(r'name=["\']([^"\']+)["\'][^>]*value=["\']([^"\']+)["\']', res.text)
    if match:
        return match.group(1), match.group(2)
    raise Exception(f"Could not parse CSRF token from: {res.text}")

# name, value = get_csrf_token()
# print(f"Token name: {name}")
# print(f"Token value: {value}")


In [14]:
def get_train_schedule(train_no):
    csrf_name, csrf_value = get_csrf_token()
    today = datetime.now().strftime("%d-%b-%Y")
    
    payload = {
        "lan": "en",
        "trainNo": train_no,
        "trainStartDate": today,
        csrf_name: csrf_value
    }
    
    res = session.post(
        "https://enquiry.indianrail.gov.in/mntes/tr",
        params={"opt": "TrainServiceSchedule", "subOpt": "main", "trainNo": train_no},
        data=payload,
        timeout=15
    )
    
    print(f"Status: {res.status_code}, Length: {len(res.text)}")
    return res.text

# Run
init_session()
html = get_train_schedule("12301")
print(html[:1000])


Initialising session...
Cookies received: {'JSESSIONID': '"u2HoLGrHfcM-VpiI8EWkZIAlOr4fo5kO0mXONQlO.ntes_host2:host2_server1"', 'TS0161a678': '01ee28b4448d20dedf731448b54c437eaf1c9ee6300c3c249aba08b7f5663c93f06822d110c0bb978db06c4685858fdd5468edaeb4', 'SERVERID': 'cch7fw87sfs2', 'TS012f81d3': '01ee28b4448d20dedf731448b54c437eaf1c9ee6300c3c249aba08b7f5663c93f06822d110c0bb978db06c4685858fdd5468edaeb4'}
Raw CSRF response: <input type='hidden' name='-1kroxdda4ndi21771474982' value='1ecie66mcji1929524583'>
Status: 200, Length: 0



In [None]:
html = get_train_schedule("12301")


Raw CSRF response: <input type='hidden' name='-fmaj32brmrwe1771474488' value='17320jeepyzpv29524574'>


Stops found: []


In [15]:
def get_train_schedule(train_no):
    csrf_name, csrf_value = get_csrf_token()
    today = datetime.now().strftime("%d-%b-%Y")
    
    payload = {
        "lan": "en",
        "trainNo": train_no,
        "trainStartDate": today,
        csrf_name: csrf_value
    }
    
    res = session.post(
        "https://enquiry.indianrail.gov.in/mntes/tr",
        params={"opt": "TrainServiceSchedule", "subOpt": "main", "trainNo": train_no},
        data=payload,
        timeout=15,
        allow_redirects=True
    )
    
    print(f"Status: {res.status_code}")
    print(f"Final URL: {res.url}")
    print(f"Length: {len(res.text)}")
    
    # Check redirect history
    print("Redirect chain:")
    for r in res.history:
        print(f"  {r.status_code} -> {r.url}")
    
    # Print response headers
    print("Response headers:")
    for k, v in res.headers.items():
        print(f"  {k}: {v}")
    
    return res.text

init_session()
html = get_train_schedule("12301")

Initialising session...
Cookies received: {'JSESSIONID': '"u2HoLGrHfcM-VpiI8EWkZIAlOr4fo5kO0mXONQlO.ntes_host2:host2_server1"', 'TS0161a678': '01ee28b4448d20dedf731448b54c437eaf1c9ee6300c3c249aba08b7f5663c93f06822d110c0bb978db06c4685858fdd5468edaeb4', 'SERVERID': 'cch7fw87sfs2', 'TS012f81d3': '01ee28b4448d20dedf731448b54c437eaf1c9ee6300c3c249aba08b7f5663c93f06822d110c0bb978db06c4685858fdd5468edaeb4'}
Raw CSRF response: <input type='hidden' name='-19ndqg4wt0svk1771475054' value='m3orwt9u1x9q29524584'>
Status: 200
Final URL: https://enquiry.indianrail.gov.in/mntes/tr?opt=TrainServiceSchedule&subOpt=main&trainNo=12301
Length: 0
Redirect chain:
Response headers:
  Cross-Origin-Opener-Policy: same-origin
  Referrer-Policy: strict-origin-when-cross-origin
  Content-Security-Policy: default-src 'self' https://www.google.com; script-src 'self' 'unsafe-inline' https://ajax.googleapis.com https://www.google-analytics.com https://cdnjs.cloudflare.com https://*.google.com https://www.gstatic.com h

In [16]:
import asyncio
import json
import re
import time
from datetime import datetime
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup


In [17]:
# Load data
with open("../data/trains.json") as f:
    trains = json.load(f)

with open("../data/stations.json") as f:
    stations = json.load(f)

valid_codes = {s["code"] for s in stations}

In [18]:
async def get_train_schedule(page, train_no):
    today = datetime.now().strftime("%d-%b-%Y")
    
    # Get CSRF token
    csrf_res = await page.evaluate("""
        async () => {
            const res = await fetch('/mntes/GetCSRFToken?t=' + new Date().getTime());
            return await res.text();
        }
    """)
    
    match = re.search(r'name=["\']([^"\']+)["\'][^>]*value=["\']([^"\']+)["\']', csrf_res)
    if not match:
        raise Exception("CSRF token not found")
    
    csrf_name, csrf_value = match.group(1), match.group(2)
    
    # Submit form via fetch (same origin, so Imperva won't block)
    html = await page.evaluate(f"""
        async () => {{
            const formData = new URLSearchParams();
            formData.append('lan', 'en');
            formData.append('trainNo', '{train_no}');
            formData.append('trainStartDate', '{today}');
            formData.append('{csrf_name}', '{csrf_value}');
            
            const res = await fetch('/mntes/tr?opt=TrainServiceSchedule&subOpt=main&trainNo={train_no}', {{
                method: 'POST',
                headers: {{
                    'Content-Type': 'application/x-www-form-urlencoded',
                }},
                body: formData.toString()
            }});
            return await res.text();
        }}
    """)
    
    return html


In [20]:
async def build_connectivity():
    # edges = set()
    # failed = []
    # processed_trains = set()
    
    async with async_playwright() as p:
        # Launch browser (set headless=False to watch it work)
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36"
        )
        page = await context.new_page()
        
        # Visit site once to get cookies
        print("Initialising session...")
        await page.goto("https://enquiry.indianrail.gov.in/mntes/")
        await page.wait_for_load_state("networkidle")
        print("Session ready!")
        
        # Test with one train first
        print("\nTesting with train 12301...")
        html = await get_train_schedule(page, "12301")
        # stops = parse_schedule(html, "12301")
        # print(f"Test result - Stops found: {stops}")
        print(f"Raw HTML sample: {html[:500]}")
        
        # if not stops:
        #     print("Parser needs fixing - check HTML structure above")
        #     await browser.close()
        #     return
        
        print("\nTest successful! Starting full extraction...")
        
        await browser.close()

        return html

html = await build_connectivity()

Initialising session...
Session ready!

Testing with train 12301...
Raw HTML sample: 

Test successful! Starting full extraction...


In [27]:
def parse_schedule(html):
    soup = BeautifulSoup(html, "html.parser")
    stops = []
    
    for table in soup.find_all("table"):
        first_row = table.find("tr")
        if not first_row or "Station" not in first_row.text:
            continue
        
        for row in table.find_all("tr")[1:]:  # skip header
            tds = row.find_all("td")
            if len(tds) < 6:
                continue
            
            # Sr. column
            sr = tds[0].text.strip()
            
            # Station column - two font tags (name + code)
            fonts = tds[1].find_all("font")
            station_name = fonts[0].text.strip() if len(fonts) > 0 else ""
            station_code = fonts[1].text.strip() if len(fonts) > 1 else ""
            
            # Day column
            day = tds[2].text.strip()
            
            # Arr/Dep column - two font tags
            time_fonts = tds[3].find_all("font")
            arrival   = time_fonts[0].text.strip() if len(time_fonts) > 0 else ""
            departure = time_fonts[1].text.strip() if len(time_fonts) > 1 else ""
            
            # Halt column
            halt = tds[4].text.strip()
            
            # Distance column
            distance = tds[5].text.strip()
            
            stops.append({
                "sr":           sr,
                "station_name": station_name,
                "station_code": station_code,
                "day":          day,
                "arrival":      arrival,
                "departure":    departure,
                "halt":         halt,
                "distance_km":  distance
            })
    
    return stops

# Test on saved HTML
with open("../result.html", "r", encoding="utf-8") as f:
    html = f.read()

stops = parse_schedule(html)
for stop in stops:
    print(stop)

{'sr': '1', 'station_name': 'HOWRAH JN', 'station_code': 'HWH', 'day': '1', 'arrival': 'SRC', 'departure': '16:50', 'halt': '', 'distance_km': '0'}
{'sr': '2', 'station_name': 'ASANSOL JN.', 'station_code': 'ASN', 'day': '1', 'arrival': '18:47', 'departure': '18:49', 'halt': '2 Min', 'distance_km': '200'}
{'sr': '3', 'station_name': 'DHANBAD JN', 'station_code': 'DHN', 'day': '1', 'arrival': '19:55', 'departure': '20:00', 'halt': '5 Min', 'distance_km': '258'}
{'sr': '4', 'station_name': 'PARASNATH', 'station_code': 'PNME', 'day': '1', 'arrival': '20:30', 'departure': '20:32', 'halt': '2 Min', 'distance_km': '306'}
{'sr': '5', 'station_name': 'GAYA JN', 'station_code': 'GAYA', 'day': '1', 'arrival': '22:32', 'departure': '22:35', 'halt': '3 Min', 'distance_km': '457'}
{'sr': '6', 'station_name': 'PT.DEEN DAYAL UPADHYAYA JN.', 'station_code': 'DDU', 'day': '2', 'arrival': '00:43', 'departure': '00:50', 'halt': '7 Min', 'distance_km': '658'}
{'sr': '7', 'station_name': 'PRAYAGRAJ JN', 's

In [29]:
with open("../data/trains.json") as f:
    trains = json.load(f)


print(type(trains))      # should be <class 'list'>
print(type(trains[0]))   # should be <class 'dict'>
print(trains[0])     

<class 'list'>
<class 'dict'>
{'TrainNo': '00001', 'TrainName': 'TEST FOR CCLMS'}
