In [6]:
!pip install beautifulsoup4 --quiet

In [None]:
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import datetime

# Scrape BASE_URL for links ending in ".fits", download every file that is not
# already present in LOCAL_DIR or recorded in CSV_FILE, and append one log row
# (filename, url, date_downloaded) per successful download.

# --- Configurations ---
BASE_URL = "https://pradan.issdc.gov.in/al1/protected/browse.xhtml?id=suit"
LOCAL_DIR = "fits_images"
CSV_FILE = "fits_image_log.csv"
USER_AGENT = "Mozilla/5.0"
REQUEST_TIMEOUT = 60  # seconds; without a timeout requests can block forever

# --- Ensure directory exists ---
os.makedirs(LOCAL_DIR, exist_ok=True)

# --- Initialize existing image set from directory ---
existing_files = set(os.listdir(LOCAL_DIR))

# --- Load existing CSV if exists ---
if os.path.exists(CSV_FILE):
    existing_df = pd.read_csv(CSV_FILE)
    downloaded_files = set(existing_df['filename'])
else:
    existing_df = pd.DataFrame(columns=['filename', 'url', 'date_downloaded'])
    downloaded_files = set()

# --- Request the page ---
headers = {"User-Agent": USER_AGENT}
response = requests.get(BASE_URL, headers=headers, timeout=REQUEST_TIMEOUT)
# Stop here on 4xx/5xx instead of parsing an error page and then reporting
# "everything is up to date".
# NOTE(review): the URL path contains "protected" and no credentials are sent,
# so the server may answer with a login page — confirm.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# --- Extract .fits links ---
new_entries = []
for a_tag in soup.find_all("a", href=True):
    href = a_tag["href"]
    if not href.endswith(".fits"):
        continue

    fits_url = urljoin(BASE_URL, href)
    filename = href.split("/")[-1]

    # Skip if already downloaded
    if filename in existing_files or filename in downloaded_files:
        continue

    print(f"Downloading: {filename}")
    file_path = os.path.join(LOCAL_DIR, filename)
    # Stream into a temporary name first: a failed transfer must not leave a
    # truncated file that the directory check above would treat as complete.
    partial_path = file_path + ".part"
    try:
        with requests.get(fits_url, headers=headers, stream=True,
                          timeout=REQUEST_TIMEOUT) as file_data:
            # Do not save an HTML error/login page under a .fits name.
            file_data.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in file_data.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial_path, file_path)

        # Add entry to CSV data
        new_entries.append({
            "filename": filename,
            "url": fits_url,
            "date_downloaded": datetime.now().isoformat()
        })
        # A file linked twice on the same page is fetched only once.
        downloaded_files.add(filename)

    except Exception as e:
        # Best effort: report the failure and continue with the next link.
        print(f"Failed to download {filename}: {e}")
        if os.path.exists(partial_path):
            os.remove(partial_path)

# --- Append to or create CSV ---
if new_entries:
    new_df = pd.DataFrame(new_entries)
    final_df = pd.concat([existing_df, new_df], ignore_index=True)
    final_df.to_csv(CSV_FILE, index=False)
    print(f"\n‚úÖ CSV updated with {len(new_entries)} new entries.")
else:
    print("\nüìÅ No new files to download. Everything is up to date.")



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.

üìÅ No new files to download. Everything is up to date.


In [1]:
!pip install selenium --quiet

In [3]:
%pip install selenium --quiet

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time, os, requests, pandas as pd
from datetime import datetime
from urllib.parse import urljoin

# Log in with a headless Chrome session, collect every link whose href contains
# ".fits" from BASE_URL, download the files that are not yet present locally or
# in the CSV log, and append one log row per successful download.

# Setup
BASE_URL = "https://pradan.issdc.gov.in/al1/protected/browse.xhtml?id=suit"
LOGIN_URL = "https://pradan.issdc.gov.in/login.xhtml"
LOCAL_DIR = "fits_images"
CSV_FILE = "fits_image_log.csv"
REQUEST_TIMEOUT = 60  # seconds; without a timeout requests can block forever
os.makedirs(LOCAL_DIR, exist_ok=True)

# Credentials come from the environment, never from the notebook source.
# SECURITY: an earlier revision of this cell embedded a real username and
# password in plain text; that password should be rotated.
PRADAN_USERNAME = os.environ.get("PRADAN_USERNAME")
PRADAN_PASSWORD = os.environ.get("PRADAN_PASSWORD")
if not PRADAN_USERNAME or not PRADAN_PASSWORD:
    raise RuntimeError(
        "Set the PRADAN_USERNAME and PRADAN_PASSWORD environment variables "
        "before running this cell."
    )

# Load existing files and CSV
existing = set(os.listdir(LOCAL_DIR))
df = pd.read_csv(CSV_FILE) if os.path.exists(CSV_FILE) else pd.DataFrame(columns=["filename", "url", "date_downloaded"])
downloaded = set(df.filename)

# Start headless browser
options = Options()
# The `options.headless` attribute was deprecated and later removed from
# Selenium; the Chrome command-line flag works across versions.
options.add_argument("--headless=new")
driver = webdriver.Chrome(options=options)

try:
    # Poll for elements for up to 10 s instead of relying on a fixed sleep.
    driver.implicitly_wait(10)

    # 1. Login workflow (customize this to match the actual login form)
    # NOTE(review): the element ids below are placeholders; the recorded run of
    # this cell raised NoSuchElementException for id "username", so they must
    # be checked against the real login page.
    driver.get(LOGIN_URL)
    driver.find_element("id", "username").send_keys(PRADAN_USERNAME)
    driver.find_element("id", "password").send_keys(PRADAN_PASSWORD)
    driver.find_element("id", "loginButton").click()
    time.sleep(5)  # wait for login to complete

    # 2. Browse target
    driver.get(BASE_URL)
    time.sleep(5)  # wait for JS to load

    # 3. Extract .fits links
    links = driver.find_elements("xpath", "//a[contains(@href, '.fits')]")
    # Read everything needed from the browser before it is closed.
    fits_urls = [a.get_attribute("href") for a in links]
    browser_cookies = driver.get_cookies()
finally:
    # Always release the browser, including when login or lookup fails.
    driver.quit()

# Reuse the authenticated browser session for the HTTP downloads; a bare
# requests.get() would not carry the login cookies.
session = requests.Session()
for cookie in browser_cookies:
    session.cookies.set(
        cookie["name"],
        cookie["value"],
        domain=cookie.get("domain", ""),
        path=cookie.get("path", "/"),
    )

new_entries = []
for href in fits_urls:
    filename = href.split("/")[-1]
    if filename in existing or filename in downloaded:
        continue
    print("Downloading:", filename)
    path = os.path.join(LOCAL_DIR, filename)
    # Stream into a temporary name first: a failed transfer must not leave a
    # truncated file that later runs would treat as already downloaded.
    partial_path = path + ".part"
    try:
        with session.get(href, stream=True, timeout=REQUEST_TIMEOUT) as resp:
            # Do not save an HTML error/login page under a .fits name.
            resp.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in resp.iter_content(8192):
                    f.write(chunk)
        os.replace(partial_path, path)
    except Exception as e:
        # Keep going so files that did download are still logged below.
        print("Failed to download", filename, ":", e)
        if os.path.exists(partial_path):
            os.remove(partial_path)
        continue
    new_entries.append({"filename": filename, "url": href, "date_downloaded": datetime.now().isoformat()})
    # A file linked twice on the same page is fetched only once.
    downloaded.add(filename)

# 4. Update CSV
if new_entries:
    df = pd.concat([df, pd.DataFrame(new_entries)], ignore_index=True)
    df.to_csv(CSV_FILE, index=False)
    print("‚úÖ Downloaded", len(new_entries), "new files.")
else:
    print("üìÅ No new files found.")



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.


NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":"[id="username"]"}
  (Session info: chrome=137.0.7151.119); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x0x7ff7d8ddcda5+78885]
	GetHandleVerifier [0x0x7ff7d8ddce00+78976]
	(No symbol) [0x0x7ff7d8b99bca]
	(No symbol) [0x0x7ff7d8bf0766]
	(No symbol) [0x0x7ff7d8bf0a1c]
	(No symbol) [0x0x7ff7d8c44467]
	(No symbol) [0x0x7ff7d8c18bcf]
	(No symbol) [0x0x7ff7d8c4122f]
	(No symbol) [0x0x7ff7d8c18963]
	(No symbol) [0x0x7ff7d8be16b1]
	(No symbol) [0x0x7ff7d8be2443]
	GetHandleVerifier [0x0x7ff7d90b4eed+3061101]
	GetHandleVerifier [0x0x7ff7d90af33d+3037629]
	GetHandleVerifier [0x0x7ff7d90ce592+3165202]
	GetHandleVerifier [0x0x7ff7d8df730e+186766]
	GetHandleVerifier [0x0x7ff7d8dfeb3f+217535]
	GetHandleVerifier [0x0x7ff7d8de59b4+114740]
	GetHandleVerifier [0x0x7ff7d8de5b69+115177]
	GetHandleVerifier [0x0x7ff7d8dcc368+10728]
	BaseThreadInitThunk [0x0x7ffa8809e8d7+23]
	RtlUserThreadStart [0x0x7ffa8a0bc34c+44]


In [4]:
import os
import re
import requests
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Reuse the session cookies and URL prefix found in a download shell script
# (SH_FILE) to fetch a listing page, download every linked ".fits" file that is
# not yet present in DOWNLOAD_DIR or recorded in CSV_LOG, and append one log
# row per successful download.

# --- Configurations ---
SH_FILE = "suit_2025Jun23T134658908.sh"  # Replace with your actual .sh file path
DOWNLOAD_DIR = "fits_images"
CSV_LOG = "fits_image_log.csv"
REQUEST_TIMEOUT = 60  # seconds; without a timeout requests can block forever

# --- Step 1: Read & parse the .sh file ---
with open(SH_FILE, "r") as f:
    content = f.read()

# Extract cookies: the script is expected to contain cookies="k1=v1; k2=v2; ..."
cookie_match = re.search(r'cookies\s*=\s*"(.*?)"', content, re.DOTALL)
cookie_raw = cookie_match.group(1).strip().replace("\n", "") if cookie_match else ""
cookies = dict(item.strip().split("=", 1) for item in cookie_raw.strip(";").split(";") if "=" in item)

# Extract base URL; fail with a clear message when the script has no urlPrefix
# instead of an AttributeError on None.
url_prefix_match = re.search(r'urlPrefix\s*=\s*"(.*?)"', content)
if url_prefix_match is None:
    raise ValueError(f"No urlPrefix assignment found in {SH_FILE}")
url_prefix = url_prefix_match.group(1)

# Extract page URL if present
# NOTE(review): this takes the first quoted path that follows $urlPrefix in the
# script; the recorded run resolved to a /ch2/ payload page rather than the
# SUIT browse page, so confirm the match is the intended listing.
page_match = re.search(r'\$urlPrefix"([^"]+)"', content)
page_path = page_match.group(1) if page_match else "/al1/protected/browse.xhtml?id=suit"

full_url = urljoin(url_prefix, page_path)

# --- Step 2: Setup ---
os.makedirs(DOWNLOAD_DIR, exist_ok=True)
existing_files = set(os.listdir(DOWNLOAD_DIR))

if os.path.exists(CSV_LOG):
    df_log = pd.read_csv(CSV_LOG)
    downloaded = set(df_log['filename'])
else:
    df_log = pd.DataFrame(columns=["filename", "url", "date_downloaded"])
    downloaded = set()

# --- Step 3: Fetch the HTML page ---
print(f"üîó Fetching page: {full_url}")
response = requests.get(full_url, cookies=cookies, headers={"User-Agent": "Mozilla/5.0"},
                        timeout=REQUEST_TIMEOUT)

if response.status_code != 200:
    # Raise instead of calling exit(): inside a notebook exit() can shut the
    # kernel down rather than just stopping this cell.
    raise RuntimeError(f"‚ùå Failed to fetch page (Status {response.status_code})")

soup = BeautifulSoup(response.content, "html.parser")

# --- Step 4: Find and download .fits files ---
new_entries = []

for a in soup.find_all("a", href=True):
    href = a['href']
    if not href.endswith(".fits"):
        continue

    # Relative links are relative to the page they appear on, not to the
    # site prefix; absolute paths resolve identically either way.
    fits_url = urljoin(full_url, href)
    filename = os.path.basename(href)

    if filename in existing_files or filename in downloaded:
        print(f"‚è© Skipping (exists): {filename}")
        continue

    print(f"‚¨áÔ∏è  Downloading: {filename}")
    path = os.path.join(DOWNLOAD_DIR, filename)
    # Stream into a temporary name first: a failed transfer must not leave a
    # truncated file that later runs would skip as "exists".
    partial_path = path + ".part"
    try:
        with requests.get(fits_url, stream=True, cookies=cookies,
                          timeout=REQUEST_TIMEOUT) as r:
            # Do not save an HTML error/login page under a .fits name.
            r.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in r.iter_content(1024):
                    f.write(chunk)
        os.replace(partial_path, path)

        new_entries.append({
            "filename": filename,
            "url": fits_url,
            "date_downloaded": datetime.now().isoformat()
        })
        # A file linked twice on the same page is fetched only once.
        downloaded.add(filename)
    except Exception as e:
        # Best effort: report the failure and continue with the next link.
        print(f"‚ö†Ô∏è Failed to download {filename}: {e}")
        if os.path.exists(partial_path):
            os.remove(partial_path)

# --- Step 5: Update CSV log ---
if new_entries:
    df_new = pd.DataFrame(new_entries)
    df_log = pd.concat([df_log, df_new], ignore_index=True)
    df_log.to_csv(CSV_LOG, index=False)
    print(f"\n‚úÖ CSV log updated with {len(new_entries)} new files.")
else:
    print("\nüìÅ No new FITS files found.")



üîó Fetching page: https://pradan.issdc.gov.in/ch2/protected/payload.xhtml

üìÅ No new FITS files found.
