In [1]:
%load_ext jupyter_black

In [19]:
import time
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup


def f(df: pd.DataFrame, base_url: str) -> pd.DataFrame:
    """Return a copy of ``df`` indexed by the full URL of each entry.

    The new index is ``base_url`` concatenated with the ``"path"`` column, so
    it inherits the name ``"path"``. The column itself is kept.

    Args:
        df: Frame with a string column named ``"path"``.
        base_url: Prefix prepended verbatim to every path (no separator is
            inserted).

    Returns:
        A new frame; ``df`` itself is not modified, so re-running a cell that
        pipes the same frame through this function gives the same result.
    """
    # Passing a Series (not a column label) keeps the "path" column in place
    # and only replaces the index.
    return df.set_index(base_url + df["path"])


class TSragr:
    """Navigator over one directory of an HTTP "index of" style listing.

    Constructing an instance with a URL downloads that page and records the
    text of every ``<a>`` element (minus a leading "Parent Directory" link) as
    the entries of the directory.
    """

    def __init__(self, data: str = None) -> None:
        """Fetch and parse the listing found at ``data``.

        Args:
            data: URL of the directory to list. Any non-string value,
                including the default ``None``, skips the request and leaves
                the instance with an empty listing.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        # Always define both attributes so __repr__ and the navigation helpers
        # work even when no URL was supplied.
        self._baseurl = ""  # URL exactly as passed by the caller
        self._soup = pd.DataFrame(columns=["path"])  # entries of the listing

        if not isinstance(data, str):
            return

        self._baseurl = data

        r = requests.get(data)
        r.raise_for_status()

        anchors = BeautifulSoup(r.content, "lxml").find_all("a")

        # Guard against a page without any link before peeking at the first one.
        if anchors and anchors[0].text == "Parent Directory":
            anchors = anchors[1:]

        self._soup = pd.DataFrame([x.text for x in anchors], columns=["path"]).pipe(f, base_url=self.url)

    def __repr__(self) -> str:
        """Base URL on the first line, followed by the repr of the listing."""
        return f"{self._baseurl}\n" + self._soup.__repr__()

    def __getitem__(self, args) -> "TSragr":
        """Return a new navigator whose listing is ``self._soup[args]``.

        The receiver is left untouched, so the same selection can be repeated
        (or a cell re-run) without progressively narrowing the listing.
        """
        selected = TSragr()  # no URL -> no network request
        selected._baseurl = self._baseurl
        selected._soup = self._soup[args]
        return selected

    @staticmethod
    def _parent_url(url: str) -> str:
        """Return ``url`` with its last path segment removed.

        A URL with nothing after the host is returned as-is (normalised to a
        single trailing slash) because there is no parent to move to.
        """
        trimmed = url.rstrip("/")
        # The separator kept at the end of the parent must follow a real
        # character, which rules out the "//" of the scheme.
        match = re.match(r"^(.*[^/:]/)[^/]+$", trimmed)
        return match.group(1) if match else trimmed + "/"

    def _entries(self) -> pd.Series:
        """Entry names as a Series, whether the listing is a frame or a column."""
        if isinstance(self._soup, pd.DataFrame):
            return self._soup["path"]
        return self._soup

    @property
    def url(self):
        """Base URL with exactly one trailing slash."""
        return self._baseurl.rstrip("/") + "/"

    def navto(self, *args: str) -> "TSragr":
        """Open the sub-directory reached by joining ``args`` with ``/``."""
        return TSragr(self.url + "/".join(args))

    def navup(self) -> "TSragr":
        """Open the parent directory of the current one."""
        return TSragr(self._parent_url(self._baseurl))

    def inav(self, index: int) -> "TSragr":
        """Open the entry at position ``index`` of the current listing."""
        # .iloc makes the lookup positional; plain [] on a frame is a column
        # lookup and on a URL-indexed Series a label lookup.
        return TSragr(self.url + self._entries().iloc[index])

    def download(self, save_to="./", wait: float = 10):
        """Download every entry of the current listing, one after another.

        Args:
            save_to: Prefix prepended verbatim to each entry name to build the
                local file name, so it must end with a path separator.
            wait: Pause between two consecutive downloads, in minutes (the
                value is multiplied by 60 before sleeping).

        Raises:
            requests.HTTPError: if a download answers with an error status.
        """
        filenames = list(self._entries())

        for position, filename in enumerate(filenames, start=1):
            url = self.url + filename
            print("DOWNLAODING FILE")
            local_filename = save_to + filename
            with requests.get(url, stream=True) as r:
                r.raise_for_status()
                with open(local_filename, "wb") as out_file:
                    for chunk in r.iter_content(chunk_size=8192):
                        out_file.write(chunk)

            print("FILE SAVED")
            # Throttle only between downloads; nothing follows the last one.
            if position < len(filenames):
                time.sleep(60 * wait)


# Build a navigator for the NCCF directory; constructing it with a URL string
# performs the HTTP request for that page.
ragr = TSragr("https://nomads.ncep.noaa.gov/pub/data/nccf/")

# Bare last expression: the cell output is the instance's repr.
ragr

https://nomads.ncep.noaa.gov/pub/data/nccf/
                                                               path
path                                                               
https://nomads.ncep.noaa.gov/pub/data/nccf/charts/          charts/
https://nomads.ncep.noaa.gov/pub/data/nccf/com/                com/
https://nomads.ncep.noaa.gov/pub/data/nccf/dcom/              dcom/
https://nomads.ncep.noaa.gov/pub/data/nccf/nono...  nonoperational/
https://nomads.ncep.noaa.gov/pub/data/nccf/pcom/              pcom/
https://nomads.ncep.noaa.gov/pub/data/nccf/radar/            radar/

In [56]:
# Root directory whose listing this cell explores.
url = "https://nomads.ncep.noaa.gov/pub/data/"
# Fetch the page; raise_for_status() aborts the cell on an HTTP error status.
r = requests.get(url)
r.raise_for_status()

# Collect every <a> element of the returned page.
soup = BeautifulSoup(r.content, "lxml").find_all("a")
# NOTE(review): numpy is not referenced anywhere in the visible notebook — confirm it is needed.
import numpy as np


def read_directory(df: pd.DataFrame) -> pd.DataFrame:
    """List the sub-directories and files of every directory URL in ``df``.

    Args:
        df: Frame with a ``"path"`` column holding directory URLs.

    Returns:
        One row per input URL with two list-valued columns: ``"paths"``
        (entries whose link text ends with ``/``) and ``"files"`` (all other
        entries). Rows are labelled by a two-level index
        ``(parent URL, directory name)`` derived from each URL, e.g.
        ``("https://nomads.ncep.noaa.gov/pub/", "data/")``. An input without
        rows yields an empty frame with the same two columns.

    Raises:
        requests.HTTPError: if a listing request answers with an error status.
    """

    def split_url(url: str) -> tuple[str, str]:
        # Ignore the trailing slash so the last segment is the directory name.
        parent, _, leaf = url.rstrip("/").rpartition("/")
        return parent + "/", leaf + "/"

    def list_directory(url: str) -> dict:
        r = requests.get(url)
        r.raise_for_status()
        paths = []
        files = []
        for anchor in BeautifulSoup(r.content, "lxml").find_all("a"):
            text: str = anchor.text
            if text == "Parent Directory":
                continue
            # A trailing slash in the link text marks a sub-directory.
            (paths if text.endswith("/") else files).append(text)
        return {"paths": paths, "files": files}

    urls = list(df["path"])
    if not urls:
        # MultiIndex.from_tuples cannot infer its levels from an empty list.
        return pd.DataFrame(columns=["paths", "files"])

    return pd.DataFrame(
        [list_directory(url) for url in urls],
        index=pd.MultiIndex.from_tuples([split_url(url) for url in urls]),
    )


# def read():

# Wrap the root URL in a one-row frame with a "path" column and list that directory.
pd.DataFrame([url], columns=["path"]).pipe(read_directory)

Unnamed: 0,Unnamed: 1,paths,files
https://nomads.ncep.noaa.gov/pub,data/,[nccf/],[DSRC]


In [3]:

import time
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Root directory URL of the listing explored in this cell.
path = "https://nomads.ncep.noaa.gov/pub/data/"


def soup_handle(soup: list[str], max_depth: int = None):
    """Yield one item per URL: files as-is, directories as nested listings.

    Args:
        soup: Absolute URLs of the entries of one directory. A URL ending
            with ``/`` is treated as a directory.
        max_depth: How many more directory levels may be opened. ``None``
            (the default) means no limit. With ``0`` a directory URL is
            yielded as a plain string instead of being fetched.

    Yields:
        The URL string for a file (or for a directory that is not expanded),
        otherwise the tuple produced by ``get_path`` for that directory.
    """
    for url in soup:
        if not url.endswith("/"):
            yield url
        elif max_depth is not None and max_depth <= 0:
            # Depth budget used up: report the directory without opening it.
            yield url
        else:
            remaining = None if max_depth is None else max_depth - 1
            yield tuple(get_path([url], remaining))


def get_path(paths, max_depth: int = None):
    """Fetch each directory listing in ``paths`` and yield its nested content.

    Each URL is printed before it is requested.

    Args:
        paths: Iterable of directory URLs, each ending with ``/`` (entry names
            are appended to the URL verbatim).
        max_depth: Number of sub-directory levels to descend into below each
            URL in ``paths``. ``None`` (the default) walks every reachable
            sub-directory, which can mean a very large number of requests.

    Yields:
        For every URL, a tuple with one item per entry as produced by
        ``soup_handle``.

    Raises:
        requests.HTTPError: if a listing request answers with an error status.
    """
    for path in paths:
        print(path)
        r = requests.get(path)
        r.raise_for_status()
        anchors = BeautifulSoup(r.content, "lxml").find_all("a")
        children = [path + x.text for x in anchors if x.text != "Parent Directory"]
        yield tuple(soup_handle(children, max_depth))


# Materialise the whole nested listing under the root directory.
# NOTE: every sub-directory is requested before this returns; the run recorded
# below was stopped with a KeyboardInterrupt before it finished.
e = tuple(get_path(["https://nomads.ncep.noaa.gov/pub/data/"]))
# Bare last expression: display the nested tuple as the cell output.
e

https://nomads.ncep.noaa.gov/pub/data/
https://nomads.ncep.noaa.gov/pub/data/nccf/
https://nomads.ncep.noaa.gov/pub/data/nccf/charts/
https://nomads.ncep.noaa.gov/pub/data/nccf/com/
https://nomads.ncep.noaa.gov/pub/data/nccf/com/557ww/
https://nomads.ncep.noaa.gov/pub/data/nccf/com/557ww/prod/
https://nomads.ncep.noaa.gov/pub/data/nccf/com/557ww/prod/557ww.20220604/
https://nomads.ncep.noaa.gov/pub/data/nccf/com/557ww/prod/557ww.20220605/
https://nomads.ncep.noaa.gov/pub/data/nccf/com/amsu_estimation/
https://nomads.ncep.noaa.gov/pub/data/nccf/com/amsu_estimation/v1.3/
https://nomads.ncep.noaa.gov/pub/data/nccf/com/amsu_estimation/v1.3/amsu_estimation.20220520/
https://nomads.ncep.noaa.gov/pub/data/nccf/com/amsu_estimation/v1.3/amsu_estimation.20220520/io932022/
https://nomads.ncep.noaa.gov/pub/data/nccf/com/amsu_estimation/v1.3/amsu_estimation.20220520/sh262022/
https://nomads.ncep.noaa.gov/pub/data/nccf/com/amsu_estimation/v1.3/amsu_estimation.20220520/wp912022/
https://nomads.ncep.n

KeyboardInterrupt: 