In [1]:
import glob
import os
import re
import shlex

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


In [2]:

# Gather, from every saved HTML page matching the pattern, the grandparent element of
# each "xuiDisplayText_Sm" div whose stripped text is exactly "MAERT".
catalog = []
html_pattern = os.path.expanduser("~/data/texas_air/tceq/tceq_all*_*.html")
# glob returns paths in arbitrary order; sorting makes the catalog order reproducible.
for fname in tqdm(sorted(glob.glob(html_pattern))):
    with open(fname) as fp:
        # Name the parser explicitly: without it BeautifulSoup picks whichever parser
        # happens to be installed, so the tree can differ between machines.
        # NOTE(review): "html.parser" is the stdlib parser; if the original results were
        # produced with lxml installed, confirm the extracted rows are unchanged.
        soup = BeautifulSoup(fp, "html.parser")
    catalog.extend(
        d.parent.parent
        for d in soup.find_all("div", class_="xuiDisplayText_Sm")
        if d.text.strip() == "MAERT"
    )


100%|██████████| 74/74 [00:18<00:00,  3.93it/s]


In [3]:
def get_rn_number(link, timeout=30):
    """Download a page and return the first RN identifier found in its table cells.

    Args:
        link: URL of the page to fetch.
        timeout: Seconds to wait on the server before ``requests`` gives up.
            Without a timeout a stalled connection would block indefinitely.

    Returns:
        The first substring of the form ``RN`` followed by nine digits found in
        the serialized ``<td>`` elements of the page, or ``None`` if there is none.

    Raises:
        requests.exceptions.RequestException: if the request fails or times out.
    """
    response = requests.get(link, timeout=timeout)
    # The HTTP status is not checked: whatever body comes back is parsed, and a page
    # without a matching cell simply yields None.
    content = BeautifulSoup(response.content, "html.parser")

    # Search the cells' full markup (tags and attributes included), joined into one string.
    cell_markup = ' '.join(str(td) for td in content.find_all("td"))
    rn_number_match = re.search(r'RN\d{9}', cell_markup)
    return rn_number_match.group() if rn_number_match else None

In [4]:
# Build two parallel lists from the catalog rows that expose a downloadable file:
#   links - the file URL of each kept row
#   rows  - the RN number followed by the row's text cells (first and last cell dropped)
# and assemble the rows into a de-duplicated DataFrame.
links = []
rows = []
action_links = []  # never populated in this cell; kept so any existing reference still resolves
rn_by_url = {}  # detail-page URL -> RN number, so each distinct page is fetched only once
for row in tqdm(catalog):
    # href=True skips anchors that have no href attribute instead of raising KeyError.
    file_links = [
        "https://records.tceq.texas.gov" + a["href"]
        for a in row.find_all("a", href=True)
        if "TCEQ_EXTERNAL_SEARCH_GET_FILE" in a["href"]
    ]
    if not file_links:
        continue

    cells = row.find_all("div", class_="xuiDisplayText_Sm")
    # The link in the last cell is the page that get_rn_number scans for the RN number.
    detail_anchor = cells[-1].find("a", href=True)
    if detail_anchor is None:
        rn = None  # no page to look up; keep the row with a missing RN
    else:
        detail_url = "https://records.tceq.texas.gov" + detail_anchor["href"]
        if detail_url not in rn_by_url:
            rn_by_url[detail_url] = get_rn_number(detail_url)
        rn = rn_by_url[detail_url]

    # Append to both lists only after the fetch succeeded so they stay index-aligned.
    links.append(file_links[0])
    rows.append([rn] + [cell.text.strip() for cell in cells][1:-1])

# Explicit list: splitting a multi-line string kept trailing spaces in several names
# (e.g. "Primary ID    "), which then leaked into the DataFrame and the CSV header.
column_names = [
    "RN Number",
    "Content ID",
    "Record Series",
    "Primary ID",
    "Secondary ID",
    "Document Type",
    "Title",
    "Begin Date",
    "End Date",
    "Litigation Hold",
    "Regulated Entity Name",
    "Media",
    "Description",
    "Security Group",
]
df = pd.DataFrame(rows, columns=column_names).drop_duplicates()
    

100%|██████████| 2345/2345 [24:14<00:00,  1.61it/s]  


In [8]:

# Save the metadata table as CSV without writing the index column.
metadata_csv_path = os.path.expanduser("~/data/texas_air/tceq/tceq_all_metadata.csv")
df.to_csv(metadata_csv_path, index=False)


In [5]:
# Total file links, number of distinct links, and a sample link
# (None when no links were collected, instead of raising IndexError).
len(links), len(set(links)), (links[0] if links else None)

(2292,
 1754,
 'https://records.tceq.texas.gov/cs/idcplg?IdcService=TCEQ_EXTERNAL_SEARCH_GET_FILE&dID=5296975&Rendition=Web')

In [6]:
# Write a shell script that downloads every distinct document link once,
# sleeping half a second between requests.
with open("fetch_pdfs.sh", "w") as fp:
    # dict.fromkeys drops repeated links while keeping first-seen order, so the same
    # file is not downloaded again for every duplicate entry in `links`.
    for i, link in enumerate(dict.fromkeys(links)):
        id_match = re.search(r"dID=(\d+)", link)
        # Fall back to a positional name when the link carries no dID parameter.
        doc_id = id_match.group(1) if id_match else f"UNK_{i}"
        fname = os.path.expanduser(f"~/data/texas_air/tceq/pdf/{doc_id}.pdf")
        # The links come from scraped HTML; shlex.quote keeps characters such as
        # quotes, `$` or `;` in a URL or path from being interpreted by the shell.
        fp.write(f"wget {shlex.quote(link)} -O {shlex.quote(fname)}\nsleep 0.5\n")

In [7]:
# Dimensions of the metadata table after dropping exact duplicate rows.
unique_rows = df.drop_duplicates()
unique_rows.shape

(1754, 14)