In [2]:
import os
import pandas as pd
from dotenv import load_dotenv, dotenv_values
from zenrows import ZenRowsClient
import requests
from multiprocessing.pool import ThreadPool

In [3]:
# Folder that contains the gzipped court-cases CSV read below.
path = "~/Documents/secret_lives_pa/download_pdfs/output/"

# Read the table, forcing the listed columns to pandas' "string" dtype
# instead of letting read_csv infer a type for them.
court_cases_df = pd.read_csv(
    path + "court_cases_df.csv.gz",
    compression="gzip",
    dtype={
        column: "string"
        for column in (
            "incident_nr",
            "event_type",
            "event_status",
            "event_date",
            "event_location",
        )
    },
)

In [4]:
# Keep only the rows where both PDF links are present.
non_missing_links = court_cases_df.loc[
    court_cases_df[["docket_sheet_link", "court_summary_link"]]
    .notna()
    .all(axis="columns")
]

In [None]:
# Check of the filter above: count missing vs. present values in each link column.
print(non_missing_links["docket_sheet_link"].isna().value_counts())
print(non_missing_links["court_summary_link"].isna().value_counts())

In [6]:
# Pull the columns needed for downloading out of the frame as plain lists.
docket_sheet_urls = non_missing_links["docket_sheet_link"].tolist()
courty_summary_urls = non_missing_links["court_summary_link"].tolist()
docket_nrs = non_missing_links["docket_number"].tolist()
county = non_missing_links["county"].tolist()

# One output file name per row: <county>_<docket number>_DS.pdf for docket
# sheets and <county>_<docket number>_CS.pdf for court summaries.
pdf_file_names_ds = [
    county_name + "_" + docket_nr + "_DS.pdf"
    for county_name, docket_nr in zip(county, docket_nrs)
]
pdf_file_names_cs = [
    county_name + "_" + docket_nr + "_CS.pdf"
    for county_name, docket_nr in zip(county, docket_nrs)
]

In [None]:
def download_pdf(url, name, key, timeout = 120):
    """Fetch `url` through the ZenRows API and save the response body to `name`.

    Parameters
    ----------
    url : str
        Address of the document to fetch (the target page, not the ZenRows
        endpoint).
    name : str
        Path of the file the response body is written to.
    key : str
        ZenRows API key, sent as the `apikey` query parameter.
    timeout : float, optional
        Seconds to wait for the server before `requests` gives up.

    Returns
    -------
    int
        HTTP status code of the response. The file is written only when the
        request succeeded.
    """
    # The target address travels in the `url` query parameter and the request
    # itself goes to the ZenRows endpoint. Requesting `url` directly would
    # bypass ZenRows and hand the API key to the target site.
    response = requests.get(
        url = "https://api.zenrows.com/v1/",
        params = {
            "url": url,
            "apikey": key
        },
        timeout = timeout
    )
    print(response.status_code)

    # Skip the write on failure so an error page is not saved as a PDF.
    if not response.ok:
        return response.status_code

    with open(name, 'wb') as f:
        f.write(response.content)

    return response.status_code

In [18]:
# load_dotenv() is expected to populate the process environment from a local
# .env file; the ZenRows key is then read from the environment (None if unset).
load_dotenv()
zenrows_api_key = os.environ.get("ZENROWS_API_KEY")

In [24]:
# Number of worker threads used by the ThreadPool download test.
concurrency = 5

# ZenRows SDK client, built with the API key read from the environment.
client = ZenRowsClient(zenrows_api_key)
# Single docket-sheet URL used as the test target throughout the experiments.
url = "https://ujsportal.pacourts.us/Report/CpDocketSheet?docketNumber=MC-51-SU-0000025-2020&dnh=xKIQV6bWbABDWSq%2FxjNhog%3D%3D"

# The same URL repeated 25 times, one entry per test download.
urls = [url] * 25

In [14]:
# Output paths case1.pdf ... case25.pdf, one per test download.
# NOTE(review): this is an absolute path on one user's machine; consider
# deriving it from a configurable output directory.
names = [
    '/home/joe/Documents/secret_lives_pa/scrape_links/output/case{}.pdf'.format(case_nr)
    for case_nr in range(1, 26)
]

In [23]:
# The same API key repeated 25 times, one entry per test download.
api_keys = [zenrows_api_key] * 25

In [4]:
# Fetch the docket-sheet test URL through the ZenRows SDK client.
# `client` and `url` are defined in a different cell, which must be run first.
response = client.get(url)

In [42]:
# Request an unrelated public PDF. The result is overwritten by the next
# request before it is used, so this presumably only checks connectivity.
response = requests.get(
    "https://communicate.gse.harvard.edu/files/commlab/files/_structure_of_a_paper.pdf"
)

# Request the docket sheet directly (no ZenRows) with a custom User-Agent
# header and report the HTTP status.
user_agent = "scrapping_script/1.0"
headers = {"User-Agent": user_agent}
response = requests.get(
    "https://ujsportal.pacourts.us/Report/CpDocketSheet?docketNumber=MC-51-SU-0000025-2020&dnh=xKIQV6bWbABDWSq%2FxjNhog%3D%3D",
    headers=headers,
)
print(response.status_code)

200


In [9]:
# Save the body of the most recent `response` as the first test PDF.
case1_pdf_path = '/home/joe/Documents/secret_lives_pa/scrape_links/output/case1.pdf'
with open(case1_pdf_path, mode='wb') as f:
    f.write(response.content)

In [29]:
# NOTE: an earlier cell also defines download_pdf; this later definition is
# the one in effect once both cells have run.
def download_pdf(url, name, key, timeout = 120):
    """Fetch `url` through the ZenRows API and save the response body to `name`.

    Parameters
    ----------
    url : str
        Address of the document to fetch (the target page, not the ZenRows
        endpoint).
    name : str
        Path of the file the response body is written to.
    key : str
        ZenRows API key, sent as the `apikey` query parameter.
    timeout : float, optional
        Seconds to wait for the server before `requests` gives up.

    Returns
    -------
    int
        HTTP status code of the response. The file is written only when the
        request succeeded.
    """
    # The target address travels in the `url` query parameter and the request
    # itself goes to the ZenRows endpoint. Requesting `url` directly would
    # bypass ZenRows and hand the API key to the target site.
    response = requests.get(
        url = "https://api.zenrows.com/v1/",
        params = {
            "url": url,
            "apikey": key
        },
        timeout = timeout
    )
    print(response.status_code)

    # Skip the write on failure so an error page is not saved as a PDF.
    if not response.ok:
        return response.status_code

    with open(name, 'wb') as f:
        f.write(response.content)

    return response.status_code

In [37]:
# Experiment: request the first test URL directly, with no ZenRows query
# parameters and no API key. The download_pdf call and the file write are
# deliberately left out; only the HTTP status is inspected.
response = requests.get(urls[0])
print(response.status_code)

# Control request to an unrelated site, for comparison with the status above.
response = requests.get("https://www.google.com")
print(response.status_code)

401
200


In [27]:
# Run the test downloads in parallel, `concurrency` worker threads at a time.
# Each call receives one (url, name, key) triple from the three parallel lists.
# (itertools.repeat could supply the constant key instead of a pre-built list.)
pool = ThreadPool(concurrency)
try:
    pool.starmap(download_pdf, zip(urls, names, api_keys))
finally:
    # Always release the worker threads, even when a download raises.
    pool.close()
    pool.join()

401
401
401
401
401
401
401
401
401
401
401
401
401
401
401
401
401
401
401
401
401
401
401
401
401
