In [1]:
!pip install wayback



In [2]:
import re
import time
import wayback
import pandas as pd

import logging
import torch

from newspaper import Article, ArticleException
from pathlib import Path
from tqdm.notebook import tqdm
from glob import glob
from pathlib import Path
from tqdm.notebook import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

## Create a log of webpages when scraping

In [7]:
# logging.basicConfig() configures the root logger and returns None, so call it
# on its own and bind `logger` to an actual Logger object.
logging.basicConfig(filename="lea-links.log", level=logging.INFO)
logger = logging.getLogger()

def get_article(url):
    """Download and parse the page at ``url`` with newspaper's ``Article``.

    Every attempt is logged at INFO level before the download starts.
    Returns the parsed ``Article``; if an ``ArticleException`` is raised the
    error is logged at ERROR level and ``None`` is returned instead.
    """
    logging.info('fetching url: %s', url)
    try:
        page = Article(url)
        page.download()
        page.parse()
    except ArticleException as err:
        logging.error("caught article error: {}".format(err))
        return None
    return page

# Try get_article on a single URL; the title is displayed as the cell output.
# NOTE(review): get_article returns None on ArticleException, in which case
# the attribute access below raises AttributeError.
article = get_article('https://www.cityofbell.org/?NavID=1')
article.title

'City of Bell'

In [None]:
# NOTE(review): df_lea is only created by the pd.read_csv cell further down,
# so that cell has to run before this one on a fresh kernel.
# ignore any spreadsheet rows without an ID or URL
df_lea = df_lea.dropna(subset=['Ref', 'Site'])

# Request every LEA site once through get_article, which logs each URL it
# fetches and any ArticleException it catches.
for i, row in tqdm(list(df_lea.iterrows())):
#     lea_ref = i
    
#     text_file = Path("data") / f"{lea_ref}.txt"
#     if text_file.is_file():
#         continue

    url = row['Site']
    article = get_article(url)
    # The fetched article is not stored: writing its text to disk is commented out below.
#     if article:
#         text_file.open("w").write(article.text)

In [6]:
# Collect every URL recorded in the scraping log as a 404 failure.
urls = []

# `with` closes the log file once it has been read.
with open('lea-links.log') as log_file:
    for line in log_file:
        # Only lines that match the 404 pattern carry a URL, so the match has
        # to succeed before group(1) is read.
        if m := re.search(r'failed with 404 Client Error: .* on URL (.*)', line):
            urls.append(m.group(1))

len(urls)

FileNotFoundError: [Errno 2] No such file or directory: 'fe-links.log'

In [None]:
# Number of collected 404 URLs relative to the number of rows in df_lea.
# NOTE(review): assumes `urls` is still the list built from the log, not the DataFrame assigned to `urls` later.
len(urls) / len(df_lea)

## Retrieve archived links of LEA websites, policy manuals, ME policies, and MEI


### (1) For LEA website links (Site)

In [17]:
# Wayback client used by archived_page below; its session is created with retries=25.
session = wayback.WaybackSession(retries=25)
wb = wayback.WaybackClient(session)

def archived_page(url, delay=.5):
    """Return a Wayback Machine view URL for ``url``.

    Parameters
    ----------
    url : str or missing value
        Original link from the dataset. Missing values (``NaN``/``None``)
        occur often, especially in the MEI column.
    delay : float, optional
        Seconds to pause before querying the archive (default 0.5).

    Returns
    -------
    str or None
        * ``'--No link for this in original dataset--'`` when ``url`` is
          missing;
        * the ``view_url`` of the first search result whose status code is
          200;
        * ``None`` when no result has status 200, or when the search raises
          ``BlockedSiteError``.
    """
    # Handle a missing link first: no request is made for it, so there is
    # nothing to throttle and no reason to sleep.
    if pd.isna(url):  # especially for MEI column that has lots of missing links
        return '--No link for this in original dataset--'

    # Pause before each archive query.
    time.sleep(delay)
    try:
        for result in wb.search(url):
            if result.status_code == 200:
                return result.view_url
    except wayback.exceptions.BlockedSiteError:
        return None
    # The search finished without yielding a result with status 200.
    return None

# archived_page(urls[0])

In [6]:
# Load the LEA spreadsheet export into df_lea, which the other cells read from.
df_lea = pd.read_csv("LEA-20230424 (1).csv")

In [23]:
# Look up an archived copy of every LEA website, echoing each result as it arrives.
archived_urls = []
for site_url in df_lea['Site']:
    snapshot_url = archived_page(site_url)
    print(snapshot_url)
    archived_urls.append(snapshot_url)

https://web.archive.org/web/20230719150234/https://www.baldwinparkpolice.com/Search?searchPhrase=manual
https://web.archive.org/web/20130825153531/http://cityofbell.org/?navid=1
https://web.archive.org/web/20150721005207/http://www.cityofbishop.com/departments/police/
https://web.archive.org/web/20200509135050/https://hollywoodburbankairport.com/airport-authority/bgpaa-police/
https://web.archive.org/web/20010608065944/http://www.ci.carmel.ca.us/
https://web.archive.org/web/20180319152651/http://www.cerritos.edu/police/default.htm
https://web.archive.org/web/20200918234132/https://www.goldenwestcollege.edu/public-safety/index.html
https://web.archive.org/web/20230513210125/https://www.compton.edu/campus-safety/index.aspx
https://web.archive.org/web/20210622003243/https://www.corning.org/departments/police-department/
https://web.archive.org/web/20210422101349/https://www.crescentcity.org/departments/Police
https://web.archive.org/web/20201123225923/https://www.csum.edu/police-departmen

KeyboardInterrupt: 

In [24]:
# One row per LEA: reference id, agency name and the archived website link.
site_columns = {
    "ref": df_lea['Ref'],
    "lea name": df_lea['LEA_Name'],
    "url": archived_urls,
}
urls = pd.DataFrame(site_columns)
urls

Unnamed: 0,ref,lea name,url
0,SB978-703,Baldwin Park Police Department,https://web.archive.org/web/20230719150234/htt...
1,SB978-704,Bell Police Department,https://web.archive.org/web/20130825153531/htt...
2,SB978-705,Bishop Police Department,https://web.archive.org/web/20150721005207/htt...
3,SB978-706,Burbank-Glendale-Pasadena Airport Authority Po...,https://web.archive.org/web/20200509135050/htt...
4,SB978-710,Carmel Police Department,https://web.archive.org/web/20010608065944/htt...
...,...,...,...
491,SB978-627,Yolo County District Attorney,https://web.archive.org/web/20010405052338/htt...
492,SB978-628,Yolo County Sheriff's Department,https://web.archive.org/web/20020928191924/htt...
493,SB978-629,Yreka Police Department,https://web.archive.org/web/20090310232942/htt...
494,SB978-630,Yuba City Police Department,https://web.archive.org/web/20160402115655/htt...


In [4]:
# Write the table of archived website links to a UTF-8 encoded CSV file.
urls.to_csv('lea_archived_links.csv', encoding='utf-8')

NameError: name 'urls' is not defined

### (2) For all other links in LEA dataset (Policy, ME Policy, MEI)

In [18]:
# Look up an archived copy of every policy-manual link, echoing each result as it arrives.
archived_policy_urls = []
for policy_url in df_lea['QA_Policy_Manual']:
    snapshot_url = archived_page(policy_url)
    print(snapshot_url)
    archived_policy_urls.append(snapshot_url)

https://web.archive.org/web/20210420203617/https://www.baldwinparkpolice.com/DocumentCenter/View/96/Baldwin-Park-Police-Department-Policy-Manual-PDF
https://web.archive.org/web/20230719150156/https://www.cityofbell.org/home/showdocument?id=13315
None
None
https://web.archive.org/web/20211026174554/https://ci.carmel.ca.us/sites/main/files/file-attachments/carmel_by_the_sea_pd_policy_manual.pdf?1621872907
https://web.archive.org/web/20221203192654/https://www.cerritos.edu/police/_includes/docs/RELEASE_20220222_T115017_Cerritos_College_PD_Policy_Manual.pdf
https://web.archive.org/web/20201202163950/https://www.goldenwestcollege.edu/Links/pdf/RELEASE_20190924_T141046_Coast_Community_College_Police_Department_Protocol_Manual.pdf
https://web.archive.org/web/20220526045259/https://www.compton.edu/adminandoperations/campuspolice/Documents/CCPD_Policy-Manual.pdf
https://web.archive.org/web/20211024122411/https://www.corning.org/documents/policy-manual/
https://web.archive.org/web/20220927111354

https://web.archive.org/web/20211024120635/https://www.beaumontpd.org/DocumentCenter/View/37037/Beaumont-Police-Department-Policy-PDF
https://web.archive.org/web/20230719150405/https://www.bellgardens.org/home/showpublisheddocument/5591/637938257880670000
https://web.archive.org/web/20220303092615/https://www.belmont.gov/home/showpublisheddocument/18935/637632371657830000
https://web.archive.org/web/20230326184140/https://www.cityofbelvedere.org/DocumentCenter/View/7587/Policy-207---Police-Policy-Manual
None
https://web.archive.org/web/20221119020336/https://berkeleyca.gov/sites/default/files/documents/RELEASE_20220921_T152222_Berkeley_PD_Policy_Manual.pdf
https://web.archive.org/web/20200921002238/https://www.beverlyhills.org/cbhfiles/storage/files/1787600804952259300/BHPDManual(2-2020).pdf
https://web.archive.org/web/20210427001434/https://www.cityofblythe.ca.gov/DocumentCenter/View/1465/Blythe-Police-Department-Policy-Manual
https://web.archive.org/web/20230719150426/https://www.bra

https://web.archive.org/web/20220107222708/https://csumb.edu/media/csumb/section-editors/university-police/CSU_Monterey_Bay_PD_CA_Policy_Manual.pdf
https://web.archive.org/web/20210415032213/https://www.csun.edu/police/policy-manual-0
https://web.archive.org/web/20230719155039/https://www.csus.edu/campus-safety/police-department/_internal/_documents/public__csu_sacramento_pd_policy_manual.pdf
https://web.archive.org/web/20200812095139/https://www.csusb.edu/police/public-information/csusb-university-police-policies
https://web.archive.org/web/20210415041720/https://police.sdsu.edu/about/sdsupd_manual.pdf
https://web.archive.org/web/20210923171439/https://www.sjsu.edu/police/docs/forms/UPD%20Policy%20Manual.pdf
https://web.archive.org/web/20210117185451/https://www.csusm.edu/police/policepolicymanual.pdf
https://web.archive.org/web/20220415201734/https://police.sonoma.edu/sites/police/files/images/release_20211005_t155608_sonoma_state_university_police_policy_manual.pdf
https://web.archi

https://web.archive.org/web/20211102154922/https://www.hayward-ca.gov/sites/default/files/documents/Hayward-PD-Policy-Manual-2106.pdf
https://web.archive.org/web/20210430063230/https://www.ci.healdsburg.ca.us/DocumentCenter/View/11325/Healdsburg_PD_Policy_Manual
https://web.archive.org/web/20210118202104/https://www.hemetca.gov/DocumentCenter/View/6188/Hemet_PD_Policy_Manual--Update-release-12242019
https://web.archive.org/web/20230719155157/https://www.ci.hercules.ca.us/home/showpublisheddocument/13285/637492559842770000
https://web.archive.org/web/20210427031604/https://www.hermosabeach.gov/Home/ShowDocument?id=13444
https://web.archive.org/web/20200510191218/https://www.hillsborough.net/DocumentCenter/View/3774/HPD-Policy-Manual-?bidId=
https://web.archive.org/web/20200318154248/http://hollister.ca.gov/wp-content/uploads/2020/01/Hollister_PD_Policy_Manual-RELEASE_20191226.pdf
https://web.archive.org/web/20200626041125/https://humboldtgov.org/DocumentCenter/View/86838/HCSO-Policy-Man

https://web.archive.org/web/20211114131044/https://www.cityofmillvalley.org/DocumentCenter/View/474/MVPD-Policy-Manual-2019-PDF
https://web.archive.org/web/20220423033810/https://www.milpitas.gov/_pdfs/MilpitasPolicePolicy.pdf
https://web.archive.org/web/20210301161131/https://miracosta.edu/administrative/college-police/_docs/MCPD%20PP%20Manual.pdf
https://web.archive.org/web/20210119213404/https://modestogov.com/DocumentCenter/View/15365/Modesto-Police-Department-Policy-Manual-Printable-PDF
None
https://web.archive.org/web/20210415235626/https://monosheriff.org/sites/default/files/fileattachments/sheriff_-_coroner/page/8062/mono_county_sheriff_patrol_policy_manual_.pdf
https://web.archive.org/web/20230719155245/https://www.cityofmonrovia.org/home/showpublisheddocument/27766/637824223984130000
https://web.archive.org/web/20220812114347/https://storage.googleapis.com/proudcity/montclairca/uploads/2022/05/Policy-051622-CURRENT.pdf
None
https://web.archive.org/web/20210319083704/http://ww

https://web.archive.org/web/20220624233742/https://rialtopolice.com/wp-content/uploads/2022/05/rialtopd-policy-2022-03-16.pdf
https://web.archive.org/web/20210429042714/https://www.ci.richmond.ca.us/DocumentCenter/View/51870/Policy-Manual-12312019
https://web.archive.org/web/20210427001848/https://www.ridgecrest-ca.gov/DocumentCenter/View/6221/RPD-Policy-and-Procedure-Manual
None
https://web.archive.org/web/20210415085136/https://riponpd.org/wp-content/uploads/2021/02/RELEASE_20210205_T164435_Ripon_PD_Policy_Manual.pdf
https://web.archive.org/web/20210415235628/https://www.rccd.edu/police/Documents/manuals/Riverside_Community_College_District_Safety_and_Police_Department__Manual.pdf
https://web.archive.org/web/20220811022734/https://www.riversidesheriff.org/DocumentCenter/View/6956/Department-Standards-Manual-71522
https://web.archive.org/web/20201017014154/https://riversideca.gov/rpd/sites/riversideca.gov.rpd/files/pdf/2020/RELEASE_20200317_T100316_Riverside_PD_Policy_Manual.pdf
https

https://web.archive.org/web/20230719151650/https://www.simivalley.org/home/showpublisheddocument/24304/637976364208530000
https://web.archive.org/web/20210415033707/https://www.co.siskiyou.ca.us/sites/default/files/fileattachments/enforcement_division/page/1491/so_20191231_policymanual.pdf
https://web.archive.org/web/20230719155445/https://public.powerdms.com/SolanoCounty/tree
https://web.archive.org/web/20220630050705/https://www.cityofsoledad.com/download/soledad-pd-policy-manual/
https://web.archive.org/web/20221203203336/https://sonomacounty.ca.gov/Microsites/Permit%20Sonoma/Documents/Archive/Misc/Code-of-Conduct-ADA.pdf
https://web.archive.org/web/20230719155447/https://static1.squarespace.com/static/542ec317e4b0d41ade8801fb/t/5fea1c3a58d2771921767265/1609178182899/Manual+12-23-2020.pdf
https://web.archive.org/web/20230719155447/https://static1.squarespace.com/static/542ec317e4b0d41ade8801fb/t/5fea1c3a58d2771921767265/1609178182899/Manual+12-23-2020.pdf
https://web.archive.org/web

https://web.archive.org/web/20220103070924/https://www.cityofwoodland.org/DocumentCenter/View/7572/Woodland_PD_Policy_Manual-12_27_2021_Redacted?bidId=
https://web.archive.org/web/20230719155541/https://yoloda.org/wp-content/uploads/2020/06/RELEASE_20200612_T141909_Yolo_County_DA_Policy_Manual.pdf
https://web.archive.org/web/20230719155539/https://www.yolocountysheriff.com/wp-content/uploads/2020/03/Sheriffs-Office-Policy-Manual.pdf
https://web.archive.org/web/20210427221233/http://ci.yreka.ca.us/DocumentCenter/View/824/Yreka_PD_Policy_Manual
https://web.archive.org/web/20210427044004/https://www.yubacity.net/common/pages/DisplayFile.aspx?itemId=17298234
None


In [19]:
# Look up an archived copy of every military-equipment policy link, echoing each result as it arrives.
archived_me_pol_urls = []
for me_policy_url in df_lea['QA_Mil_Equip_Policy']:
    snapshot_url = archived_page(me_policy_url)
    print(snapshot_url)
    archived_me_pol_urls.append(snapshot_url)

None
--No link for this in original dataset--
None
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
https://web.archive.org/web/20220627075133/https://www.corning.org/documents/military-equipment-policy/
https://web.archive.org/web/20220927111354/https://www.crescentcity.org/media/Police/RELEASE_20220811_T170935_Crescent_City__Police_Department_Manual.pdf
--No link for this in original dataset--
--No link for this in original dataset--
https://web.archive.org/web/20230719154932/https://www.csustan.edu/sites/default/files/groups/University%20Police%20Department/documents/stanislaus-state-pd-policy-manual-2022.pdf
--No link for this in original dataset--
https://web.archive.org/web/20221101025432/https://www.fusd.net/cms/lib/CA50000190/Centricity/Domain/245/Military%20Equipment.pdf
https://web.archive.org/web/20221001051727/https://po

https://web.archive.org/web/20210415041431/https://www.4cd.edu/pd/docs/contra-costa-community-college-pd-manual.pdf
None
https://web.archive.org/web/20230719155027/https://www.cocosheriff.org/home/showpublisheddocument/442/637873667474700611
None
None
https://web.archive.org/web/20221201225849/https://www.coronado.ca.us/DocumentCenter/View/1166/AB-481-PDF
None
https://web.archive.org/web/20210306012203/https://police.losrios.edu/lrpd/doc/policy-manual.pdf
None
https://web.archive.org/web/20230719155032/https://covinapd.org/wp-content/uploads/2022/09/covinapd-policy-manual-2022-09.pdf
--No link for this in original dataset--
https://web.archive.org/web/20221010233724/https://afd.calpoly.edu/police/police-administration/policies/700/709-regulated-equipment-tools.pdf
None
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for t

--No link for this in original dataset--
https://web.archive.org/web/20230326034119/https://cityoflapalma.org/DocumentCenter/View/11425/Military-Equipment-DRAFT-Policy?bidId=
None
None
https://web.archive.org/web/20220523032150/http://www.lakesheriff.com/Assets/Sheriff+Site/Public+Resources/Use+of+Force/LCSO+Policies/militaryequipment.pdf
None
--No link for this in original dataset--
https://web.archive.org/web/20230325105542/https://lemoore.com/wp-content/uploads/2022/06/Lemoore_PD_Policy_Manual-2022.pdf
https://web.archive.org/web/20220402074528/https://www.lincolnca.gov/en/living-here/Public-Safety/AB-481.pdf
--No link for this in original dataset--
None
None
None
None
https://web.archive.org/web/20220509223650/https://www.longbeach.gov/globalassets/police/media-library/documents/about-the-lbpd/ab-481/military-equipment-so
https://web.archive.org/web/20230201194509/http://www.cityoflosalamitos.org/DocumentCenter/View/1952/Policy-709---AB-481-Military-Equipment-Policy
None
--No link 

https://web.archive.org/web/20230320042952/https://www.cosb.us/home/showpublisheddocument/8963/637946100730970000
https://web.archive.org/web/20230324191115/https://wp.sbcounty.gov/sheriff/wp-content/uploads/sites/17/Military-Equipment-Use-Policy-52422.pdf
None
https://web.archive.org/web/20220527070240/https://public.powerdms.com/SanBrunoPD/documents/2405067
None
https://web.archive.org/web/20220525121659/https://pantheonstorage.blob.core.windows.net/public-safety/Port-of-San-Diego-Harbor-Police-Policy-712-Military-Equipment-Use.pdf
https://web.archive.org/web/20220210223700/https://www.sandiego.gov/sites/default/files/ab_481_procedure_public_draft.pdf
https://web.archive.org/web/20220807155320/https://ci.san-fernando.ca.us/wp-content/uploads/2022/04/Military_Equipment-draft-042722.pdf
None
https://web.archive.org/web/20221002212513/https://www.sanfranciscopolice.org/your-sfpd/policies/law-enforcement-equipment-use-policy
None
--No link for this in original dataset--
https://web.archi

None
--No link for this in original dataset--
https://web.archive.org/web/20221129133209/https://www.wcpd.org/wp-content/uploads/2022/07/20220708_West_Covina_PD_Policy_Manual_Redacted.pdf
None
None
None
https://web.archive.org/web/20220803152213/http://www.wheatland.ca.gov/wp-content/uploads/RELEASE_20220705_T120950_Wheatland_PD_Policy_Manual.pdf
https://web.archive.org/web/20230719151842/https://www.cityofwhittier.org/home/showpublisheddocument/6434/637861240116070000
--No link for this in original dataset--
--No link for this in original dataset--
https://web.archive.org/web/20220809023609/http://www.winterspolice.org/wp-content/uploads/2022/07/RELEASE_20220705_T165602_Winters_Police_Department_Policy_Manual.pdf
https://web.archive.org/web/20220402152832/https://www.cityofwoodland.org/DocumentCenter/View/7793/Military-Equipment-Policy-706-DRAFT?bidId=
--No link for this in original dataset--
--No link for this in original dataset--
None
None
--No link for this in original dataset--


In [20]:
# Look up an archived copy of every separate inventory (MEI) link, echoing each result as it arrives.
archived_mei_urls = []
for mei_url in df_lea['QA Mil_Equip_Inventory (if separate)']:
    snapshot_url = archived_page(mei_url)
    print(snapshot_url)
    archived_mei_urls.append(snapshot_url)

None
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link f

--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
None
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
None
https://web.archive.org/web/20220816163245/https://www.hayward-ca.gov/sites/default/files/police/AB-481-HPD-Equipment-List-%28PDF-Final%29.pdf
https://web.archive.org/web/20220930035307/https://www.ci.healdsburg.ca.us/DocumentCenter/View/13994/Healdsburg_Police_Military_Use_Policy_Inventory_List?bidId=
--No link for this in original dataset--
--No link for this in original dataset--
None
--No link for this in original dataset--
https://web.archive.org/w

--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
https://web.archive.org/web/20220819150806/https://santabarbaraca.gov/sites/default/files/documents/Police/Military%20Equipment%20Inventory%20Per%20Assembly%20Bill%20No.%20481/SBPD%20Military%20Equipment%20Inventory%202022.pdf
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original dataset--
--No link for this in original data

In [13]:
# Sanity check: print whether the inventory link in the row labelled 1 is a missing value.
mei_column = 'QA Mil_Equip_Inventory (if separate)'
empty_link = df_lea.loc[1, mei_column]
print(pd.isna(empty_link))

True


In [24]:
# One row per LEA: reference id, agency name and the three archived document links.
document_columns = {
    "ref": df_lea['Ref'],
    "lea name": df_lea['LEA_Name'],
    "policy url": archived_policy_urls,
    "me policy url": archived_me_pol_urls,
    "mei url (if separate)": archived_mei_urls,
}
urls = pd.DataFrame(document_columns)
urls

Unnamed: 0,ref,lea name,policy url,me policy url,mei url (if separate)
0,SB978-703,Baldwin Park Police Department,https://web.archive.org/web/20210420203617/htt...,,
1,SB978-704,Bell Police Department,https://web.archive.org/web/20230719150156/htt...,--No link for this in original dataset--,--No link for this in original dataset--
2,SB978-705,Bishop Police Department,,,--No link for this in original dataset--
3,SB978-706,Burbank-Glendale-Pasadena Airport Authority Po...,,--No link for this in original dataset--,--No link for this in original dataset--
4,SB978-710,Carmel Police Department,https://web.archive.org/web/20211026174554/htt...,--No link for this in original dataset--,--No link for this in original dataset--
...,...,...,...,...,...
491,SB978-627,Yolo County District Attorney,https://web.archive.org/web/20230719155541/htt...,--No link for this in original dataset--,--No link for this in original dataset--
492,SB978-628,Yolo County Sheriff's Department,https://web.archive.org/web/20230719155539/htt...,--No link for this in original dataset--,https://web.archive.org/web/20230518043801/htt...
493,SB978-629,Yreka Police Department,https://web.archive.org/web/20210427221233/htt...,,https://web.archive.org/web/20220709133908/htt...
494,SB978-630,Yuba City Police Department,https://web.archive.org/web/20210427044004/htt...,,--No link for this in original dataset--


In [25]:
# Write the table of archived policy, ME policy and MEI links to a UTF-8 encoded CSV file.
urls.to_csv('lea_all_archived_links.csv', encoding='utf-8')