In [21]:
import pdfplumber
import pandas as pd
import re
import os

from tqdm import tqdm
import requests

tqdm.pandas()


def extract_emails(text):
    email_pattern = r"[^\s]+@[^\s]+\.\w+"
    return re.findall(email_pattern, text)


def extract_urls(text):
    url_pattern = r"[^\s@:]+\.[a-zA-Z]{2,}"
    return re.findall(url_pattern, text)


def get_domain_from_email(email):
    return email.split("@")[1]


def domain_matches_pdf_name(domain, pdf_name):
    if "//" in domain:
        domain = domain.split("//")[1]
    domain = domain.replace("www.", "")
    if domain is None:
        return False
    pdf_name = (
        pdf_name.replace(".", "")
        .replace("-", "")
        .replace(" ", "")
        .replace("&", "")
        .lower()
    )
    domain = domain.split(".")[0].lower()
    return domain in pdf_name


def domain_exists(domain):
    if domain is None:
        return False
    if not domain.startswith("https://"):
        domain = "https://" + domain
    response = requests.get(f"https://{domain}", timeout=2)
    return response.status_code < 404


data = []

pdf_files = [f for f in os.listdir("pdfs") if f.endswith(".pdf")]

MAX = 20

for pdf_file in tqdm(pdf_files):
    with pdfplumber.open(os.path.join("pdfs", pdf_file)) as pdf:
        text = ""
        for page in pdf.pages:
            try:
                text += page.extract_text() or ""
            except:
                text += ""
        emails = extract_emails(text)
        urls = extract_urls(text)
        for email in emails:
            if email.endswith("gmail.com") or email.endswith("me.com"):
                continue
            domain = get_domain_from_email(email)

            data.append(
                {
                    "pdf_name": pdf_file,
                    "email": email,
                    "domain": domain,
                    # "domain_exists": domain_exists(domain),
                    "domain_matches_pdf_name": domain_matches_pdf_name(
                        domain, pdf_file
                    ),
                }
            )
        for url in urls:
            data.append(
                {
                    "pdf_name": pdf_file,
                    "email": None,
                    "domain": url,
                    # "domain_exists": domain_exists(url),
                    "domain_matches_pdf_name": domain_matches_pdf_name(url, pdf_file),
                }
            )
        if len(emails) == 0 and len(urls) == 0:
            data.append(
                {
                    "pdf_name": pdf_file,
                    "email": None,
                    "domain": None,
                    # "domain_exists": False,
                    "domain_matches_pdf_name": False,
                }
            )

df = pd.DataFrame(data).drop_duplicates()

df

100%|██████████| 289/289 [08:14<00:00,  1.71s/it]


Unnamed: 0,pdf_name,email,domain,domain_matches_pdf_name
0,sycuan-casino-resort-wedding-packages.pdf,groupsales@sycuan.com,sycuan.com,True
1,sycuan-casino-resort-wedding-packages.pdf,,30pm.Policies,False
2,sycuan-casino-resort-wedding-packages.pdf,,sycuan.com,True
3,River Garden Weddings & Events San Diego-merge...,info@foreverenchantedevents.com,foreverenchantedevents.com,False
4,River Garden Weddings & Events San Diego-merge...,reagan@edeneventsbyrea.com,edeneventsbyrea.com,False
...,...,...,...,...
5207,La Jolla Beach & Tennis Club-merged.pdf,,Q.Are,False
5211,the betty long beach.pdf,,,False
5212,San Juan Hills Golf Club.pdf,,www.SanJuanHillsGolf.comCASCADA,False
5213,Lindley-Scott House - 2024 Brochure.pdf,,gmail.com,False


In [34]:
def domain_exists(domain):
    if domain is None:
        return False
    if not domain.startswith("https://"):
        domain = "https://" + domain
    try:
        _ = requests.get(f"{domain}", timeout=2)
        return True
    except requests.ConnectionError:
        return False
    except:
        return False


def domain_matches_pdf_name(domain, pdf_name):
    if domain is None:
        return False
    if "//" in domain:
        domain = domain.split("//")[1]
    domain = domain.replace("www.", "")
    if domain is None:
        return False
    pdf_name = (
        pdf_name.replace(".", "")
        .replace("-", "")
        .replace(" ", "")
        .replace("&", "")
        .lower()
    )
    domain = domain.split(".")[0].lower()
    return domain in pdf_name

In [35]:
df.domain.str.startswith("www.").sum()

473

In [36]:
df["domain_matches_pdf_name"] = df.progress_apply(
    lambda x: domain_matches_pdf_name(x["domain"], x["pdf_name"]), axis=1
)

100%|██████████| 3724/3724 [00:00<00:00, 60402.91it/s]


In [37]:
domain_exists_column = (
    df[df["domain_matches_pdf_name"] == True][["pdf_name", "domain"]]
    .drop_duplicates()
    .progress_apply(lambda x: domain_exists(x["domain"]), axis=1)
)
df["domain_exists"] = domain_exists_column
df

100%|██████████| 236/236 [01:23<00:00,  2.83it/s]


Unnamed: 0,pdf_name,email,domain,domain_matches_pdf_name,domain_exists
0,sycuan-casino-resort-wedding-packages.pdf,groupsales@sycuan.com,sycuan.com,True,True
1,sycuan-casino-resort-wedding-packages.pdf,,30pm.Policies,False,
2,sycuan-casino-resort-wedding-packages.pdf,,sycuan.com,True,
3,River Garden Weddings & Events San Diego-merge...,info@foreverenchantedevents.com,foreverenchantedevents.com,False,
4,River Garden Weddings & Events San Diego-merge...,reagan@edeneventsbyrea.com,edeneventsbyrea.com,False,
...,...,...,...,...,...
5207,La Jolla Beach & Tennis Club-merged.pdf,,Q.Are,False,
5211,the betty long beach.pdf,,,False,
5212,San Juan Hills Golf Club.pdf,,www.SanJuanHillsGolf.comCASCADA,True,False
5213,Lindley-Scott House - 2024 Brochure.pdf,,gmail.com,False,


In [49]:
import os

len(os.listdir("pdfs"))

291

In [40]:
df.groupby("pdf_name")["domain_exists"].any(lambda x: x == True).sum() / len(
    df.pdf_name.unique()
)

np.float64(0.25259515570934254)

289

In [48]:
x = df.groupby("pdf_name")["domain_exists"].any(lambda x: x == True)
not_good_pdfs = x[x == False].index

df[df.pdf_name.isin(not_good_pdfs)]

Unnamed: 0,pdf_name,email,domain,domain_matches_pdf_name,domain_exists
125,the point san diego.pdf,,,False,
126,cordiano winery.pdf,,,False,
140,Santa Barbara Historical Museum.pdf,,tinyurl.com,False,
141,Julep.pdf,EMAIL:events@julepvenue.com,julepvenue.com,False,
142,Julep.pdf,,julepvenue.com,False,
...,...,...,...,...,...
5207,La Jolla Beach & Tennis Club-merged.pdf,,Q.Are,False,
5211,the betty long beach.pdf,,,False,
5212,San Juan Hills Golf Club.pdf,,www.SanJuanHillsGolf.comCASCADA,True,False
5213,Lindley-Scott House - 2024 Brochure.pdf,,gmail.com,False,
