In [None]:
import pandas as pd
import requests
from PyPDF2 import PdfReader
from io import BytesIO
import fitz  # PyMuPDF
from tqdm import tqdm
from transformers import pipeline

In [3]:
df = pd.read_csv("professional_only.csv")

In [4]:
df

Unnamed: 0,Category,PDF Link
0,Professional,https://www.aim-ahead.net/media/hd3fws1l/dstc-...
1,Professional,https://www.aim-ahead.net/media/3cvd4x4g/pair-...
2,Professional,https://www.aim-ahead.net/media/svfjvjr5/aihes...
3,Professional,https://www.aim-ahead.net/media/5q3hmb5b/webin...
4,Professional,https://www.aim-ahead.net/media/32ena1ts/intro...
...,...,...
1449,Professional,https://www.multiperspectivesjournal.com/uploa...
1450,Professional,https://www.cureus.com/articles/281861-a-revie...
1451,Professional,https://www.inovigate.com/media/filer_public/6...
1452,Professional,https://legacy.himss.org/sites/hde/files/media...


In [7]:
def extract_text_from_pdf_url(url):
    try:
        response = requests.get(url)
        pdf = PdfReader(BytesIO(response.content))
        text = ""
        for page in pdf.pages[:3]:
            text += page.extract_text()
        return text
    except Exception as e:
        return f"ERROR: {e}"

In [None]:
def classify_text(text):
    text_lower = text.lower()
    if "university" in text_lower or "institute" in text_lower:
        return "academic"
    elif "foundation" in text_lower or "non-profit" in text_lower or "ngo" in text_lower:
        return "nonprofit"
    else:
        return "undecided"

In [None]:
df['text'] = df['PDF Link'].apply(extract_text_from_pdf_url)
df['sub_category'] = df['text'].apply(classify_text)

Multiple definitions in dictionary at byte 0xfbf10 for key /PageMode
Multiple definitions in dictionary at byte 0xb406b for key /PageMode
incorrect startxref pointer(1)
unknown widths : 
[0, IndirectObject(9848, 0, 4474764496), 1, 2, 0, 3, IndirectObject(9849, 0, 4474764496), 4, 14, 606, 15, 16, 775, 17, IndirectObject(9850, 0, 4474764496), 18, 23, 529, 24, 25, 630, 26, 27, 639, 28, 37, 488, 38, IndirectObject(9851, 0, 4474764496), 39, 43, 637, 44, 45, 631, 46, IndirectObject(9852, 0, 4474764496), 47, 56, 267, 57, IndirectObject(9853, 0, 4474764496), 58, 59, 331, 60, 61, 547, 62, 63, 423, 64, IndirectObject(9854, 0, 4474764496), 69, 73, 659, 74, IndirectObject(9855, 0, 4474764496), 75, 83, 676, 84, 85, 681, 86, IndirectObject(9856, 0, 4474764496), 87, 88, 532, 89, IndirectObject(9857, 0, 4474764496), 90, 93, 563, 94, 99, 473, 100, 103, 495, 104, 114, 653, 115, IndirectObject(9858, 0, 4474764496), 116, 120, 906, 121, IndirectObject(9859, 0, 4474764496), 122, 126, 520, 127, 130, 478, 131

In [13]:
df['sub_category'].value_counts()

sub_category
undecided    814
academic     496
nonprofit    144
Name: count, dtype: int64

In [None]:
# keywords for classification
academic_kw = ['university', 'college', 'professor', 'grant', 'poster', 'Ph.D.', 'R01', 'fellowship']
nonprofit_kw = ['foundation', 'outreach', 'community', '501(c)(3)', 'underserved', 'nonprofit']

# text extraction
def extract_text_from_url(url, max_chars=2000):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        doc = fitz.open(stream=BytesIO(response.content), filetype="pdf")
        text = ""
        for page in doc:
            text += page.get_text()
            if len(text) >= max_chars:
                break
        return text.lower()
    except Exception as e:
        return f"__ERROR__: {e}"

# classification function
def classify(text):
    if "__ERROR__" in text:
        return "error"
    a_score = sum(k in text for k in academic_kw)
    n_score = sum(k in text for k in nonprofit_kw)
    if a_score > n_score:
        return 'academic'
    elif n_score > a_score:
        return 'nonprofit'
    else:
        return 'uncertain'

# time check
new_categories = []
for url in tqdm(df['PDF Link'], desc="Classifying PDF documents"):
    text = extract_text_from_url(url)
    label = classify(text)
    new_categories.append(label)

df['Category'] = new_categories

# Results
df.to_csv('reclassified_pdfs.csv', index=False)

Classifying PDF documents: 100%|██████████| 1454/1454 [48:12<00:00,  1.99s/it]   


In [None]:
df1 = pd.read_csv('reclassified_pdfs.csv')
df1['Category'].value_counts()

Category
uncertain    640
academic     400
nonprofit    217
error        197
Name: count, dtype: int64

In [None]:
# CSV
df = pd.read_csv("professional_only.csv").dropna(subset=["PDF Link"])

# labels for zero-shot classification
labels = ["nonprofit", "academic institution", "professional medical organization"]

# HuggingFace model
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# text extraction from PDF URLs
def extract_text_from_pdf_url(url, max_chars=2000):
    try:
        response = requests.get(url, timeout=15)
        response.raise_for_status()
        doc = fitz.open(stream=BytesIO(response.content), filetype="pdf")
        text = ""
        for page in doc:
            text += page.get_text()
            if len(text) >= max_chars:
                break
        return text.strip()
    except Exception as e:
        return f"__ERROR__: {e}"

results = []

for i, row in tqdm(df.iterrows(), total=len(df)):
    url = row["PDF Link"]
    text = extract_text_from_pdf_url(url)

    if "__ERROR__" in text or len(text.strip()) == 0:
        label = "error"
    else:
        try:
            result = classifier(text[:1000], candidate_labels=labels)
            label = result["labels"][0]
        except Exception as e:
            label = f"error: {e}"

    results.append({
        "filename": f"doc_{i}.pdf",
        "text": text[:1000],
        "label": label
    })

# results
df_result = pd.DataFrame(results)
df_result.to_csv("final_labeled_documents.csv", index=False)
print("Done!")


Device set to use cpu
100%|██████████| 1454/1454 [3:57:18<00:00,  9.79s/it]  


Done!


In [None]:
df1 = pd.read_csv('labeled_documents.csv')
df1['label'].value_counts()

label
professional medical organization    689
academic institution                 516
error                                190
nonprofit                             59
Name: count, dtype: int64

In [7]:
df1

Unnamed: 0,filename,text,label
0,doc_0.pdf,Data Science \nTraining Core (DSTC) \nResource...,academic institution
1,doc_1.pdf,AIM-AHEAD \nProgram for AI Readiness (PAIR)\nC...,professional medical organization
2,doc_2.pdf,2024 Al for \nHealth Equity \nSymposium \nAIM-...,academic institution
3,doc_3.pdf,NIH STRIDES Initiative\nCenter for Information...,professional medical organization
4,doc_4.pdf,Think-a-Thons\nNIH Endorsed \nCommon \nData ...,professional medical organization
...,...,...,...
1449,doc_1449.pdf,Global Multidisciplinary Perspectives Journal ...,academic institution
1450,doc_1450.pdf,Received 07/24/2024 \nReview began 08/13/2024 ...,academic institution
1451,doc_1451.pdf,The Use of Real-World Data for Personalized \n...,professional medical organization
1452,doc_1452.pdf,PROFESSIONAL DEVELOPMENT \nCOURSE CATALOG\nhim...,academic institution


In [None]:
# improt files
df_links = pd.read_csv("professional_only.csv").reset_index(drop=True)
df_labels = pd.read_csv("labeled_documents.csv").reset_index(drop=True)

# new dataset
df_new = pd.DataFrame({
    "Category": df_labels["label"],
    "PDF Link": df_links["PDF Link"]
})

# save to CSV
df_new.to_csv("final_professional.csv", index=False)


In [10]:
df_new

Unnamed: 0,Category,PDF Link
0,academic institution,https://www.aim-ahead.net/media/hd3fws1l/dstc-...
1,professional medical organization,https://www.aim-ahead.net/media/3cvd4x4g/pair-...
2,academic institution,https://www.aim-ahead.net/media/svfjvjr5/aihes...
3,professional medical organization,https://www.aim-ahead.net/media/5q3hmb5b/webin...
4,professional medical organization,https://www.aim-ahead.net/media/32ena1ts/intro...
...,...,...
1449,academic institution,https://www.multiperspectivesjournal.com/uploa...
1450,academic institution,https://www.cureus.com/articles/281861-a-revie...
1451,professional medical organization,https://www.inovigate.com/media/filer_public/6...
1452,academic institution,https://legacy.himss.org/sites/hde/files/media...


In [None]:
# Merge datasets
df_final = pd.read_csv("FINAL_dataset.csv")  # includes URL
df_statefed = pd.read_csv("cleaned_raw_pdfs.csv")  # includes 'PDF Link'
df_pro = pd.read_csv("final_professional.csv")     # includes 'PDF Link'

df_statefed = df_statefed.rename(columns={"PDF Link": "URL"})
df_pro = df_pro.rename(columns={"PDF Link": "URL"})

df_statefed_filtered = df_statefed[df_statefed["Category"].isin(["State", "Federal"])]

df_labels = pd.concat([df_statefed_filtered, df_pro], ignore_index=True)

df_labels = df_labels.drop_duplicates(subset="URL", keep="first")

df_final_merged = df_final.merge(df_labels[["URL", "Category"]], on="URL", how="left")


In [16]:
df_final_merged["Category"] = df_final_merged["Category"].str.title()
df_final_merged.to_csv("FINAL_dataset_with_Category_Capitalized.csv", index=False)

In [17]:
df_final_merged["Category"].value_counts()

Category
Professional Medical Organization    377
State                                342
Federal                              216
Academic Institution                 213
Error                                 40
Nonprofit                             19
Name: count, dtype: int64

In [27]:
df = pd.read_csv("FINAL_dataset_with_Category.csv")

In [28]:
df['Category'].value_counts()

Category
Professional Medical Organization    394
State                                342
Academic Institution                 234
Federal                              217
Nonprofit                             19
Error                                  1
Name: count, dtype: int64