In [3]:
import os
import pandas as pd
from pdfminer.high_level import extract_text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import joblib

# Seed vocabulary per document category. Insertion order is significant: it
# determines the order of CATEGORY_LABELS and of the training samples that
# train_model() derives from this mapping.
# NOTE(review): "audit" is listed under both "Security Reports" and
# "Taxation Reports", so that keyword is trained with two different labels.
CATEGORIES = {
    "Whitepaper": ["whitepaper", "protocol", "consensus", "blockchain"],
    "Research": ["research", "study", "academic", "analysis"],
    "Regulatory": ["compliance", "regulation", "legal", "SEC", "AML", "KYC"],
    "Project Documentation": ["API", "developer", "documentation", "smart contract"],
    "Exchange Reports": ["trading", "market", "liquidity", "exchange"],
    "ICO Documents": ["ICO", "IEO", "STO", "tokenomics", "fundraising"],
    "NFT Reports": ["NFT", "non-fungible", "metaverse", "digital art"],
    "Security Reports": ["security", "audit", "vulnerability", "hack"],
    "DeFi Reports": ["DeFi", "liquidity pool", "yield farming"],
    "Taxation Reports": ["tax", "accounting", "audit", "IRS"],
}

# Category names in declaration order (iterating a dict yields its keys).
CATEGORY_LABELS = list(CATEGORIES)

def extract_text_from_pdf(pdf_path):
    """Return the text of a PDF with surrounding whitespace removed.

    Extraction is best-effort: if ``extract_text`` (or the subsequent strip)
    raises anything, the error is printed and an empty string is returned so
    that one unreadable file does not abort a batch run.

    Args:
        pdf_path: Path of the PDF file handed to ``extract_text``.

    Returns:
        The stripped text, or ``""`` when extraction failed.
    """
    extracted = ""
    try:
        extracted = extract_text(pdf_path).strip()
    except Exception as err:
        print(f"Error extracting text from {pdf_path}: {err}")
    return extracted

def train_model(model_path="text_classifier.pkl"):
    """Fit a TF-IDF + logistic-regression classifier on the seed keywords and save it.

    Every keyword in ``CATEGORIES`` becomes one training sample labelled with
    the category it is listed under, in the mapping's iteration order. The
    fitted ``(vectorizer, model)`` pair is written with ``joblib.dump``.

    Args:
        model_path: Destination file for the serialized pair. Defaults to
            ``"text_classifier.pkl"``, the name that was previously
            hard-coded, so existing zero-argument calls are unaffected.

    Returns:
        None. The result is the file written at ``model_path`` plus a
        confirmation line on stdout.
    """
    # One (keyword, category) pair per seed keyword.
    samples = [
        (keyword, category)
        for category, keywords in CATEGORIES.items()
        for keyword in keywords
    ]
    data = [keyword for keyword, _ in samples]
    labels = [category for _, category in samples]

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(data)
    model = LogisticRegression()
    model.fit(X, labels)

    # Vectorizer and model are stored together because prediction needs both.
    joblib.dump((vectorizer, model), model_path)
    print("Model trained and saved.")

# Cache of loaded models: path -> ((mtime_ns, size), loaded object).
_MODEL_CACHE = {}


def load_model(model_path="text_classifier.pkl"):
    """Return the saved ``(vectorizer, model)`` pair, reading the file at most once.

    ``classify_text`` calls this for every document, so without a cache the
    file is deserialized once per PDF. The loaded object is cached per path
    and reloaded only when the file's modification time or size changes
    (for example after retraining).

    Args:
        model_path: File written by ``joblib.dump``. Defaults to
            ``"text_classifier.pkl"``, the name that was previously
            hard-coded.

    Returns:
        Whatever ``joblib.load`` returns for the file; the same object is
        returned on repeated calls while the file is unchanged.

    Raises:
        OSError: If ``model_path`` does not exist or cannot be stat'ed.
    """
    # SECURITY: joblib.load unpickles the file, which can execute arbitrary
    # code. Only load model files that this notebook produced itself.
    info = os.stat(model_path)
    fingerprint = (info.st_mtime_ns, info.st_size)

    entry = _MODEL_CACHE.get(model_path)
    if entry is None or entry[0] != fingerprint:
        entry = (fingerprint, joblib.load(model_path))
        _MODEL_CACHE[model_path] = entry
    return entry[1]

def classify_text(text):
    """Predict the category label for a piece of text.

    Args:
        text: Document text to classify. ``None``, an empty string and
            whitespace-only strings are treated as "no content".

    Returns:
        The label predicted by the saved model, or ``"Unknown"`` when there
        is nothing to classify: either the text is empty/blank, or none of
        its tokens are in the vectorizer's vocabulary.
    """
    if not text or not text.strip():
        return "Unknown"

    vectorizer, model = load_model()
    X = vectorizer.transform([text])

    # An all-zero feature vector carries no evidence for any category; the
    # model would then pick a label from its intercepts alone, so report
    # "Unknown" instead of an arbitrary class.
    if X.nnz == 0:
        return "Unknown"
    return model.predict(X)[0]

def classify_pdfs_in_folder(folder_path):
    """Extract and classify every PDF directly inside ``folder_path``.

    Files are selected by a case-insensitive ``.pdf`` extension (so ``.PDF``
    is included) and processed in case-insensitive name order, which makes
    the row order reproducible regardless of how the OS lists the directory.

    Args:
        folder_path: Directory to scan; sub-directories are not descended.

    Returns:
        A DataFrame with one row per PDF and the columns ``"File Name"``,
        ``"Content"`` (extracted text) and ``"Category"`` (predicted label).
        The frame is also printed before being returned.
    """
    data = []

    # os.listdir returns entries in arbitrary order; sort for determinism.
    # The (lowercase, original) key orders names case-insensitively and
    # breaks ties between names that differ only in case.
    entries = sorted(os.listdir(folder_path), key=lambda name: (name.lower(), name))
    for file_name in entries:
        if not file_name.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(folder_path, file_name)
        content = extract_text_from_pdf(pdf_path)
        data.append([file_name, content, classify_text(content)])

    df = pd.DataFrame(data, columns=["File Name", "Content", "Category"])
    print(df)
    return df


# Train the keyword classifier only when no saved model file is present;
# an existing "text_classifier.pkl" is reused as-is.
if not os.path.exists("text_classifier.pkl"):
    train_model()

# Classify every PDF in the download folder; `df` is used by later cells.
folder_path = "downloaded_pdfs"
df = classify_pdfs_in_folder(folder_path)

The PDF <_io.BufferedReader name='downloaded_pdfs\\FDUSD-Whitepaper-25216064ca0cc8.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


                                             File Name  \
0         0100258fa8b4509511de397d0a727800a4bc0e68.pdf   
1         01ca8da6bc1ed65c62f953f91407fc041a9766ab.pdf   
2         022636bb98491e153625895419bdfda50f2b5219.pdf   
3         0275b8b9a756586537888872446855594bc59e68.pdf   
4         02b2a7b96372e79cf822ae9a60129789a4efd6c4.pdf   
..                                                 ...   
762                         wpiea2023144-print-pdf.pdf   
763                                   yellow-paper.pdf   
764                         YGG-Whitepaper-English.pdf   
765  zkEVM_and_the_Future_of_Ethereum_Scaling_Stefa...   
766                              zkLink-whitepaper.pdf   

                                               Content               Category  
0    Protocol Reporting\nApril 26, 2023\nMATIC\n\nS...             Whitepaper  
1    Protocol Reporting\nMay 31, 2023\nStacks (STX)...             Whitepaper  
2    December 1, 2024\n\nVenus, (XVS)\n\nQuarterly ...         

In [4]:
# Number of classified PDFs per predicted category (one row of `df` per file).
df["Category"].value_counts()

Category
Whitepaper               242
Exchange Reports         238
Security Reports          65
NFT Reports               49
Project Documentation     48
Research                  44
Regulatory                40
DeFi Reports              20
Unknown                   14
ICO Documents              6
Taxation Reports           1
Name: count, dtype: int64