In [None]:
import os
import fitz  # PyMuPDF
import shutil
import pandas as pd
import numpy as np
import nltk
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sentence_transformers import SentenceTransformer

# Fetch the NLTK stopword corpus (NLTK reports "already up-to-date" when it is cached)
nltk.download("stopwords")

# Sentence-embedding model used for both the training texts and the PDFs to classify.
# "all-MiniLM-L6-v2" is a small, fast general-purpose model.
bert_model = SentenceTransformer("all-MiniLM-L6-v2")

# Reference taxonomy: category name -> indicative keywords.
# NOTE: kept for reference only; nothing in the code shown here reads this mapping.
CATEGORIES = {
    "Whitepaper": [
        "whitepaper",
        "protocol",
        "consensus",
        "blockchain",
    ],
    "Research": [
        "research",
        "study",
        "academic",
        "analysis",
    ],
    "Regulatory": [
        "compliance",
        "regulation",
        "legal",
        "SEC",
        "AML",
        "KYC",
    ],
    "Project Documentation": [
        "API",
        "developer",
        "documentation",
        "smart contract",
    ],
    "Exchange Reports": [
        "trading",
        "market",
        "liquidity",
        "exchange",
    ],
    "ICO Documents": [
        "ICO",
        "IEO",
        "STO",
        "tokenomics",
        "fundraising",
    ],
    "NFT Reports": [
        "NFT",
        "non-fungible",
        "metaverse",
        "digital art",
    ],
    "Security Reports": [
        "security",
        "audit",
        "vulnerability",
        "hack",
    ],
    "DeFi Reports": [
        "DeFi",
        "liquidity pool",
        "yield farming",
    ],
    "Taxation Reports": [
        "tax",
        "accounting",
        "audit",
        "IRS",
    ],
}

# Function to extract text from a PDF
def extract_text(pdf_path):
    """Return the plain text of every page of the PDF at *pdf_path*.

    Page texts are joined with single spaces and the result is stripped of
    leading/trailing whitespace.

    This is deliberately best-effort: any error raised while opening or
    parsing the file is printed and swallowed, and whatever text was collected
    before the failure (possibly the empty string) is returned. Callers treat
    an empty result as "nothing usable in this file".

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text, or "" if the file could not be read or holds no text.
    """
    page_texts = []
    try:
        # The context manager closes the document even when a page fails to
        # parse, so a long batch run does not accumulate open file handles.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                page_texts.append(page.get_text("text"))
    except Exception as e:
        # Broad on purpose: one malformed PDF must not abort a batch run.
        print(f"Error reading {pdf_path}: {e}")
    return " ".join(page_texts).strip()

# Load PDFs and extract text
pdf_folder = "downloaded_pdfs"  # Change this to your folder path
data = []

# Collect the candidate files first:
#   * sorted, because os.listdir() returns entries in arbitrary order and the
#     row order of `df` should be reproducible between runs;
#   * extension matched case-insensitively so e.g. "REPORT.PDF" is not skipped;
#   * regular files only, so a directory named "something.pdf" is ignored.
pdf_filenames = sorted(
    name
    for name in os.listdir(pdf_folder)
    if name.lower().endswith(".pdf")
    and os.path.isfile(os.path.join(pdf_folder, name))
)

for filename in tqdm(pdf_filenames, desc="Extracting PDFs"):
    filepath = os.path.join(pdf_folder, filename)
    text = extract_text(filepath)
    # extract_text() returns "" for unreadable or text-free PDFs; leave those out.
    if text:
        data.append({"filename": filename, "text": text})

# Name the columns explicitly so that an empty result still has the expected
# schema and later `df["text"]` / `df["filename"]` lookups do not raise KeyError.
df = pd.DataFrame(data, columns=["filename", "text"])

# Manually label a small sample for training
training_data = [
    ("whitepaper-v3.pdf", "Whitepaper"),
    ("Cryptex_-_Staking_Report.pdf", "Report"),
    ("guide-to-regulation-on-cryptocurrency-and-digital-token.pdf", "Regulatory"),
    ("state-of-nft-marketplaces.pdf", "NFT Reports"),
]

train_df = pd.DataFrame(training_data, columns=["filename", "category"])

# Every label used here becomes an output folder name later on, so surface any
# label that is not part of the reference taxonomy instead of letting it pass silently.
unknown_labels = sorted(set(train_df["category"]) - set(CATEGORIES))
if unknown_labels:
    print(f"Warning: training labels not in CATEGORIES: {unknown_labels}")

# Reuse text that was already extracted into `df`; only parse the PDF again
# when the file is not present there.
extracted_text = dict(zip(df["filename"], df["text"])) if not df.empty else {}
train_df["text"] = train_df["filename"].apply(
    lambda x: extracted_text.get(x) or extract_text(os.path.join(pdf_folder, x))
)

# extract_text() returns "" for a missing or unreadable file. Embedding an empty
# string would give the classifier a meaningless example, so drop those rows
# before the labels are encoded.
unreadable = train_df["text"].str.len() == 0
if unreadable.any():
    print(
        "Skipping training files with no extractable text: "
        f"{train_df.loc[unreadable, 'filename'].tolist()}"
    )
    train_df = train_df[~unreadable].reset_index(drop=True)
if train_df.empty:
    raise ValueError("No labelled training PDFs could be read; cannot train the classifier.")

# Encode labels
label_encoder = LabelEncoder()
train_df["label"] = label_encoder.fit_transform(train_df["category"])

# Convert text to BERT embeddings
train_embeddings = np.array(bert_model.encode(train_df["text"].tolist(), convert_to_numpy=True))

# Train-test split
#
# A held-out evaluation only means something when every class appears on both
# sides of the split. With one labelled example per class, an unstratified
# split puts a class in the test set that the model never saw in training, so
# the reported accuracy is 0.00 by construction. Evaluate only when a
# stratified split is possible; otherwise say so and skip it.
labels = train_df["label"].to_numpy()
class_counts = np.bincount(labels)
n_classes = class_counts.size
# A stratified split needs at least one sample per class in the test set.
n_test = max(n_classes, int(np.ceil(0.2 * labels.size)))
can_evaluate = (
    n_classes > 0
    and class_counts.min() >= 2
    and labels.size - n_test >= n_classes
)

# Train Logistic Regression model
model = LogisticRegression()

if can_evaluate:
    X_train, X_test, y_train, y_test = train_test_split(
        train_embeddings,
        labels,
        test_size=n_test,
        random_state=42,
        stratify=labels,
    )
    model.fit(X_train, y_train)

    # Evaluate model
    y_pred = model.predict(X_test)
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
else:
    # Keep the usual names defined so later cells that inspect them still run.
    X_train, y_train = train_embeddings, labels
    X_test, y_test = train_embeddings[:0], labels[:0]
    y_pred = labels[:0]
    print(
        "Not enough labelled samples per class for a held-out evaluation; "
        "skipping accuracy and training on all labelled data."
    )

# The model used for prediction is fitted on every labelled sample: with so few
# examples, withholding any of them removes whole classes from the classifier.
model.fit(train_embeddings, labels)

# Predict categories for all PDFs
if df.empty:
    # Nothing to classify: create the output columns so later code that reads
    # them still works, without calling the model on an empty batch.
    df["embeddings"] = pd.Series(dtype=object)
    df["label"] = pd.Series(dtype=int)
    df["category"] = pd.Series(dtype=object)
else:
    # Encode and classify all documents in one batched call each instead of
    # one model invocation per row.
    all_embeddings = bert_model.encode(df["text"].tolist(), convert_to_numpy=True)
    # Keep one embedding vector per row, as before.
    df["embeddings"] = list(all_embeddings)
    df["label"] = model.predict(all_embeddings)
    df["category"] = label_encoder.inverse_transform(df["label"])

# Organize PDFs into categorized folders
output_folder = "classified_pdfs"
os.makedirs(output_folder, exist_ok=True)

# Create one sub-folder per predicted category.
for category in df["category"].unique():
    category_path = os.path.join(output_folder, category)
    os.makedirs(category_path, exist_ok=True)

# Move each PDF into its category folder. A failure on one file (already moved
# by an earlier run, locked by a viewer, ...) is reported and skipped so the
# remaining files are still organised instead of leaving the job half done.
failed_moves = []
for filename, category in zip(df["filename"], df["category"]):
    src_path = os.path.join(pdf_folder, filename)
    dst_path = os.path.join(output_folder, category, filename)
    try:
        shutil.move(src_path, dst_path)
    except OSError as e:  # shutil.Error is a subclass of OSError
        print(f"Could not move {src_path}: {e}")
        failed_moves.append(filename)

if failed_moves:
    print(f"{len(failed_moves)} PDF(s) could not be moved and remain in '{pdf_folder}'.")

print("Classification completed! PDFs are organized in 'classified_pdfs'.")


  from .autonotebook import tqdm as notebook_tqdm





[nltk_data] Downloading package stopwords to C:\Users\Mega
[nltk_data]     Pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Extracting PDFs:   7%|▋         | 51/770 [00:04<01:55,  6.23it/s]

MuPDF error: syntax error: could not parse color space (252 0 R)



Extracting PDFs: 100%|██████████| 770/770 [01:19<00:00,  9.72it/s]


Accuracy: 0.00
