In [None]:
import os
import pdfplumber
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_openai import AzureChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from typing import List, Optional

# 1. Read Crypto White Paper PDFs

In [None]:
# Restore `pdf_file` from IPython's %store database. It must have been saved
# earlier with `%store pdf_file` (presumably by an upstream notebook - confirm);
# otherwise the name stays undefined and the extraction cell below fails.
%store -r pdf_file

In [None]:
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF into one string.

    Args:
        pdf_file: Passed straight to ``pdfplumber.open``.

    Returns:
        str: The text of each page, in page order, each followed by a newline.
        An empty string is returned when no page yields any non-whitespace
        text (including a PDF with no pages), so callers can rely on a plain
        truthiness check to detect "nothing extracted".

    Raises:
        RuntimeError: If the PDF cannot be opened or read. The original
            exception is chained as ``__cause__``.
    """
    try:
        with pdfplumber.open(pdf_file) as pdf:
            # extract_text() can return None for a page without extractable
            # text; treat that as an empty page instead of letting
            # `None + "\n"` surface as a misleading "failed to read" error.
            page_texts = [page.extract_text() or "" for page in pdf.pages]
    except Exception as e:
        raise RuntimeError(f"Failed to read PDF file: {e}") from e

    if not any(page_text.strip() for page_text in page_texts):
        return ""
    return "".join(page_text + "\n" for page_text in page_texts)

# `pdf_file` is restored from the IPython store in an earlier cell
# (`%store -r pdf_file`). For a manual run it can be set by hand instead,
# e.g. os.path.join("whitepapers", "whitepaper_abjcoin.pdf").
extracted_text = extract_text_from_pdf(pdf_file)

In [None]:
# Split the extracted text into overlapping chunks for per-chunk LLM extraction.
# The guard also rejects whitespace-only text (e.g. "\n\n" from a PDF whose pages
# have no text), which a bare truthiness check would let through.
if extracted_text and extracted_text.strip():
    document = Document(page_content=extracted_text, metadata={"source": pdf_file})
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        separators=["\n\n", "\n", " ", ""],
    )
    chunks = text_splitter.split_documents([document])

    # Preview the first five chunks. Keep the loop variable name `chunk`: it
    # stays bound after the loop and other cells may read it.
    for i, chunk in enumerate(chunks[:5]):
        print(f"\n--- Chunk {i+1} ---")
        print(f"{chunk.page_content[:100]}...")
        print(f"Metadata: {chunk.metadata}\n")
        print(chunk.page_content, "\n")
        print("-" * 30)
else:
    raise ValueError("No text extracted from the PDF file.")

# 2. Set Up LLM

In [71]:
# Load settings via python-dotenv before reading them from the environment.
load_dotenv()

# MODEL is used as the Azure deployment name when the client is built.
MODEL = "gpt-4o-mini"
AZURE_ENDPOINT = os.environ.get("AZURE_ENDPOINT")
API_VERSION = os.environ.get("API_VERSION")
API_KEY = os.environ.get("API_KEY")

# Fail fast when either mandatory credential is missing or empty.
if not (AZURE_ENDPOINT and API_KEY):
    raise ValueError("AZURE_ENDPOINT and API_KEY environment variables must be set.")

In [72]:
# Shared chat client used by every extraction call in this notebook.
llm = AzureChatOpenAI(
    azure_endpoint=AZURE_ENDPOINT,
    azure_deployment=MODEL,
    openai_api_key=API_KEY,
    openai_api_version=API_VERSION,
    temperature=0,  # For consistent structured output
)

In [75]:
# define Pydantic Schemas

# Project-level facts extracted from a white paper chunk. Every field is
# optional (or defaults to an empty list) so a chunk that mentions none of them
# still yields a valid, empty record.
# NOTE: documented with comments rather than a class docstring on purpose; a
# docstring would become part of the generated schema.
class CryptoProjectInfo(BaseModel):
    project_name: Optional[str] = Field(
        default=None,
        description="Full name of the cryptocurrency project.",
    )
    ticker_symbol: Optional[str] = Field(
        default=None,
        description="Abbreviated ticker symbol (e.g., BTC, ETH, ABJC).",
    )
    consensus_mechanism: Optional[str] = Field(
        default=None,
        description="Consensus mechanism used (e.g., 'Proof of Work', 'Proof of Stake').",
    )
    key_features: List[str] = Field(
        default_factory=list,
        description="Unique features or innovations of the project.",
    )

# Token-related facts extracted from a white paper chunk.
# All fields are Optional: a `str` field with a None default is never validated
# on construction, but `Tokenomics.model_validate(obj.model_dump())` then rejects
# the None, which broke the merge step for chunks without a token description.
# NOTE: documented with comments rather than a class docstring on purpose; a
# docstring would become part of the generated schema.
class Tokenomics(BaseModel):
    token_description: Optional[str] = Field(default=None, description="Description of the token")
    # Description now matches the field name; it previously described distribution only.
    total_supply: Optional[str] = Field(default=None, description="Total supply of tokens and, if stated, how they are distributed.")
    utility: Optional[str] = Field(default=None, description="Primary use cases or utility of the token.")

# One team member extracted from a white paper chunk.
# `name` is Optional: a `str` field with a None default is never validated on
# construction, but `TeamMember.model_validate(obj.model_dump())` then rejects
# the None, which broke the merge step for chunks that mention no team member.
# NOTE: documented with comments rather than a class docstring on purpose; a
# docstring would become part of the generated schema.
class TeamMember(BaseModel):
    name: Optional[str] = Field(default=None, description="Name of the team member.")
    role: Optional[str] = Field(default=None, description="Role or title of the team member.")
    responsibilities: Optional[str] = Field(default=None, description="Key responsibilities or contributions of the team member.")

In [None]:
def extract_info_from_chunk(chunk, output_schema):
    """Run structured extraction over document chunks and collect the results.

    Args:
        chunk: A list or tuple of chunk objects exposing ``page_content``;
            each one is sent to the model in turn.
            Legacy call style: any other value (for example a single chunk)
            is ignored and the notebook-level ``chunks`` list is processed
            instead. This is what the function always did, because its loop
            used to shadow this parameter.
        output_schema: Schema class handed to ``llm.with_structured_output``.
            Each result is converted to a dict with ``model_dump()``.

    Returns:
        list[dict]: One dict per chunk that was extracted successfully, in
        chunk order. A chunk whose extraction raises is reported with
        ``print`` and skipped.
    """
    if isinstance(chunk, (list, tuple)):
        documents = chunk
    else:
        # Backward compatibility with callers that pass a single chunk.
        documents = chunks

    extraction_chain = llm.with_structured_output(output_schema)

    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are an expert at extracting structured information from cryptocurrency whitepapers. Extract the following details about the project mentioned in the text. Focus only on information directly present."),
        ("human", "Extract information about the crypto project from the following text:\n\n{text}")
    ])

    chain_with_prompt = prompt | extraction_chain

    extracted_data_list = []
    for position, document in enumerate(documents, start=1):
        try:
            # invoke the LangChain structured output chain
            extracted_info = chain_with_prompt.invoke({"text": document.page_content})
            # convert Pydantic model to dictionary
            extracted_data_list.append(extracted_info.model_dump())
        except Exception as e:
            # Best effort: one failing chunk must not abort the whole run.
            print(f"Error extracting from chunk {position}: {e}")

    return extracted_data_list

In [77]:
# Collect project-level facts from every chunk and merge them into one record.
final_output = dict()

final_project_info = CryptoProjectInfo()
# Pass the whole `chunks` list. The loop variable `chunk` left over from the
# preview loop is hidden state and is undefined when that loop never ran.
extracted_data_list = extract_info_from_chunk(chunks, CryptoProjectInfo)

for data in extracted_data_list:
    current_pydantic_model = CryptoProjectInfo.model_validate(data)

    # Scalar fields: the first non-empty value seen across chunks wins.
    for field_name in ("project_name", "ticker_symbol", "consensus_mechanism"):
        value = getattr(current_pydantic_model, field_name)
        if value and not getattr(final_project_info, field_name):
            setattr(final_project_info, field_name, value)

    # List fields: append values not seen yet, keeping first-seen order.
    for feature in current_pydantic_model.key_features:
        if feature not in final_project_info.key_features:
            final_project_info.key_features.append(feature)

final_output.update(final_project_info.model_dump())

In [78]:
# Collect token-related facts from every chunk and merge them into one record.
final_tokenomics_info = Tokenomics()
# Pass the whole `chunks` list. The loop variable `chunk` left over from the
# preview loop is hidden state and is undefined when that loop never ran.
extracted_data_list2 = extract_info_from_chunk(chunks, Tokenomics)

for data in extracted_data_list2:
    current_pydantic_model = Tokenomics.model_validate(data)

    # The first non-empty value seen across chunks wins for each field.
    for field_name in ("token_description", "total_supply", "utility"):
        value = getattr(current_pydantic_model, field_name)
        if value and not getattr(final_tokenomics_info, field_name):
            setattr(final_tokenomics_info, field_name, value)

final_output.update(final_tokenomics_info.model_dump())

In [79]:
# Collect team information from every chunk and merge it into one record.
final_team_members = TeamMember()
# Pass the whole `chunks` list. The loop variable `chunk` left over from the
# preview loop is hidden state and is undefined when that loop never ran.
extracted_data_list3 = extract_info_from_chunk(chunks, TeamMember)

for data in extracted_data_list3:
    current_pydantic_model = TeamMember.model_validate(data)

    # The first non-empty value seen across chunks wins for each field.
    # NOTE(review): fields are merged independently, so the result can combine
    # the name of one person with the role of another, and only one member is
    # kept. Storing a list of members would change the `final_output` keys, so
    # it is left as is here - confirm what downstream consumers expect.
    for field_name in ("name", "role", "responsibilities"):
        value = getattr(current_pydantic_model, field_name)
        if value and not getattr(final_team_members, field_name):
            setattr(final_team_members, field_name, value)

final_output.update(final_team_members.model_dump())

In [None]:
# Persist the merged `final_output` dict in IPython's %store database so another
# notebook or kernel session can restore it with `%store -r final_output`.
%store final_output