In [1]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

In [2]:
class ExtractiveQA:
    def __init__(self, model_name="bert-large-uncased-whole-word-masking-finetuned-squad"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForQuestionAnswering.from_pretrained(model_name)
        
    def get_answer(self, question, context):
        # Tokenize input text
        inputs = self.tokenizer.encode_plus(
            question,
            context,
            add_special_tokens=True,
            return_tensors="pt"
        )
        
        # Get model predictions
        with torch.no_grad():
            outputs = self.model(**inputs)
            answer_start = torch.argmax(outputs.start_logits)
            answer_end = torch.argmax(outputs.end_logits)
            
        # Convert token positions to character positions
        tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        answer = tokens[answer_start:answer_end + 1]
        
        # Clean up answer by removing special tokens and combining wordpieces
        cleaned_answer = self.tokenizer.convert_tokens_to_string(answer)
        
        # Get confidence scores
        start_confidence = torch.max(torch.softmax(outputs.start_logits, dim=1)).item()
        end_confidence = torch.max(torch.softmax(outputs.end_logits, dim=1)).item()
        confidence = (start_confidence + end_confidence) / 2
        
        return {
            "answer": cleaned_answer,
            "confidence": confidence,
            "start_position": answer_start.item(),
            "end_position": answer_end.item()
        }

In [4]:
context = """
The Python programming language was created by Guido van Rossum and was released in 1991. 
Python is known for its simple syntax and readability. It has become one of the most popular 
programming languages for data science, machine learning, and web development.
"""

question1 = "Who created Python?"
question2 = "What is Python best used for?"

# Initialize QA system
qa_system = ExtractiveQA()

# Get answer 1
result1 = qa_system.get_answer(question1, context)
print(f"Question: {question1}")
print(f"Answer: {result1['answer']}")
print(f"Confidence: {result1['confidence']:.2%}")

# Get answer 2
result2 = qa_system.get_answer(question2, context)
print(f"Question: {question2}")
print(f"Answer: {result2['answer']}")
print(f"Confidence: {result2['confidence']:.2%}")



Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Question: Who created Python?
Answer: guido van rossum
Confidence: 99.91%
Question: What is Python best used for?
Answer: data science, machine learning, and web development
Confidence: 91.54%


# Test on Chunks

In [11]:
summary = """
Summary:
Air pollution promotes lung cancer by inducing inflammation and expanding pre-existing oncogenic mutations. Particulate matter (PM2.5) exposure correlates with increased EGFR-driven lung cancer incidence across countries. PM2.5 triggers macrophage-derived interleukin-1β release, promoting a progenitor-like state in EGFR-mutant lung cells and accelerating tumor formation. Oncogenic EGFR and KRAS mutations were found in 18% and 53% of healthy lung samples, respectively, suggesting air pollutants may promote expansion of existing mutant clones rather than directly causing mutations.
"""

chunk_caps = """
Summary:
Air pollution promotes lung cancer by inducing inflammation and expanding pre-existing oncogenic mutations. Particulate matter (PM2.5) exposure correlates with increased EGFR-driven lung cancer incidence across countries. PM2.5 triggers macrophage-derived interleukin-1β release, promoting a progenitor-like state in EGFR-mutant lung cells and accelerating tumor formation. Oncogenic EGFR and KRAS mutations were found in 18% and 53% of healthy lung samples, respectively, suggesting air pollutants may promote expansion of existing mutant clones rather than directly causing mutations.

Annotations:
Text - GFR 
Type - Gene
NCBI Gene - 13649
Text - RAS 
Type - Gene
NCBI Gene - 16653
Text - mouse
Type - Species
NCBI Taxonomy - 10090
Text - EGFR
Type - Gene
NCBI Gene - 13649
Text - KRAS
Type - Gene
NCBI Gene - 16653

Text:
A complete understanding of how exposure to environmental substances promotes cancer formation is lacking. More than 70 years ago, tumorigenesis was proposed to occur in a two-step process: an initiating step that induces mutations in healthy cells, followed by a promoter step that triggers cancer development1. Here we propose that environmental particulate matter measuring ≤2.5 μm (PM2.5), known to be associated with lung cancer risk, promotes lung cancer by acting on cells that harbour pre-existing oncogenic mutations in healthy lung tissue. Focusing on EGFR-driven lung cancer, which is more common in never-smokers or light smokers, we found a significant association between PM2.5 levels and the incidence of lung cancer for 32,957 EGFR driven lung cancer cases in four within-country cohorts. Functional mouse models revealed that air pollutants cause an influx of macrophages into the lung and release of interleukin-1β. This process results in a progenitor-like cell state within EGFR mutant lung alveolar type II epithelial cells that fuels tumorigenesis. Ultradeep mutational profiling of histologically normal lung tissue from 295 individuals across 3 clinical cohorts revealed oncogenic EGFR and KRAS driver mutations in 18% and 53% of healthy tissue samples, respectively. These findings collectively support a tumour promoting role for PM2.5 air pollutants and provide impetus for public health policy initiatives to address air pollution to reduce disease burden.      
"""

question1 = "lung cancer and air pollution"
question2 = "EGFR mutation frequency healthy lung tissue"

# Initialize QA system
qa_system = ExtractiveQA()

# Get answer 1
result1 = qa_system.get_answer(question1, chunk_caps)
print(f"Question: {question1}")
print(f"Answer: {result1['answer']}")
print(f"Confidence: {result1['confidence']:.2%}")

# Get answer 2
result2 = qa_system.get_answer(question2, chunk_caps)
print(f"Question: {question2}")
print(f"Answer: {result2['answer']}")
print(f"Confidence: {result2['confidence']:.2%}")



Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Token indices sequence length is longer than the specified maximum sequence length for this model (525 > 512). Running this sequence through the model will result in indexing errors


RuntimeError: The size of tensor a (525) must match the size of tensor b (512) at non-singleton dimension 1

In [None]:
import xml.etree.ElementTree as ET
from typing import List, Dict, Generator, Tuple
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForQuestionAnswering
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class BioCXMLParser:
    """Parser for BioC XML files with chunking capabilities"""
    
    def __init__(self, chunk_size: int = 512):
        self.chunk_size = chunk_size
    
    def parse_bioc_file(self, file_path: str) -> Generator[Dict, None, None]:
        """Parse BioC XML file and yield documents with their annotations"""
        for event, elem in ET.iterparse(file_path, events=("start", "end")):
            if event == "end" and elem.tag == "document":
                doc = self._process_document(elem)
                yield doc
                elem.clear()  # Clear memory
    
    def _process_document(self, doc_elem) -> Dict:
        """Process a single document element"""
        passages = []
        annotations = []
        
        # Extract passages and their annotations
        for passage in doc_elem.findall(".//passage"):
            text = passage.find("text").text
            passage_annotations = []
            
            for annotation in passage.findall(".//annotation"):
                anno_text = annotation.find("text").text
                locations = [(loc.get("offset"), loc.get("length")) 
                           for loc in annotation.findall("location")]
                passage_annotations.append({
                    "text": anno_text,
                    "locations": locations
                })
            
            passages.append(text)
            annotations.extend(passage_annotations)
        
        return {
            "id": doc_elem.find("id").text,
            "passages": passages,
            "annotations": annotations
        }
    
    def chunk_text(self, text: str, overlap: int = 100) -> List[str]:
        """Create overlapping chunks of text"""
        chunks = []
        start = 0
        
        while start < len(text):
            end = min(start + self.chunk_size, len(text))
            
            # Try to end at a sentence boundary
            if end < len(text):
                # Look for sentence endings (.!?) followed by space
                for i in range(end-1, max(start, end-50), -1):
                    if text[i] in ".!?" and (i+1 >= len(text) or text[i+1].isspace()):
                        end = i + 1
                        break
            
            chunks.append(text[start:end])
            start = end - overlap
            
        return chunks

class BiomedicalQA:
    """Biomedical Question Answering System with semantic search"""
    
    def __init__(self, 
                 qa_model_name: str = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
                 embedding_model_name: str = "pritamdeka/S-PubMedBert-MS-MARCO"):
        # QA model for answer extraction
        self.qa_tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
        self.qa_model = AutoModelForQuestionAnswering.from_pretrained(qa_model_name)
        
        # Embedding model for semantic search
        self.embedding_model = SentenceTransformer(embedding_model_name)
        
        # Initialize parser
        self.parser = BioCXMLParser()
    
    def get_relevant_chunks(self, 
                          question: str, 
                          chunks: List[str], 
                          top_k: int = 3) -> List[Tuple[str, float]]:
        """Find most relevant chunks using semantic search"""
        # Get embeddings
        question_embedding = self.embedding_model.encode([question])[0]
        chunk_embeddings = self.embedding_model.encode(chunks)
        
        # Calculate similarities
        similarities = cosine_similarity([question_embedding], chunk_embeddings)[0]
        
        # Get top-k chunks
        top_indices = np.argsort(similarities)[-top_k:][::-1]
        return [(chunks[i], similarities[i]) for i in top_indices]
    
    def get_answer(self, question: str, context: str) -> Dict:
        """Extract answer from context"""
        inputs = self.qa_tokenizer.encode_plus(
            question,
            context,
            add_special_tokens=True,
            return_tensors="pt",
            max_length=512,
            truncation=True
        )
        
        with torch.no_grad():
            outputs = self.qa_model(**inputs)
            start_scores = outputs.start_logits
            end_scores = outputs.end_logits
            
            start_idx = torch.argmax(start_scores)
            end_idx = torch.argmax(end_scores)
            
            tokens = self.qa_tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
            answer_tokens = tokens[start_idx:end_idx + 1]
            answer = self.qa_tokenizer.convert_tokens_to_string(answer_tokens)
            
            confidence = (torch.max(torch.softmax(start_scores, dim=1)) + 
                        torch.max(torch.softmax(end_scores, dim=1))).item() / 2
            
        return {
            "answer": answer,
            "confidence": confidence,
            "context": context
        }
    
    def process_file(self, file_path: str, question: str) -> List[Dict]:
        """Process entire BioC XML file and find answers"""
        answers = []
        
        for doc in self.parser.parse_bioc_file(file_path):
            # Combine passages into single text
            full_text = " ".join(doc["passages"])
            
            # Create chunks
            chunks = self.parser.chunk_text(full_text)
            
            # Get relevant chunks
            relevant_chunks = self.get_relevant_chunks(question, chunks)
            
            # Get answers from relevant chunks
            for chunk, similarity in relevant_chunks:
                answer = self.get_answer(question, chunk)
                answer["document_id"] = doc["id"]
                answer["semantic_similarity"] = similarity
                answers.append(answer)
        
        # Sort by combined confidence and semantic similarity
        answers.sort(key=lambda x: (x["confidence"] + x["semantic_similarity"]) / 2, reverse=True)
        return answers

def main():
    # Example usage
    qa_system = BiomedicalQA()
    
    # Process a BioC XML file
    file_path = "path/to/your/bioc_file.xml"
    question = "What are the side effects of aspirin?"
    
    answers = qa_system.process_file(file_path, question)
    
    # Print top answer
    if answers:
        top_answer = answers[0]
        print(f"Question: {question}")
        print(f"Answer: {top_answer['answer']}")
        print(f"Confidence: {top_answer['confidence']:.2%}")
        print(f"Semantic Similarity: {top_answer['semantic_similarity']:.2%}")
        print(f"Document ID: {top_answer['document_id']}")
    else:
        print("No answers found.")

if __name__ == "__main__":
    main()

In [10]:
from typing import Dict, Tuple, List
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from IPython.display import HTML
import re

class EnhancedBiomedicalQA:
    def __init__(self, model_name: str = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForQuestionAnswering.from_pretrained(model_name)
        
        # Get all special tokens
        self.special_token_ids = set(self.tokenizer.all_special_ids)
        
        # Dictionary of special tokens and their meanings
        self.special_tokens = {
            '[CLS]': 'Classification token (start of sequence)',
            '[SEP]': 'Separator token (end of sequence/between sequences)',
            '[PAD]': 'Padding token',
            '[UNK]': 'Unknown token',
            '[MASK]': 'Masked token for MLM tasks'
        }
    
    def is_special_token_id(self, token_id: int) -> bool:
        """Check if a token ID is a special token"""
        return token_id in self.special_token_ids
    
    def clean_answer(self, answer: str) -> str:
        """Remove special tokens and clean up the answer"""
        # Remove all special tokens
        for special_token in self.special_tokens.keys():
            answer = answer.replace(special_token, '')
        
        # Clean up whitespace
        answer = re.sub(r'\s+', ' ', answer).strip()
        return answer
    
    def get_answer_with_constraints(self, 
                                  question: str, 
                                  context: str,
                                  min_length: int = 10,
                                  max_length: int = 100,
                                  min_confidence: float = 0.2) -> Dict:
        """Get answer with length constraints and confidence threshold"""
        # Tokenize input
        inputs = self.tokenizer.encode_plus(
            question,
            context,
            add_special_tokens=True,
            return_tensors="pt",
            return_offsets_mapping=True,
            max_length=512,
            truncation=True
        )
        
        # Get token offsets for mapping back to original text
        offset_mapping = inputs.pop("offset_mapping")[0]
        
        with torch.no_grad():
            outputs = self.model(**inputs)
            
            # Get probability distributions
            start_probs = torch.softmax(outputs.start_logits, dim=1)[0]
            end_probs = torch.softmax(outputs.end_logits, dim=1)[0]
            
            # Find valid start-end pairs
            valid_answers = []
            for start_idx in range(len(start_probs)):
                for end_idx in range(start_idx, min(len(end_probs), start_idx + max_length)):
                    # Skip special tokens
                    if self.is_special_token_id(inputs['input_ids'][0][start_idx].item()) or \
                       self.is_special_token_id(inputs['input_ids'][0][end_idx].item()):
                        continue
                    
                    # Calculate score for this span
                    score = start_probs[start_idx] * end_probs[end_idx]
                    
                    # Get answer text
                    answer_tokens = inputs['input_ids'][0][start_idx:end_idx + 1]
                    answer_text = self.tokenizer.decode(answer_tokens, skip_special_tokens=True)
                    cleaned_answer = self.clean_answer(answer_text)
                    
                    # Check constraints
                    if len(cleaned_answer.split()) >= min_length / 5 and \
                       score >= min_confidence and \
                       cleaned_answer:
                        # Get character positions in original text
                        char_start = offset_mapping[start_idx][0].item()
                        char_end = offset_mapping[end_idx][1].item()
                        
                        valid_answers.append({
                            'answer': cleaned_answer,
                            'confidence': score.item(),
                            'char_start': char_start,
                            'char_end': char_end
                        })
            
            # Sort by confidence and get best answer
            valid_answers.sort(key=lambda x: x['confidence'], reverse=True)
            
            if not valid_answers:
                return {
                    'answer': '',
                    'confidence': 0.0,
                    'char_start': -1,
                    'char_end': -1,
                    'error': 'No valid answer found meeting constraints'
                }
            
            return valid_answers[0]
    
    def highlight_answer(self, context: str, char_start: int, char_end: int) -> str:
        """Create HTML with highlighted answer in context"""
        if char_start < 0 or char_end < 0:
            return context
        
        highlighted = (
            context[:char_start] +
            f'<span style="background-color: yellow">{context[char_start:char_end]}</span>' +
            context[char_end:]
        )
        return highlighted

def main():
    # Example usage
    qa = EnhancedBiomedicalQA()
    
    # Example context and question
    context = """
Annotations:
Text - GFR 
Type - Gene
NCBI Gene - 13649
Text - RAS 
Type - Gene
NCBI Gene - 16653
Text - mouse
Type - Species
NCBI Taxonomy - 10090
Text - EGFR
Type - Gene
NCBI Gene - 13649
Text - KRAS
Type - Gene
NCBI Gene - 16653

Text:
A complete understanding of how exposure to environmental substances promotes cancer formation is lacking. More than 70 years ago, tumorigenesis was proposed to occur in a two-step process: an initiating step that induces mutations in healthy cells, followed by a promoter step that triggers cancer development1. Here we propose that environmental particulate matter measuring ≤2.5 μm (PM2.5), known to be associated with lung cancer risk, promotes lung cancer by acting on cells that harbour pre-existing oncogenic mutations in healthy lung tissue. Focusing on EGFR-driven lung cancer, which is more common in never-smokers or light smokers, we found a significant association between PM2.5 levels and the incidence of lung cancer for 32,957 EGFR driven lung cancer cases in four within-country cohorts. Functional mouse models revealed that air pollutants cause an influx of macrophages into the lung and release of interleukin-1β. This process results in a progenitor-like cell state within EGFR mutant lung alveolar type II epithelial cells that fuels tumorigenesis. Ultradeep mutational profiling of histologically normal lung tissue from 295 individuals across 3 clinical cohorts revealed oncogenic EGFR and KRAS driver mutations in 18% and 53% of healthy tissue samples, respectively. These findings collectively support a tumour promoting role for PM2.5 air pollutants and provide impetus for public health policy initiatives to address air pollution to reduce disease burden.      
    """
    
    question = "EGFR mutation frequency healthy lung tissue"
    
    # Get answer with constraints
    result = qa.get_answer_with_constraints(
        question=question,
        context=context,
        min_length=10,  # Minimum 10 characters
        max_length=100,  # Maximum 100 characters
        min_confidence=0.2  # Minimum 20% confidence
    )
    
    print(f"Question: {question}")
    print(f"Answer: {result['answer']}")
    print(f"Confidence: {result['confidence']:.2%}")
    
    if result['char_start'] >= 0:
        print("\nHighlighted context:")
        print(result['char_start'], result['char_end'])
        highlighted = qa.highlight_answer(
            context=context,
            char_start=result['char_start'],
            char_end=result['char_end']
        )
        print(highlighted)

main()

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Question: EGFR mutation frequency healthy lung tissue
Answer: 
Confidence: 0.00%
