In [None]:
# Suppress Hugging Face tokenizer parallelism and common warnings for cleaner output
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Advanced GenAI Features Demo
This notebook demonstrates advanced GenAI, NLP, and LLM features for recruiter-ready healthcare AI/data science portfolios.

**Features Demonstrated:**
- Entity extraction and classification with BERT/Bio_ClinicalBERT (Hugging Face Transformers)
- Retrieval-Augmented Generation (RAG) pipeline
- Vector database integration (FAISS/Chroma)
- Prompt engineering and finetuning
- Bias detection, model guardrails, and safety checks
- Cloud integration (AWS, S3, cloud ML workflows)
- PEFT/SFT advanced finetuning (Hugging Face PEFT)
Each section includes code, workflow explanation, and practical tips for production and portfolio use.

## 1. Entity Extraction & Classification with Transformers
This section demonstrates how to use BERT/Bio_ClinicalBERT and Hugging Face Transformers for entity extraction and classification in clinical text.

In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Load Bio_ClinicalBERT model and tokenizer
model_checkpoint = 'emilyalsentzer/Bio_ClinicalBERT'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=5)  # Example: 5 labels

labels = ['O', 'B-DISEASE', 'I-DISEASE', 'B-SYMPTOM', 'I-SYMPTOM']

def get_entities(text, model, tokenizer, labels):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    entities = []
    current_entity = None
    for token, pred in zip(tokens, predictions):
        label = labels[pred]
        if label.startswith('B-'):
            if current_entity:
                entities.append(current_entity)
            current_entity = {'entity': label[2:], 'text': token.replace('##', '')}
        elif label.startswith('I-') and current_entity:
            current_entity['text'] += token.replace('##', '')
        else:
            if current_entity:
                entities.append(current_entity)
                current_entity = None
    if current_entity:
        entities.append(current_entity)
    return entities

# Example clinical notes
notes = [
    'Patient reports chest pain and shortness of breath. History of hypertension.',
    'Diabetic patient with fatigue and nausea. No chest pain.'
 ]

for i, note in enumerate(notes):
    ents = get_entities(note, model, tokenizer, labels)
    print(f'Note {i+1}:', note)
    for ent in ents:
        print(f"  Entity: {ent['text']} | Type: {ent['entity']}")
    print()

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForTokenClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Note 1: Patient reports chest pain and shortness of breath. History of hypertension.
  Entity: [CLS] | Type: DISEASE
  Entity: patient | Type: DISEASE
  Entity: reports | Type: SYMPTOM
  Entity: andshort | Type: SYMPTOM
  Entity: ness | Type: SYMPTOM
  Entity: of | Type: DISEASE
  Entity: breath | Type: DISEASE
  Entity: . | Type: DISEASE
  Entity: history | Type: DISEASE
  Entity: of | Type: DISEASE
  Entity: h | Type: DISEASE
  Entity: yper | Type: DISEASE
  Entity: ion | Type: DISEASE
  Entity: .[SEP] | Type: DISEASE

Note 2: Diabetic patient with fatigue and nausea. No chest pain.
  Entity: [CLS] | Type: DISEASE
  Entity: abe | Type: DISEASE
  Entity: tic | Type: DISEASE
  Entity: patient | Type: DISEASE
  Entity: with | Type: DISEASE
  Entity: fatigue | Type: DISEASE
  Entity: and | Type: DISEASE
  Entity: nausea | Type: DISEASE
  Entity: . | Type: DISEASE
  Entity: nochest | Type: SYMPTOM
  Entity: pain | Type: SYMPTOM
  Entity: .[SEP] | Type: DISEASE



## 2. Retrieval-Augmented Generation (RAG) Pipeline
This section demonstrates a simple RAG pipeline using local models and custom retrievers/generators for clinical QA.

In [2]:
# Simple RAG pipeline demo
def simple_retriever(query, docs):
    # Return the most relevant document (here, just the first for demo)
    return docs[0]

def simple_generator(text):
    # Simulate LLM answer generation
    return f"LLM answer based on: {text}"

# Example documents and query
documents = [
    "Patient 123 has diabetes and hypertension.",
    "Patient 456 has asthma and no history of diabetes."
 ]
query = "What is the diagnosis for patient 123?"

# RAG workflow
retrieved_doc = simple_retriever(query, documents)
generated_answer = simple_generator(retrieved_doc)
print("Query:", query)
print("Retrieved Document:", retrieved_doc)
print("Generated Answer:", generated_answer)

Query: What is the diagnosis for patient 123?
Retrieved Document: Patient 123 has diabetes and hypertension.
Generated Answer: LLM answer based on: Patient 123 has diabetes and hypertension.


## 3. Vector Database Integration (FAISS)
This section demonstrates how to use FAISS for semantic search and retrieval in clinical NLP workflows.

### Alternative: Vector Database Integration with Annoy
FAISS is not currently supported on Python 3.13. Annoy is a pure Python library for approximate nearest neighbor search and works with the latest Python versions. Below is a demo using Annoy for semantic search in clinical NLP workflows.

In [7]:
# Annoy vector search demo (works with Python 3.13)
# Install Annoy if not already installed
import sys
try:
    from annoy import AnnoyIndex
except ImportError:
    import subprocess
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'annoy'])
    from annoy import AnnoyIndex
import numpy as np

# Create example embeddings (2D for demo)
embeddings = np.array([[0.1, 0.2], [0.2, 0.1], [0.9, 0.8]], dtype='float32')
f = embeddings.shape[1]
index = AnnoyIndex(f, 'euclidean')
for i, vec in enumerate(embeddings):
    index.add_item(i, vec)
index.build(10)  # 10 trees

# Query embedding
query_embedding = [0.15, 0.15]
nearest_indices = index.get_nns_by_vector(query_embedding, 2, include_distances=True)
print("Query embedding:", query_embedding)
print("Top 2 nearest indices:", nearest_indices[0])
print("Distances:", nearest_indices[1])

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting annoy
  Downloading annoy-1.17.3.tar.gz (647 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.5/647.5 kB[0m [31m9.6 MB/s[0m  [33m0:00:00[0m
[?25h  Installing build dependencies: started
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.5/647.5 kB[0m [31m9.6 MB/s[0m  [33m0:00:00[0m
[?25h  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'd

## 4. Prompt Engineering and Finetuning
This section demonstrates prompt engineering and basic finetuning techniques using Hugging Face Transformers.

In [9]:
# Prompt engineering demo with Hugging Face Transformers
from transformers import pipeline

# Use a fill-mask pipeline for prompt engineering
fill_mask = pipeline('fill-mask', model='bert-base-uncased')
prompt = "The patient was diagnosed with [MASK]."
results = fill_mask(prompt)
print("Prompt:", prompt)
for result in results[:3]:
    print(f"Prediction: {result['token_str']} | Score: {result['score']:.4f}")

# Finetuning demo (conceptual, not executed)
print("\nFinetuning: Use Trainer API with your labeled dataset for supervised training.")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The cu

Prompt: The patient was diagnosed with [MASK].
Prediction: cancer | Score: 0.5427
Prediction: leukemia | Score: 0.0917
Prediction: schizophrenia | Score: 0.0470

Finetuning: Use Trainer API with your labeled dataset for supervised training.


## 5. Bias Detection, Model Guardrails, and Safety Checks
This section demonstrates basic bias detection and safety checks for NLP models using Python and scikit-learn.

In [None]:
# Bias detection and safety checks demo
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

# Example predictions and true labels for two groups
y_true = np.array([1, 0, 1, 0, 1, 0])  # 1: Disease, 0: No Disease
y_pred_group1 = np.array([1, 0, 1, 0, 1, 0])  # Group 1 predictions
y_pred_group2 = np.array([0, 0, 1, 0, 0, 0])  # Group 2 predictions

print("Group 1 Classification Report:")
print(classification_report(y_true, y_pred_group1))

print("Group 2 Classification Report:")
print(classification_report(y_true, y_pred_group2))

# Simple bias check: Compare accuracy between groups
acc_group1 = np.mean(y_true == y_pred_group1)
acc_group2 = np.mean(y_true == y_pred_group2)
print(f"Accuracy Group 1: {acc_group1:.2f}")
print(f"Accuracy Group 2: {acc_group2:.2f}")
if abs(acc_group1 - acc_group2) > 0.2:
    print("Warning: Potential bias detected between groups!")

## 6. Cloud Integration (AWS, S3, Cloud ML Workflows)
This section demonstrates how to integrate with cloud platforms for data storage, model deployment, and ML workflows.

In [None]:
# Cloud integration demo: Upload file to AWS S3 (requires AWS credentials)
import boto3

# Example: Upload a file to S3 (conceptual, not executed)
def upload_to_s3(file_path, bucket, object_name):
    s3 = boto3.client('s3')
    try:
        s3.upload_file(file_path, bucket, object_name)
        print(f"Uploaded {file_path} to s3://{bucket}/{object_name}")
    except Exception as e:
        print("Error uploading to S3:", e)

# Example usage (commented out)
# upload_to_s3('model.pt', 'my-ml-bucket', 'models/model.pt')

print("For full cloud ML workflows, use AWS SageMaker for training/deployment.")

## 7. PEFT/SFT Advanced Finetuning
This section demonstrates parameter-efficient finetuning (PEFT/SFT) using Hugging Face PEFT library for LLMs.

In [None]:
# PEFT/SFT advanced finetuning demo (conceptual)
# Requires: pip install peft transformers datasets
from peft import get_peft_model, LoraConfig
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load base model and tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Configure LoRA (Low-Rank Adaptation) for PEFT
lora_config = LoraConfig(r=8, lora_alpha=32, target_modules=["query", "value"], lora_dropout=0.1)
peft_model = get_peft_model(model, lora_config)

print("PEFT model ready for parameter-efficient finetuning.")
print("For full training, use Trainer API with your labeled dataset.")