In [1]:
%pip install --upgrade --quiet  langchain langchain_experimental langchain-openai


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
import dotenv
dotenv.load_dotenv()

from langchain.prompts import FewShotPromptTemplate, PromptTemplate
# from langchain_core.pydantic_v1 import BaseModel
from pydantic import BaseModel
from langchain_experimental.tabular_synthetic_data.openai import (
    OPENAI_TEMPLATE,
    create_openai_data_generator,
)
from langchain_experimental.tabular_synthetic_data.prompts import (
    SYNTHETIC_FEW_SHOT_PREFIX,
    SYNTHETIC_FEW_SHOT_SUFFIX,
)
from langchain_openai import ChatOpenAI

In [12]:
class MedicalBilling(BaseModel):
    patient_id: int
    patient_name: str
    diagnosis_code: str
    procedure_code: str
    total_charge: float
    insurance_claim_amount: float
    
examples = [
    {
        "example": """Patient ID: 123456, Patient Name: John Doe, Diagnosis Code: 
        J20.9, Procedure Code: 99203, Total Charge: $500, Insurance Claim Amount: $350"""
    },
    {
        "example": """Patient ID: 789012, Patient Name: Johnson Smith, Diagnosis 
        Code: M54.5, Procedure Code: 99213, Total Charge: $150, Insurance Claim Amount: $120"""
    },
    {
        "example": """Patient ID: 345678, Patient Name: Emily Stone, Diagnosis Code: 
        E11.9, Procedure Code: 99214, Total Charge: $300, Insurance Claim Amount: $250"""
    },
]

In [13]:
OPENAI_TEMPLATE = PromptTemplate(input_variables=["example"], template="{example}")

prompt_template = FewShotPromptTemplate(
    prefix=SYNTHETIC_FEW_SHOT_PREFIX,
    examples=examples,
    suffix=SYNTHETIC_FEW_SHOT_SUFFIX,
    input_variables=["subject", "extra"],
    example_prompt=OPENAI_TEMPLATE,
)

In [14]:
synthetic_data_generator = create_openai_data_generator(
    output_schema=MedicalBilling,
    llm=ChatOpenAI(
        temperature=1
    ),  # You'll need to replace with your actual Language Model instance
    prompt=prompt_template,
)

In [15]:
synthetic_results = synthetic_data_generator.generate(
    subject="medical_billing",
    extra="the name must be chosen at random. Make it something you wouldn't normally choose.",
    runs=10,
)

In [16]:
synthetic_results

[MedicalBilling(patient_id=987654, patient_name='Samantha White', diagnosis_code='C72.9', procedure_code='99204', total_charge=450.0, insurance_claim_amount=380.0),
 MedicalBilling(patient_id=123456, patient_name='Ezekiel Black', diagnosis_code='R07.9', procedure_code='99203', total_charge=200.0, insurance_claim_amount=160.0),
 MedicalBilling(patient_id=789012, patient_name='Harper Smith', diagnosis_code='I50.9', procedure_code='99213', total_charge=350.0, insurance_claim_amount=300.0),
 MedicalBilling(patient_id=456789, patient_name='Maverick Johnson', diagnosis_code='F32.9', procedure_code='99205', total_charge=500.0, insurance_claim_amount=420.0),
 MedicalBilling(patient_id=987654, patient_name='Amara Singh', diagnosis_code='M54.5', procedure_code='99204', total_charge=275.0, insurance_claim_amount=220.0),
 MedicalBilling(patient_id=123456, patient_name='Beckett Thompson', diagnosis_code='G41.9', procedure_code='99214', total_charge=400.0, insurance_claim_amount=350.0),
 MedicalBill

In [17]:
for result in synthetic_results[0]:
    print(result)

('patient_id', 987654)
('patient_name', 'Samantha White')
('diagnosis_code', 'C72.9')
('procedure_code', '99204')
('total_charge', 450.0)
('insurance_claim_amount', 380.0)


In [19]:
import csv
from typing import Type, List

def basemodel_to_csv(model: Type[BaseModel], data: List[BaseModel], filename: str):
    # Get field names from the model
    field_names = list(model.__fields__.keys())
    
    with open(filename, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=field_names)
        
        # Write the header
        writer.writeheader()
        
        # Write the data
        for record in data:
            row = {}
            for field in field_names:
                value = getattr(record, field)
                if isinstance(value, list):
                    row[field] = ', '.join(map(str, value))
                else:
                    row[field] = str(value)
            writer.writerow(row)

# Example
basemodel_to_csv(MedicalBilling, synthetic_results, 'medical_billing_records.csv')

C:\Users\ASUS\AppData\Local\Temp\ipykernel_52060\3835672104.py:6: PydanticDeprecatedSince20: The `__fields__` attribute is deprecated, use `model_fields` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  field_names = list(model.__fields__.keys())


In [9]:
from langchain_experimental.synthetic_data import (
    DatasetGenerator,
    create_data_generation_chain,
)
from langchain_openai import ChatOpenAI

# LLM
model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.7)
chain = create_data_generation_chain(model)

In [10]:
chain({"fields": ["blue", "yellow"], "preferences": {}})



  chain({"fields": ["blue", "yellow"], "preferences": {}})


{'fields': ['blue', 'yellow'],
 'preferences': {},
 'text': 'The blue sky was painted with streaks of yellow as the sun began to set, casting a warm and vibrant glow over the tranquil landscape.'}

In [11]:
chain(
    {
        "fields": {"colors": ["blue", "yellow"]},
        "preferences": {"style": "Make it in a style of a weather forecast."},
    }
)

{'fields': {'colors': ['blue', 'yellow']},
 'preferences': {'style': 'Make it in a style of a weather forecast.'},
 'text': "Today's forecast calls for a mix of blue and yellow hues in the sky, creating a stunning and vibrant contrast that will surely catch the eye of all who look up."}

In [12]:
chain(
    {
        "fields": {"actor": "Tom Hanks", "movies": ["Forrest Gump", "Green Mile"]},
        "preferences": None,
    }
)

{'fields': {'actor': 'Tom Hanks', 'movies': ['Forrest Gump', 'Green Mile']},
 'preferences': None,
 'text': 'Tom Hanks, known for his iconic roles in films such as "Forrest Gump" and "Green Mile," has captivated audiences worldwide with his versatile acting skills and undeniable charm.'}

In [13]:
chain(
    {
        "fields": [
            {"actor": "Tom Hanks", "movies": ["Forrest Gump", "Green Mile"]},
            {"actor": "Mads Mikkelsen", "movies": ["Hannibal", "Another round"]},
        ],
        "preferences": {"minimum_length": 200, "style": "gossip"},
    }
)

{'fields': [{'actor': 'Tom Hanks', 'movies': ['Forrest Gump', 'Green Mile']},
  {'actor': 'Mads Mikkelsen', 'movies': ['Hannibal', 'Another round']}],
 'preferences': {'minimum_length': 200, 'style': 'gossip'},
 'text': 'Rumor has it that Tom Hanks, known for his iconic roles in movies such as "Forrest Gump" and "Green Mile", may be teaming up with the talented Mads Mikkelsen, famous for his chilling performances in "Hannibal" and "Another round", for an upcoming blockbuster film that is sure to captivate audiences worldwide.'}

In [14]:
inp = [
    {
        "Actor": "Tom Hanks",
        "Film": [
            "Forrest Gump",
            "Saving Private Ryan",
            "The Green Mile",
            "Toy Story",
            "Catch Me If You Can",
        ],
    },
    {
        "Actor": "Tom Hardy",
        "Film": [
            "Inception",
            "The Dark Knight Rises",
            "Mad Max: Fury Road",
            "The Revenant",
            "Dunkirk",
        ],
    },
]

generator = DatasetGenerator(model, {"style": "informal", "minimal length": 500})
dataset = generator(inp)

In [15]:
dataset

[{'fields': {'Actor': 'Tom Hanks',
   'Film': ['Forrest Gump',
    'Saving Private Ryan',
    'The Green Mile',
    'Toy Story',
    'Catch Me If You Can']},
  'preferences': {'style': 'informal', 'minimal length': 500},
  'text': 'Tom Hanks is a versatile actor known for his iconic roles in films such as "Forrest Gump," "Saving Private Ryan," "The Green Mile," "Toy Story," and "Catch Me If You Can," showcasing his talent across various genres and captivating audiences with his genuine performances.'},
 {'fields': {'Actor': 'Tom Hardy',
   'Film': ['Inception',
    'The Dark Knight Rises',
    'Mad Max: Fury Road',
    'The Revenant',
    'Dunkirk']},
  'preferences': {'style': 'informal', 'minimal length': 500},
  'text': 'Tom Hardy, known for his versatility and intensity, has delivered gripping performances in a range of films such as "Inception," "The Dark Knight Rises," "Mad Max: Fury Road," "The Revenant," and "Dunkirk," solidifying his reputation as one of the most talented acto

Extraction

In [16]:
from typing import List

from langchain.chains import create_extraction_chain_pydantic
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import OpenAI
from pydantic import BaseModel, Field

In [17]:
class Actor(BaseModel):
    Actor: str = Field(description="name of an actor")
    Film: List[str] = Field(description="list of names of films they starred in")

In [18]:
llm = OpenAI()
parser = PydanticOutputParser(pydantic_object=Actor)

prompt = PromptTemplate(
    template="Extract fields from a given text.\n{format_instructions}\n{text}\n",
    input_variables=["text"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

_input = prompt.format_prompt(text=dataset[0]["text"])
output = llm(_input.to_string())

parsed = parser.parse(output)
parsed

  output = llm(_input.to_string())


Actor(Actor='Tom Hanks', Film=['Forrest Gump', 'Saving Private Ryan', 'The Green Mile', 'Toy Story', 'Catch Me If You Can'])

In [19]:
(parsed.Actor == inp[0]["Actor"]) & (parsed.Film == inp[0]["Film"])

True

In [20]:
extractor = create_extraction_chain_pydantic(pydantic_schema=Actor, llm=model)
extracted = extractor.run(dataset[1]["text"])
extracted

  extractor = create_extraction_chain_pydantic(pydantic_schema=Actor, llm=model)
  extracted = extractor.run(dataset[1]["text"])


[Actor(Actor='Tom Hardy', Film=['Inception', 'The Dark Knight Rises', 'Mad Max: Fury Road', 'The Revenant', 'Dunkirk'])]

In [21]:
(extracted[0].Actor == inp[1]["Actor"]) & (extracted[0].Film == inp[1]["Film"])

True