In [None]:
!pip install -qU langchain langchain_experimental langchain-openai

# Generate Synthetic Data

Synthetic data is used to simulate real data without compromising privacy or encountering real-world limitations. However, synthetic data should be used carefully, as it may not always capture real-world complexities.

## Setup

In [None]:
import os

langchain_api_key = 'your_langchain_api_key_here'  # Replace with your actual LangChain API key
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_API_KEY'] = langchain_api_key

openai_api_key = 'your_openai_api_key_here'  # Replace with your actual OpenAI API key
os.environ['OPENAI_API_KEY'] = openai_api_key

In [3]:
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain_experimental.tabular_synthetic_data.openai import (
    OPENAI_TEMPLATE,
    create_openai_data_generator,
)
from langchain_experimental.tabular_synthetic_data.prompts import (
    SYNTHETIC_FEW_SHOT_PREFIX,
    SYNTHETIC_FEW_SHOT_SUFFIX,
)
from langchain_openai import ChatOpenAI
from pydantic import BaseModel

## 1.Define Our Data Model

Every dataset has a structure or a "schema". The `MedicalBilling` class below serves as our schema for the synthetic data. By defining this, we inform our synthetic data generator about the shape and nature of data we execpt.

In [4]:
class MedicalBilling(BaseModel):
    patient_id: int
    patient_name: str
    diagnosis_code: str
    procedure_code: str
    total_charge: float
    insurance_claim_amount: float

## 2.Sample Data

To guide the synthetic data generator, it is useful to provide it with a few real-world-like examples. These examples serve as a "seed" - they are representative of the kind of data we want, and the generator will use them to create data that looks similar to our expectations.

In [5]:
examples = [
    {
        'example': """Patient ID: 123456, Patient Name: John Doe, Diagnosis Code:
        J20.9, Procedure Code: 99203, Total Charge: $500, Insurance Claim Amount: $350"""
    },
    {
        "example": """Patient ID: 789012, Patient Name: Johnson Smith, Diagnosis
        Code: M54.5, Procedure Code: 99213, Total Charge: $150, Insurance Claim Amount: $120"""
    },
    {
        "example": """Patient ID: 345678, Patient Name: Emily Stone, Diagnosis Code:
        E11.9, Procedure Code: 99214, Total Charge: $300, Insurance Claim Amount: $250"""
    },
]

## 3.Craft a Prompt Template

The generator does not know how to create our data; we need to guide it by creating a prompt template. This template helps instruct the underlying language model on how to produce synthetic data in the desired format.

In [6]:
OPENAI_TEMPLATE = PromptTemplate(input_variables=['example'],
                                 template='{example}')

prompt_template = FewShotPromptTemplate(
    prefix=SYNTHETIC_FEW_SHOT_PREFIX,
    suffix=SYNTHETIC_FEW_SHOT_SUFFIX,
    examples=examples,
    input_variables=['subject', 'extra'],
    example_prompt=OPENAI_TEMPLATE,
)

The `FewShotPromptTemplate` includes:
* `prefix` and `suffix`: contain guiding context or instructions.
* `examples`: sample data we defined earlier.
* `input_variables`: these variables (`'subject'`, `'extra'`) are placeholders we can dynamically fill later.
* `example_prompt`: prompt template is the format we want each example row to take in our prompt.

## 4.Creating the Data Generator

With the schema and the prompt ready, we can create the data generator. This object knows how to communicate with the underlying language model to generate synthetic data.

In [7]:
synthetic_data_generator = create_openai_data_generator(
    output_schema=MedicalBilling,
    llm=ChatOpenAI(model='gpt-3.5-turbo', temperature=1),
    prompt=prompt_template,
)

## 5.Generate Synthetic Data

In [8]:
synthetic_results = synthetic_data_generator.generate(
    subject='medical_billing',
    extra='the name must be chosen at random. Make it something you would not normally choose.',
    runs=10,
)

The output will be a list of the `MedicalBilling` pydantic model.

In [9]:
synthetic_results

[MedicalBilling(patient_id=987654, patient_name='Samantha Black', diagnosis_code='K29.7', procedure_code='99204', total_charge=400.0, insurance_claim_amount=300.0),
 MedicalBilling(patient_id=123456, patient_name='Xavier Brown', diagnosis_code='L13.9', procedure_code='99203', total_charge=250.0, insurance_claim_amount=200.0),
 MedicalBilling(patient_id=456789, patient_name='Jennifer Smith', diagnosis_code='M25.5', procedure_code='99213', total_charge=350.0, insurance_claim_amount=275.0),
 MedicalBilling(patient_id=789012, patient_name='Ezekiel Jackson', diagnosis_code='F41.1', procedure_code='99214', total_charge=500.0, insurance_claim_amount=400.0),
 MedicalBilling(patient_id=987654, patient_name='Aurora Patel', diagnosis_code='I10', procedure_code='99204', total_charge=450.0, insurance_claim_amount=350.0),
 MedicalBilling(patient_id=123456, patient_name='Xander Thompson', diagnosis_code='K22.3', procedure_code='99212', total_charge=300.0, insurance_claim_amount=240.0),
 MedicalBillin

## Other implementations

In [10]:
from langchain_experimental.synthetic_data import (
    DatasetGenerator,
    create_data_generation_chain,
)

In [11]:
# LLM
model = ChatOpenAI(model='gpt-3.5-turbo', temperature=0.7)
chain = create_data_generation_chain(model)

In [12]:
chain({'fields': ['blue', 'yellow'], 'preferences': {}})

  chain({'fields': ['blue', 'yellow'], 'preferences': {}})


{'fields': ['blue', 'yellow'],
 'preferences': {},
 'text': 'The vibrant blue sky contrasted beautifully against the golden yellow sunflower fields, creating a stunning and picturesque scene that captured the essence of a perfect summer day.'}

In [13]:
chain(
    {
        "fields": {"colors": ["blue", "yellow"]},
        "preferences": {"style": "Make it in a style of a weather forecast."},
    }
)

{'fields': {'colors': ['blue', 'yellow']},
 'preferences': {'style': 'Make it in a style of a weather forecast.'},
 'text': "In today's weather forecast, skies will be a stunning blend of blue and yellow, creating a vibrant and picturesque horizon for all to enjoy."}

In [14]:
chain(
    {
        "fields": {"actor": "Tom Hanks", "movies": ["Forrest Gump", "Green Mile"]},
        "preferences": None,
    }
)

{'fields': {'actor': 'Tom Hanks', 'movies': ['Forrest Gump', 'Green Mile']},
 'preferences': None,
 'text': 'Tom Hanks, known for his iconic roles in movies such as "Forrest Gump" and "Green Mile", has captivated audiences worldwide with his exceptional talent and versatility as an actor.'}

In [15]:
chain(
    {
        "fields": [
            {"actor": "Tom Hanks", "movies": ["Forrest Gump", "Green Mile"]},
            {"actor": "Mads Mikkelsen", "movies": ["Hannibal", "Another round"]},
        ],
        "preferences": {"minimum_length": 200, "style": "gossip"},
    }
)

{'fields': [{'actor': 'Tom Hanks', 'movies': ['Forrest Gump', 'Green Mile']},
  {'actor': 'Mads Mikkelsen', 'movies': ['Hannibal', 'Another round']}],
 'preferences': {'minimum_length': 200, 'style': 'gossip'},
 'text': 'Rumor has it that Tom Hanks, known for his iconic roles in "Forrest Gump" and "Green Mile", has been spotted having lunch with the charming and enigmatic Mads Mikkelsen, star of "Hannibal" and "Another Round", sparking speculation of a potential collaboration between the two talented actors.'}

### Generating exemplary dataset for extraction benchmarking purposes

In [16]:
inp = [
    {
        "Actor": "Tom Hanks",
        "Film": [
            "Forrest Gump",
            "Saving Private Ryan",
            "The Green Mile",
            "Toy Story",
            "Catch Me If You Can",
        ],
    },
    {
        "Actor": "Tom Hardy",
        "Film": [
            "Inception",
            "The Dark Knight Rises",
            "Mad Max: Fury Road",
            "The Revenant",
            "Dunkirk",
        ],
    },
]

generator = DatasetGenerator(model,
                             {'style': 'informal',
                              'minimal length': 500})

dataset = generator(inp)

dataset

[{'fields': {'Actor': 'Tom Hanks',
   'Film': ['Forrest Gump',
    'Saving Private Ryan',
    'The Green Mile',
    'Toy Story',
    'Catch Me If You Can']},
  'preferences': {'style': 'informal', 'minimal length': 500},
  'text': 'Tom Hanks, known for his iconic roles in films such as Forrest Gump, Saving Private Ryan, The Green Mile, Toy Story, and Catch Me If You Can, has solidified himself as one of the most versatile and talented actors in Hollywood.'},
 {'fields': {'Actor': 'Tom Hardy',
   'Film': ['Inception',
    'The Dark Knight Rises',
    'Mad Max: Fury Road',
    'The Revenant',
    'Dunkirk']},
  'preferences': {'style': 'informal', 'minimal length': 500},
  'text': 'Tom Hardy, known for his versatile performances, has showcased his talent in a variety of blockbuster films such as "Inception," "The Dark Knight Rises," "Mad Max: Fury Road," "The Revenant," and "Dunkirk." His ability to seamlessly embody complex characters across different genres has solidified his reputatio

### Extraction from generated examples

We can extract output from this generated data and compare it with our case.

In [18]:
from typing import List

from langchain.chains import create_extraction_chain_pydantic
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import OpenAI
from pydantic import BaseModel, Field

In [19]:
class Actor(BaseModel):
    Actor: str = Field(description='name of an actor')
    Film: List[str] = Field(description='list of names of films they starred in')

#### Parsers

In [22]:
llm = OpenAI()
parser = PydanticOutputParser(pydantic_object=Actor)

prompt = PromptTemplate(
    template="Extract fields from a give ntext.\n{format_instructions}\n{text}\n",
    input_variables=['text'],
    partial_variables={'format_instructions': parser.get_format_instructions()},
)

_input = prompt.format_prompt(text=dataset[0]['text'])
output = llm(_input.to_string())

parsed = parser.parse(output)
parsed

Actor(Actor='Tom Hanks', Film=['Forrest Gump', 'Saving Private Ryan', 'The Green Mile', 'Toy Story', 'Catch Me If You Can'])

In [23]:
(parsed.Actor == inp[0]["Actor"]) & (parsed.Film == inp[0]["Film"])

True

#### Extractors

In [24]:
extractor = create_extraction_chain_pydantic(pydantic_schema=Actor, llm=model)
extracted = extractor.run(dataset[1]["text"])
extracted

  extractor = create_extraction_chain_pydantic(pydantic_schema=Actor, llm=model)
  extracted = extractor.run(dataset[1]["text"])


[Actor(Actor='Tom Hardy', Film=['Inception', 'The Dark Knight Rises', 'Mad Max: Fury Road', 'The Revenant', 'Dunkirk'])]

In [25]:
(extracted[0].Actor == inp[1]["Actor"]) & (extracted[0].Film == inp[1]["Film"])

True