# Medical Abbreviation Data Preparation Tool

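This notebook prepares fine-tuning data in two stages. First, it builds example pairs that teach a model to rewrite ICD-10 descriptions using standardized medical abbreviations. Then it applies that fine-tuned abbreviation model to a sample of ICD-10 descriptions and uses the abbreviated text to create training data for a model that generates ICD-10 codes from descriptions.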

In [1]:
# Import required libraries
import pandas as pd
import json
import numpy as np
from openai import OpenAI
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

## 1. Sample Data Creation

Create example pairs of original ICD-10 descriptions and their abbreviated versions.

In [2]:
# Hand-written examples: each entry is (full ICD-10 description, abbreviated form)
icd_terms = [
    (
        "Foreign body granuloma of soft tissue, not elsewhere classified, unspecified hand",
        "Foreign body gran. of soft tissue, not elsewhere class., unspec. hand",
    ),
    (
        "Disease of spinal cord, unspecified",
        "Disease of spin. cord, unspec.",
    ),
    (
        "Acute respiratory failure, unspecified whether with hypoxia or hypercapnia",
        "Acute resp. failure, unspec. whether w/ hypoxia or hypercap.",
    ),
    (
        "Hordeolum internum right eye, unspecified eyelid",
        "Hordeolum int. rt. eye, unspec. eyelid",
    ),
    (
        "Viral infection, unspecified",
        "Viral infect., unspec.",
    ),
    # Add more pairs as needed
]

In [3]:
# Write the example pairs as chat-format fine-tuning records, one JSON object per line
system_prompt = "Generate the input ICD-10 descriptions with one or more standard medical abbreviations (using dots, e.g., 'unspecified' → 'unspec.', 'respiratory' → 'resp.'), while maintaining word order and medical meaning"


def _as_chat_record(full_text, short_text):
    """Build one system/user/assistant training record for a description pair."""
    return {
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": full_text},
            {"role": "assistant", "content": short_text},
        ]
    }


with open('fine_tuning_abb.jsonl', 'w') as outfile:
    outfile.writelines(
        json.dumps(_as_chat_record(original, abbreviated)) + '\n'
        for original, abbreviated in icd_terms
    )
print("Training data saved to fine_tuning_abb.jsonl")

Training data saved to fine_tuning_abb.jsonl


## 2. Process ICD-10 Codes Dataset

In [4]:
# Initialize the OpenAI client.
# The API key is intentionally NOT written in the notebook: with no api_key argument
# the client reads it from the OPENAI_API_KEY environment variable, so the secret
# never ends up in saved or shared copies of this file.
client = OpenAI()

In [5]:
# Load ICD-10 dataset (adjust the path as needed)
filename = 'icd_codes_output.jsonl'

# Each line of the JSONL file becomes one row with a 'messages' list
df = pd.read_json(filename, lines=True)


def _content_by_role(messages):
    """Return the content of the system/user/assistant messages (None when a role is absent).

    If a role appears more than once, the last occurrence wins.
    """
    found = {'system': None, 'user': None, 'assistant': None}
    for message in messages:
        role = message['role']
        if role in found:
            found[role] = message['content']
    return found


# Split the chat messages into one column per role
extracted = [_content_by_role(messages) for messages in df['messages']]
for role in ('system', 'user', 'assistant'):
    df[role] = pd.Series([entry[role] for entry in extracted], index=df.index, dtype=object)

# The raw messages are no longer needed; the assistant reply is the label
df = df.drop(columns='messages').rename(columns={'assistant': 'ground_truth'})

In [6]:
# Sample a subset of the data for processing.
# DataFrame.sample raises a ValueError when n exceeds the number of rows (sampling
# without replacement), so cap the sample size at the size of the dataset.
sample_size = min(10000, len(df))
sampled_df = df.sample(n=sample_size, random_state=1)  # Fixed seed for reproducibility
# Filled in later by the fine-tuned model; None means "not processed yet"
sampled_df['abbreviated_description'] = None

## 3. Apply Abbreviations with Fine-tuned Model

In [7]:
def process_text(text, model="ft:gpt-4o-mini-2024-07-18:personal::AOfLUHmg", api_client=None):
    """Generate an abbreviated version of a medical description with the fine-tuned model.

    Args:
        text: The ICD-10 description to abbreviate; sent as the user message.
        model: ID of the fine-tuned chat model. Replace the default with your
            own fine-tuned model ID, or pass it explicitly.
        api_client: Object exposing ``chat.completions.create``. When omitted,
            the notebook-level ``client`` is used.

    Returns:
        The content of the first choice returned by the model.
    """
    if api_client is None:
        # Fall back to the client created earlier in the notebook
        api_client = client
    response = api_client.chat.completions.create(
        model=model,
        max_tokens=1000,
        messages=[
            {"role": "system", "content": "Generate the input ICD-10 descriptions with one or more standard medical abbreviations (using dots, e.g., 'unspecified' → 'unspec.', 'respiratory' → 'resp.'), while maintaining word order and medical meaning"},
            {"role": "user", "content": text},
        ]
    )
    return response.choices[0].message.content

def process_row(index, row):
    """Abbreviate the 'user' text of one dataframe row and return it paired with the row's index."""
    abbreviated = process_text(row['user'])
    return index, abbreviated

In [8]:
# Process a batch of descriptions using multiple threads
results = {}
failed = {}  # row index -> exception raised while processing that row
batch_size = 1000  # Adjust based on your needs
df_to_process = sampled_df.iloc[:batch_size]

# Only submit rows that do not have an abbreviation yet, so re-running this cell
# after a partial failure retries the missing rows instead of repeating every call.
pending_df = df_to_process[df_to_process['abbreviated_description'].isna()]

with ThreadPoolExecutor(max_workers=30) as executor:
    future_to_row = {executor.submit(process_row, idx, row): idx for idx, row in pending_df.iterrows()}
    for future in tqdm(as_completed(future_to_row), total=len(future_to_row), desc="Processing text"):
        try:
            index, result = future.result()
        except Exception as exc:
            # Keep draining the remaining futures; one failed request must not
            # throw away the results of all the requests that succeeded.
            failed[future_to_row[future]] = exc
            continue
        results[index] = result

# Update dataframe with results (done before reporting failures so completed work is kept)
for index, result in results.items():
    sampled_df.at[index, 'abbreviated_description'] = result

# Fail loudly, after the successful results are stored, if any request did not complete
if failed:
    first_index, first_error = next(iter(failed.items()))
    raise RuntimeError(
        f"{len(failed)} of {len(future_to_row)} requests failed "
        f"(first failure at row {first_index}: {first_error!r}). "
        "Successful results were saved to sampled_df; re-run this cell to retry the missing rows."
    ) from first_error

Processing text: 100%|██████████| 10000/10000 [1:45:12<00:00, 1.58it/s]


In [9]:
# Persist the sampled rows, including the generated abbreviations, without the index column
output_csv = 'processed_medical_abbreviations.csv'
sampled_df.to_csv(output_csv, index=False)

## 4. Generate Training Data for Model Fine-tuning

In [10]:
# Filter rows where abbreviation was actually applied.
# Rows that were never processed still hold None in 'abbreviated_description'; None
# compares as "different" from the original text, so without the notna() check those
# rows would be written to the training file with a null user message.
processed_mask = sampled_df['abbreviated_description'].notna()
differs_mask = sampled_df['user'] != sampled_df['abbreviated_description']
changed_df = sampled_df[processed_mask & differs_mask]

# Create JSONL file for fine-tuning from processed data:
# abbreviated description as the prompt, ground-truth value as the expected answer
with open('abbreviated_medical_terms_training.jsonl', 'w') as outfile:
    for index, row in changed_df.iterrows():
        json_obj = {
            "messages": [
                {
                    "role": "system",
                    "content": "You generate accurate ICD-10 codes based on descriptions."
                },
                {
                    "role": "user",
                    "content": row['abbreviated_description']
                },
                {
                    "role": "assistant",
                    "content": row['ground_truth']
                }
            ]
        }
        json_line = json.dumps(json_obj)
        outfile.write(json_line + '\n')

print("Fine-tuning data generated successfully")

Fine-tuning data generated successfully


## 5. Analysis

In [11]:
# Count how many descriptions were actually abbreviated.
# Only rows that have a model output are considered: unprocessed rows hold None, which
# would otherwise be counted as "abbreviated" and inflate both the count and the percentage.
processed_mask = sampled_df['abbreviated_description'].notna()
changed_mask = processed_mask & (sampled_df['user'] != sampled_df['abbreviated_description'])

changed_count = int(changed_mask.sum())
total_count = int(processed_mask.sum())
# Avoid a ZeroDivisionError when nothing has been processed yet
percentage = (changed_count / total_count) * 100 if total_count else 0.0

print(f"Number of descriptions that were abbreviated: {changed_count}")
print(f"Percentage of data that was abbreviated: {percentage:.2f}%")

Number of descriptions that were abbreviated: 7791
Percentage of data that was abbreviated: 77.91%


In [12]:
# Show the first three rows side by side: prompt, original text, label, and model output
display_columns = ['system', 'user', 'ground_truth', 'abbreviated_description']
sample_results = sampled_df.head(3)[display_columns]
sample_results

Unnamed: 0,system,user,ground_truth,abbreviated_description
0,You generate accurate ICD-10 codes based on de...,Sjogren syndrome with myopathy,M3503,Sjogren syndrome with myopath.
1,You generate accurate ICD-10 codes based on de...,"Toxic effect of strychnine and its salts, assa...",T651X3A,"Toxic effect of strychnine and its salts, assa..."
2,You generate accurate ICD-10 codes based on de...,"Generalized gingival recession, unspecified",K06020,"Generalized gingival recession, unspec."
