### Ultrasound ED Daily Conversion Rate ###

Import Relevant Packages

In [86]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import scipy as sp 
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, AdaBoostRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.feature_selection import SelectFromModel
from scipy.stats import randint, uniform
from sklearn.model_selection import learning_curve
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import joblib
import xgboost as xgb
import gzip


Import CSV & Data Preparation

In [87]:
# Load the dataset
df = pd.read_csv('/Users/hari/Documents/synthetic_ed_data_7_years.csv')

# Drop irrelevant columns
df = df.drop(columns=['Patient Name'])

# Map Y/N values to 1 and 0 for 'Ordered Ultrasound (Y/N)'
df['Ordered Ultrasound (Y/N)'] = df['Ordered Ultrasound (Y/N)'].map({'Y': 1, 'N': 0})

# Create a new column 'Ordered Ultrasound Dummified'
df['Ordered Ultrasound Dummified'] = df['Ordered Ultrasound (Y/N)']

# Replace NaN values for Ultrasound Types, Date Ordered with 'Not Ordered' when ultrasound not ordered
df.loc[df['Ordered Ultrasound (Y/N)'] == 0, 'Ultrasound Type'] = 'Not Ordered'
df.loc[df['Ordered Ultrasound (Y/N)'] == 0, 'Time Ultrasound Ordered'] = 'Not Ordered'
df.loc[df['Ordered Ultrasound (Y/N)'] == 0, 'Date Ultrasound Ordered'] = 'Not Ordered'

# Remove priority codes where ultrasound was not ordered
priority_code_columns = [col for col in df.columns if col.startswith('Priority Code')]
df.loc[df['Ordered Ultrasound (Y/N)'] == 0, priority_code_columns] = None

# Dummify categorical columns
df = pd.get_dummies(df, columns=['Priority Code', 'Ultrasound Type', 'Attending Physician'], dummy_na=True)

# Convert age to years and apply binning
def convert_age_to_years(age_str):
    if isinstance(age_str, str):
        if 'year' in age_str:
            age_value = int(age_str.split()[0])
            return age_value
        elif 'month' in age_str:
            age_value = int(age_str.split()[0])
            return age_value / 12
    return None

df['Age in Years'] = df['Age'].apply(convert_age_to_years)
df = df.drop(columns=['Age'])

# Apply binning technique for age groups.
bins = [0, 1, 5, 12, 18]
labels = ['Infant', 'Toddler', 'Child', 'Teen']
df['Age Group'] = pd.cut(df['Age in Years'], bins=bins, labels=labels, right=False)

# ED Time of Day Arrival, AM/PM, Business Hours, Off Hours
df['Is_AM'] = pd.to_datetime(df['ED Arrival Time']).dt.hour < 12
df['Is_PM'] = pd.to_datetime(df['ED Arrival Time']).dt.hour > 12
df['Is_Business_Hour'] = df['ED Arrival Time'].apply(lambda x: 8 <= pd.to_datetime(x).hour < 19)

df['Is_AM_Business_Hours'] = df['Is_AM'] & df['Is_Business_Hour']
df['Is_PM_Business_Hours'] = df['Is_PM'] & df['Is_Business_Hour']

# Day of the Week (ED Arrival)
df['Day_of_Week'] = pd.to_datetime(df['ED Arrival Date']).dt.dayofweek
df['Is_Weekend'] = df['Day_of_Week'].apply(lambda x: 1 if x >= 5 else 0)

# Calculate time between patient arrivals
df['Time_Between_Arrivals'] = pd.to_datetime(df['ED Arrival Date']).diff().dt.total_seconds() / 60

# Aggregating daily conversion rates
daily_conversion = df.groupby('ED Arrival Date').agg(
    Total_Arrivals=('MRN', 'count'),
    Ultrasound_Requests=('Ordered Ultrasound Dummified', 'sum')
)

# Calculate Conversion Rate
daily_conversion['Conversion_Rate'] = (daily_conversion['Ultrasound_Requests'] / daily_conversion['Total_Arrivals']) * 100
daily_conversion = daily_conversion.reset_index()

# Merge daily conversion rates back into the original dataframe
df = df.merge(daily_conversion[['ED Arrival Date', 'Conversion_Rate']], on='ED Arrival Date', how='left')

  df['Is_AM'] = pd.to_datetime(df['ED Arrival Time']).dt.hour < 12
  df['Is_PM'] = pd.to_datetime(df['ED Arrival Time']).dt.hour > 12


Train Test Set & Model

In [88]:
# Drop columns that are no longer needed
columns_to_drop = [
    'MRN', 'ED Arrival Time', 'ED Arrival Date', 'ED Patient Initial Assessment Time',
    'Time Ultrasound Ordered', 'Date Ultrasound Ordered', 'Ordered Ultrasound Dummified',
    'Priority Code_P1', 'Priority Code_P2', 'Priority Code_P3', 'Priority Code_P4', 'Chief Complaint',
    'Ultrasound Type_Not Ordered', 'Ultrasound Type_US Abdomen', 'Ultrasound Type_US Abdomen Pelvis',
    'Ultrasound Type_US Lower Extremity Doppler', 'Ultrasound Type_US Pelvic', 'Ultrasound Type_US Scrotal',
    'Ultrasound Type_US Thyroid', 'Ultrasound Type_US Upper Extremity Doppler',
    'Attending Physician_Andrea Greene MD', 'Attending Physician_Andrew Cole MD',
    'Attending Physician_Ashley Serrano MD', 'Attending Physician_Carolyn Klein MD',
    'Attending Physician_Charles Wright MD', 'Attending Physician_Christopher Christensen MD',
    'Attending Physician_Christopher Long MD', 'Attending Physician_Connie Daniels MD',
    'Attending Physician_Gerald Baldwin MD', 'Attending Physician_Jessica Allen MD',
    'Attending Physician_Kari Stewart MD', 'Attending Physician_Kevin Harrison MD',
    'Attending Physician_Kimberly Hopkins MD', 'Attending Physician_Matthew Walter MD',
    'Attending Physician_Melissa Guerra MD', 'Attending Physician_Patrick Burton MD',
    'Attending Physician_Robert Carr MD', 'Attending Physician_Sara Gray MD',
    'Attending Physician_Sonia Mays MD', 'Attending Physician_Susan Lucas MD', 'Month', 'Year', 'ED Arrival Timestamp'
]

# Drop only the columns that exist in the DataFrame
columns_to_drop_existing = [col for col in columns_to_drop if col in df.columns]
df = df.drop(columns=columns_to_drop_existing)

# Handle missing values for numerical columns
for col in df.select_dtypes(include=[np.number]).columns:
    df[col] = df[col].fillna(0)

# Encode categorical variables
df = pd.get_dummies(df, drop_first=True)

# Define features and target variable
features = df.drop(columns=['Conversion_Rate'])
target = df['Conversion_Rate']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Initialize XGBoost Regressor
xg_reg = xgb.XGBRegressor(n_estimators=393, max_depth=20, learning_rate=0.1, random_state=42)

# Train the XGBoost model
xg_reg.fit(X_train, y_train)

# Make predictions
y_pred = xg_reg.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

# # Generator for features
# def feature_generator(X_test_selected, selected_columns):
#     # Generate features for prediction
#     for i, row in enumerate(X_test_selected):
#         feature_set = {selected_columns[j]: row[j] for j in range(len(selected_columns))}
#         yield feature_set

# # usage
# for feature_set in feature_generator(X_test_selected, selected_columns):
#     print(feature_set)  # Print feature set for each row (can be fed into your app)

Mean Absolute Error: 3.14381147876115
Mean Squared Error: 15.874280645588081
R-squared: 0.0032244983599921673


In [89]:
# Save the XGBoost model with joblib compression
model_path = '/Users/hari/Documents/ULTFORM/xgboost_model.pkl'
joblib.dump(xg_reg, model_path)

# Compress the model using gzip compression level 3
compressed_model_path = '/Users/hari/Documents/ULTFORM/xgboost_model_compressed.pkl'
joblib.dump(xg_reg, compressed_model_path, compress=('gzip', 3))

print(f"Model compressed and saved successfully at {compressed_model_path}.")

# Print the columns used for synthetic data generation
required_columns = X_train.columns
print("\nColumns required for synthetic patient generation:")
for column in required_columns:
    print(column)

Model compressed and saved successfully at /Users/hari/Documents/ULTFORM/xgboost_model_compressed.pkl.

Columns required for synthetic patient generation:
Ordered Ultrasound (Y/N)
Priority Code_nan
Ultrasound Type_nan
Attending Physician_nan
Age in Years
Is_AM
Is_PM
Is_Business_Hour
Is_AM_Business_Hours
Is_PM_Business_Hours
Day_of_Week
Is_Weekend
Time_Between_Arrivals
Age Group_Toddler
Age Group_Child
Age Group_Teen
