In [None]:
# Import necessary libraries
import ast
import pickle
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import mixedlm

In [None]:
# Load the input tables from CSV. Paths are relative to the notebook's working
# directory: most tables are read from Created_Data/, admission_details from
# Queried_Final/.
psi_df = pd.read_csv('Created_Data/psi_df_without_transfer.csv')
psi_count_df = pd.read_csv('Created_Data/psi_count_df.csv')
off_unit_df = pd.read_csv('Created_Data/off_unit_transfers.csv')
pat_nights = pd.read_csv('Created_Data/patient_night_population.csv')
episode_durations = pd.read_csv('Created_Data/episode_durations.csv')
admission_details = pd.read_csv('Queried_Final/admission_details.csv')
luso_epcount = pd.read_csv('Created_Data/luso_episodecount.csv')

In [None]:
# Create a unique index column ('pat_night_index') for each patient night.
# The cell is idempotent: an existing 'pat_night_index' column is dropped first,
# so re-running the cell replaces it instead of adding a second column with the
# same name (which the in-place reset_index + rename did).
pat_nights = (
    pat_nights
    .drop(columns='pat_night_index', errors='ignore')
    .rename_axis('pat_night_index')  # name the row index ...
    .reset_index()                   # ... and turn it into the first column
)

### Table 1: Demographics of patients in the study

In [None]:
# Number of distinct PAT_ENC_CSN_ID values in the full admission_details table
admission_details['PAT_ENC_CSN_ID'].nunique()

In [None]:
# Distinct PAT_ENC_CSN_ID values present in pat_nights (order of first appearance),
# followed by how many there are
unique_pat_encs = pat_nights['PAT_ENC_CSN_ID'].drop_duplicates().tolist()
len(unique_pat_encs)

In [None]:
# Restrict admission_details to the PAT_ENC_CSN_ID values found in pat_nights,
# keeping one row per PAT_ENC_CSN_ID
in_population = admission_details['PAT_ENC_CSN_ID'].isin(unique_pat_encs)
demographics_df = admission_details[in_population].drop_duplicates(subset=['PAT_ENC_CSN_ID'])

# Rows per PAT_ENC_CSN_ID after de-duplication
demographics_df['PAT_ENC_CSN_ID'].value_counts()

In [None]:
# Number of distinct PAT_ID values among the rows kept in demographics_df
demographics_df['PAT_ID'].nunique()

In [None]:
# Keep one row per PAT_ID (the first row found for each value).
# .copy() makes the result an independent frame, so adding a column below does
# not trigger pandas' SettingWithCopyWarning.
demographics_df = demographics_df.drop_duplicates(subset=['PAT_ID']).copy()

# Custom age ranges. right=False makes each bin closed on the left and open on
# the right, e.g. [50, 60) carries the label '50-59'.
bins = [0, 50, 60, 70, 80, float('inf')]
labels = ['<50', '50-59', '60-69', '70-79', '80+']
demographics_df['AGE_RANGE'] = pd.cut(
    demographics_df['PATIENT_AGE_YEARS'], bins=bins, labels=labels, right=False
)
demographics_df

In [None]:
# Count and proportion (fraction of the total) of each gender in the population
gender = demographics_df['GENDER']
values = gender.value_counts()
percents = gender.value_counts(normalize=True)
values, percents

In [None]:
# Count and proportion (fraction of the total) of each race in the population
race = demographics_df['PATIENT_RACE']
values = race.value_counts()
percents = race.value_counts(normalize=True)
values, percents

In [None]:
# Count and proportion (fraction of the total) of each age group in the population
age_group = demographics_df['AGE_RANGE']
values = age_group.value_counts()
percents = age_group.value_counts(normalize=True)
values, percents

In [None]:
# Parse the discharge and admission timestamp columns into datetimes
for time_col in ('HOSP_DISCH_TIME', 'HOSP_ADMSN_TIME'):
    demographics_df[time_col] = pd.to_datetime(demographics_df[time_col])

def count_nights(start, end):
    """Return the number of nights between two timestamps.

    A night is counted for every calendar-date boundary crossed, i.e. the
    result is the difference in days between the two calendar dates; the time
    of day is ignored.

    Args:
        start: Datetime-like object exposing ``.date()``.
        end: Datetime-like object exposing ``.date()``.

    Returns:
        The whole-day difference between ``end`` and ``start`` dates, or 0
        when ``end`` does not fall on a later date than ``start``.
    """
    first_day, last_day = start.date(), end.date()
    if not last_day > first_day:
        return 0
    return (last_day - first_day).days

# Hospital nights for every row, from its admission and discharge timestamps
nights_per_row = demographics_df.apply(
    lambda rec: count_nights(rec['HOSP_ADMSN_TIME'], rec['HOSP_DISCH_TIME']),
    axis=1,
)
demographics_df['hosp_nights'] = nights_per_row

# Hospital-night ranges. right=False makes each bin closed on the left and open
# on the right, e.g. [6, 11) carries the label '6-10'.
# NOTE(review): the first bin starts at 0, so a value of 0 nights is also
# labelled '1-5' - confirm that is intended.
bins = [0, 6, 11, 16, 21, float('inf')]
labels = ['1-5', '6-10', '11-15', '16-20', '21+']
demographics_df['hosp_nights_ranges'] = pd.cut(
    demographics_df['hosp_nights'], bins=bins, labels=labels, right=False
)

# Count and proportion (fraction of the total) of each admission-length range
nights_range = demographics_df['hosp_nights_ranges']
values = nights_range.value_counts()
percents = nights_range.value_counts(normalize=True)
values, percents

In [None]:
# Count and proportion (fraction of the total) of each NIGHTS_ON_51600 value
unit_nights = demographics_df['NIGHTS_ON_51600']
values = unit_nights.value_counts()
percents = unit_nights.value_counts(normalize=True)
values, percents

In [None]:
# Count and proportion (fraction of the total) of each admission diagnosis
admission_dx = demographics_df['ADMISSION_DX_NAME']
values = admission_dx.value_counts()
percents = admission_dx.value_counts(normalize=True)
values, percents

In [None]:
# Admission diagnosis names that are flagged as stroke (STROKE = 1)
stroke_names = [
    'Stroke',
    'ICH (intracerebral hemorrhage)',
    'SAH (subarachnoid hemorrhage)',
    'Cerebrovascular accident (CVA), unspecified mechanism',
    'Subdural hematoma',
    'SDH (subdural hematoma)',
    'Subarachnoid hemorrhage',
    'Intraparenchymal hemorrhage of brain',
    'CVA (cerebral vascular accident)',
    'Vertebral artery dissection',
    'Intracranial hemorrhage',
    'Subdural hemorrhage',
    'Hemorrhagic stroke',
    'Acute ischemic left MCA stroke',
    'Basal ganglia hemorrhage',
    'Intracerebral hemorrhage',
    'Stroke due to embolism of basilar artery',
    'ICAO (internal carotid artery occlusion)',
    'Cerebral hemorrhage',
    'Other left-sided nontraumatic intracerebral hemorrhage',
    'Traumatic subarachnoid hematoma with loss of consciousness',
    'Occipital stroke',
    'Nontraumatic intracerebral hemorrhage, unspecified cerebral location, unspecified laterality',
    'Intraventricular hemorrhage',
    'Traumatic subarachnoid hemorrhage with loss of consciousness of 30 minutes or less, initial encounter',
    'Traumatic right-sided intracerebral hemorrhage with loss of consciousness, initial encounter',
    'Intraparenchymal hematoma of brain, left, with unknown loss of consciousness status, initial encounter',
    'Cerebellar hemorrhage',
    'Intraparenchymal hematoma of brain without loss of consciousness, unspecified laterality, initial encounter',
    'ICAO (internal carotid artery occlusion), left',
    'Thalamic hemorrhage',
    'Traumatic subdural hemorrhage with loss of consciousness of 30 minutes or less, initial encounter',
    'Intraparenchymal hematoma of brain, left, without loss of consciousness, initial encounter',
]

# Binary STROKE column: 1 when ADMISSION_DX_NAME exactly equals one of the
# names above, 0 otherwise
is_stroke = admission_details['ADMISSION_DX_NAME'].isin(stroke_names)
admission_details['STROKE'] = is_stroke.astype(int)
admission_details

### Results: Statistics from the interruption-count algorithm

In [None]:
# Display describe() summary statistics for the PSI count table (psi_count_df)
psi_count_df.describe()

In [None]:
# Display describe() summary statistics for the LUSO and interruptive episode
# count table (luso_epcount)
luso_epcount.describe()

In [None]:
# Get frequency and percentage of the number of PSIs in an interruptive episode
def _episode_sizes(grouped_times):
    """Return the number of PSIs in each episode of one GROUPED_TIMES cell.

    luso_epcount is loaded with read_csv, which returns a list-valued column as
    text. Iterating that text walks over single characters, so every "episode"
    would be counted with length 1. Text cells are therefore parsed back into
    nested lists before counting.

    NOTE(review): assumes the stored text is a plain Python literal (nested
    lists of strings/numbers) - confirm against the CSV. literal_eval raises
    ValueError on anything else, which is preferable to silently miscounting.
    """
    if isinstance(grouped_times, float) and np.isnan(grouped_times):
        return []  # missing cell: no episodes to count
    if isinstance(grouped_times, str):
        grouped_times = ast.literal_eval(grouped_times)
    return [len(episode) for episode in grouped_times]


luso_epcount['psis_in_episode'] = luso_epcount['GROUPED_TIMES'].apply(_episode_sizes)
combined_values = [size for sizes in luso_epcount['psis_in_episode'] for size in sizes]
value_counts = Counter(combined_values)
value_counts_dict = dict(value_counts)

# Create dataframe, ordered by the number of PSIs in an episode
value_counts_df = pd.DataFrame(
    sorted(value_counts_dict.items()),
    columns=['PSIs in an Episode', 'Frequency'],
)

# Add percentage column (share of all episodes, formatted with two decimals)
total_count = value_counts_df['Frequency'].sum()
value_counts_df['Percentage'] = (
    (value_counts_df['Frequency'] / total_count * 100).map('{:.2f}%'.format)
)

value_counts_df

### Results: Number of PSIs by type

In [None]:
# Number of rows per PSI_TYPE value, most frequent first
psi_df['PSI_TYPE'].value_counts()

In [None]:
# Share of each PSI_TYPE as a percentage string with one decimal place
value_counts_normalized = psi_df['PSI_TYPE'].value_counts(normalize=True)
formatted_percentages = (value_counts_normalized * 100).map('{:.1f}'.format)
formatted_percentages

### Results: Interruptive Episode Count & LUSO Distributions

In [None]:
# Hour:minute label for each LUSO minute range
LUSO_RANGE_TO_HOURS = {
    '0-59': '0:00-0:59',
    '60-119': '1:00-1:59',
    '120-179': '2:00-2:59',
    '180-239': '3:00-3:59',
    '240-299': '4:00-4:59',
    '300-359': '5:00-5:59',
    '360-419': '6:00-6:59',
    '420-420': '7:00',
}

# LUSO_RANGE is created by the "Splitting LUSO into bins" cell further down the
# notebook. Derive it here with the same bins when it is not present yet, so
# this cell also works when the notebook is run top to bottom on a fresh kernel.
if 'LUSO_RANGE' not in luso_epcount.columns:
    luso_bin_edges = [0, 60, 120, 180, 240, 300, 360, 420, 421]
    luso_bin_labels = [
        f'{low}-{high - 1}' for low, high in zip(luso_bin_edges, luso_bin_edges[1:])
    ]
    luso_epcount['LUSO_RANGE'] = pd.cut(
        luso_epcount['LUSO'], bins=luso_bin_edges, labels=luso_bin_labels, right=False
    )

# Map every range to its label; anything without a known range stays 'Unknown'.
# astype(object) lets the lookup and the fill work for categorical and string
# columns alike.
luso_epcount['LUSO_hours'] = (
    luso_epcount['LUSO_RANGE']
    .astype(object)
    .map(LUSO_RANGE_TO_HOURS)
    .fillna('Unknown')
)
luso_epcount

In [None]:
# Frequency of each LUSO_hours label
df = luso_epcount['LUSO_hours'].value_counts()
df2 = pd.DataFrame({'Range': df.index, 'Count': df.values})

# Leading hour of each label ('3:00-3:59' -> 3). Labels that do not start with
# a number, such as 'Unknown', become NaN instead of raising and sort last.
leading_hour = df2['Range'].str.split('-').str[0].str.split(':').str[0]
df2['Numerical Value'] = pd.to_numeric(leading_hour, errors='coerce')

# Sort the DataFrame in ascending order of the leading hour, then drop the
# helper column and display the result
df_sorted = df2.sort_values(by='Numerical Value').drop(columns=['Numerical Value'])
df_sorted


In [None]:
# Frequency of each NUM_EPISODES value as a two-column table
df = luso_epcount['NUM_EPISODES'].value_counts()

epcount = df.rename_axis('Range').reset_index(name='Count')
epcount

### Interruptive Episode and LUSO Duration Analysis

In [None]:
# Display describe() summary statistics for episode_durations
episode_durations.describe()

In [None]:
# Frequency of each distinct row (combination of column values) in episode_durations
episode_durations.value_counts()

In [None]:
# Create the empirical CDF of interruptive episode duration
sorted_data = (
    episode_durations['Interruptive Episode Duration'].dropna().sort_values()
)

# Empirical CDF: the i-th smallest duration has cumulative proportion i / n.
# (Dividing the running sum of durations by their total would instead give the
# share of total duration, not the share of episodes.)
n_episodes = len(sorted_data)
cumulative = np.arange(1, n_episodes + 1) / n_episodes

plt.plot(sorted_data, cumulative, marker='o', linestyle='-', color='blue')
plt.xlabel('Duration')
plt.ylabel('Cumulative proportion of episodes')
plt.title('Distribution of Interruptive Episode Duration')
plt.show()

In [None]:
# Split LUSO into bins. right=False makes each bin closed on the left and open
# on the right, so the label of [low, high) is 'low-(high-1)'; the last bin
# [420, 421) holds only the value 420.
bin_edges = [0, 60, 120, 180, 240, 300, 360, 420, 421]
bin_labels = [f'{low}-{high - 1}' for low, high in zip(bin_edges[:-1], bin_edges[1:])]
luso_epcount['LUSO_RANGE'] = pd.cut(
    luso_epcount['LUSO'],
    bins=bin_edges,
    labels=bin_labels,
    right=False,
)
luso_epcount

In [None]:
# Number of rows in each LUSO_RANGE bin
luso_epcount['LUSO_RANGE'].value_counts()

### Results: Pearson's Correlation

In [None]:
# Keep only the identifier and the two columns used for the correlation
correlation_columns = ['PAT_ENC_CSN_ID', 'NUM_EPISODES', 'LUSO']
luso_epcount = luso_epcount[correlation_columns]
luso_epcount

In [None]:
# Mean of every remaining column per PAT_ENC_CSN_ID, with the id kept as a column
grouped_df = luso_epcount.groupby('PAT_ENC_CSN_ID', as_index=False).mean()
grouped_df

In [None]:
def add_trendline(x, y, ax, color):
    """Draw the least-squares straight line through (x, y) on ``ax``.

    Args:
        x: Sequence of x values.
        y: Sequence of y values, same length as ``x``.
        ax: Object with a matplotlib-style ``plot`` method.
        color: Line colour passed to ``ax.plot``.

    Returns:
        The degree-1 polynomial coefficients from ``np.polyfit``, ordered
        ``[slope, intercept]``.
    """
    coefficients = np.polyfit(x, y, deg=1)
    fitted_y = np.polyval(coefficients, x)
    ax.plot(x, fitted_y, color=color)
    return coefficients

def calculate_correlation(x, y):
    """Return the Pearson correlation coefficient between ``x`` and ``y``.

    The value is the off-diagonal entry of the 2x2 matrix returned by
    ``np.corrcoef``.
    """
    return np.corrcoef(x, y)[0, 1]

# Scatter plot of the per-PAT_ENC_CSN_ID means: episode count (x) against LUSO (y)
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(grouped_df['NUM_EPISODES'], grouped_df['LUSO'], color='blue', label='Episode Count vs. LUSO')

# Least-squares trend line and Pearson correlation for the same pair of columns
trendline1 = add_trendline(grouped_df['NUM_EPISODES'], grouped_df['LUSO'], ax, 'blue')
corr1 = calculate_correlation(grouped_df['NUM_EPISODES'], grouped_df['LUSO'])

# Title and axis labels name the plotted quantities so the figure can be read
# on its own (both axes were previously labelled 'Values').
# NOTE(review): LUSO is labelled in minutes because the LUSO ranges elsewhere
# in this notebook map 60-119 to 1:00-1:59 - confirm the unit.
ax.set_title('Mean Interruptive Episode Count vs. Mean LUSO')
ax.set_xlabel('Mean episode count (NUM_EPISODES)')
ax.set_ylabel('Mean LUSO (minutes)')
ax.legend()

# Display the plot
plt.show()


In [None]:
# Pearson correlation between the per-PAT_ENC_CSN_ID means of NUM_EPISODES and LUSO
print(corr1)

### Regression Analysis

In [None]:
# Load the regression input table and parse NIGHT_START as a datetime
regression = pd.read_csv(r'regression_los_stroke.csv').assign(
    NIGHT_START=lambda frame: pd.to_datetime(frame['NIGHT_START'])
)

In [None]:
# Fit the mixed-effects model: LUSO regressed on the fixed-effect terms listed
# in the formula, with the rows grouped by PAT_ENC_CSN_ID.
# (pandas and statsmodels are imported in the notebook's import cell.)
model = mixedlm(
    "LUSO ~ VITALS +  NEUROS + MEDS + BEDSIDE_TESTING  + OFF_UNIT_TESTING + ON_UNIT_TRANSFER + Days + STROKE",
    regression,
    groups=regression["PAT_ENC_CSN_ID"],
)
result = model.fit()
print(result.summary())

In [None]:
# Variance components of the fitted mixed model
group_variance = result.cov_re.iloc[0, 0]  # first entry of the random-effects covariance
residual_variance = result.scale           # residual variance

# ICC: share of the total variance attributable to the grouping factor
total_variance = group_variance + residual_variance
icc = group_variance / total_variance
print(f"Intraclass Correlation Coefficient (ICC): {icc}")
