In [53]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [54]:
#load data
icu_data = pd.read_csv('icu/icustays_cleaned.csv')
diagnosis = pd.read_csv('ed/diagnosis_cleaned.csv')
triage = pd.read_csv('ed/triage_cleaned.csv')
vitals = pd.read_csv('ed/vitals_cleaned.csv')
ed_stays = pd.read_csv('ed/edstays_cleaned.csv')

# Combine the icd_version and icd_code columns in diagnosis table
diagnosis['icd_combined'] = diagnosis['icd_version'].astype(str) + '-' + diagnosis['icd_code'].astype(str)
print(diagnosis.head())

unique_icd_combined_count = diagnosis['icd_combined'].nunique()
print('Number of unique icd_combined:', unique_icd_combined_count)

#print(ed_stays.columns)
#print(icu_data.columns)


   subject_id   stay_id  seq_num icd_code  icd_version  \
0    10000032  32952584        1     4589            9   
1    10000032  32952584        2    07070            9   
2    10000032  32952584        3      V08            9   
3    10000032  39399961        1    78097            9   
4    10000032  39399961        2    34830            9   

                                           icd_title icd_combined  
0                                    HYPOTENSION NOS       9-4589  
1  UNSPECIFIED VIRAL HEPATITIS C WITHOUT HEPATIC ...      9-07070  
2                         ASYMPTOMATIC HIV INFECTION        9-V08  
3                             ALTERED MENTAL STATUS       9-78097  
4                        ENCEPHALOPATHY, UNSPECIFIED      9-34830  
Number of unique icd_combined: 3894


In [55]:
# Function to calculate ranges for specified columns
def calculate_ranges(df, columns):
    ranges = {}
    for col in columns:
        col_data = df[col].replace(0.0, np.nan)  # Ignore values of 0
        ranges[col] = {
            'min': col_data.min(),
            'max': col_data.max()
        }
    return ranges

# Explore ranges in vitals and triage data
cols = ['heartrate', 'resprate', 'o2sat', 'sbp', 'dbp']
vitals_ranges = calculate_ranges(vitals, cols)
print("Ranges in vitals_cleaned:", vitals_ranges)

triage_ranges = calculate_ranges(triage, cols)
print("Ranges in triage_cleaned:", triage_ranges)

# Merge datasets
merged_ed_icu = ed_stays[['stay_id', 'hadm_id']].merge(icu_data[['hadm_id', 'los']], on='hadm_id', how='inner')
merged_ed_icu = merged_ed_icu[['stay_id', 'hadm_id', 'los']]

# Select columns and rename duplicates for clarity
ed_features = pd.merge(
    triage[['stay_id_triage', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp']],
    vitals[['stay_id_vitals', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp']],
    left_on='stay_id_triage', right_on='stay_id_vitals', how='outer'
).rename(columns={'stay_id_triage': 'stay_id'}).drop(columns='stay_id_vitals')

final_data = pd.merge(ed_features, merged_ed_icu, on='stay_id', how='inner')
final_data = pd.merge(final_data, diagnosis[['stay_id', 'icd_combined']], on='stay_id', how='left')

# Impute missing values
def impute_missing_values(df, numeric_features, non_numeric_features):
    # Impute numeric columns using the mean
    numeric_imputer = SimpleImputer(strategy='mean')
    df[numeric_features] = numeric_imputer.fit_transform(df[numeric_features])
    
    # Impute non-numeric columns with a constant value, e.g., 'unknown'
    non_numeric_imputer = SimpleImputer(strategy='constant', fill_value='unknown')
    df[non_numeric_features] = non_numeric_imputer.fit_transform(df[non_numeric_features])
    
    return df

# Define numeric and non-numeric features for imputation
numeric_features = ['heartrate', 'resprate', 'o2sat', 'sbp', 'dbp']
non_numeric_features = ['icd_combined']

# Apply the imputation function
final_data = impute_missing_values(final_data, numeric_features, non_numeric_features)

# Function to calculate and handle outliers
def remove_outliers(df, col_ranges):
    for col, (min_val, max_val) in col_ranges.items():
        df[col] = np.where(df[col] < min_val, min_val, df[col])
        df[col] = np.where(df[col] > max_val, max_val, df[col])
    return df

col_ranges = {
    'heartrate': (30, 180),
    'resprate': (8, 60),
    'o2sat': (50, 100),
    'sbp': (60, 250),
    'dbp': (30, 150),
}
final_data = remove_outliers(final_data, col_ranges)

# Normalize data
id_columns = ['stay_id', 'hadm_id', 'icd_combined']
normalize_cols = ['heartrate', 'resprate', 'o2sat', 'sbp', 'dbp',]

# Separate data into two parts: identifiers and features to normalize
id_data = final_data[id_columns]
features_to_normalize = final_data[normalize_cols]

# Normalize only the numeric columns
scaler = StandardScaler()
normalized_features = scaler.fit_transform(features_to_normalize)
normalized_features_df = pd.DataFrame(normalized_features, columns=normalize_cols)

# Concatenate the identifier columns with the normalized features
final_data_normalized = pd.concat([id_data.reset_index(drop=True), normalized_features_df.reset_index(drop=True)], axis=1)

#  Handle missing values
final_data_normalized['icd_combined'] = final_data_normalized['icd_combined'].fillna('unknown')

# Save the processed data
final_data_normalized.to_csv('preprocessed_data.csv', index=False)
print("Final preprocessed data saved to 'preprocessed_data.csv'")



Ranges in vitals_cleaned: {'heartrate': {'min': np.float64(1.0), 'max': np.float64(705.0)}, 'resprate': {'min': np.float64(0.34), 'max': np.float64(78.0)}, 'o2sat': {'min': np.float64(1.0), 'max': np.float64(972.0)}, 'sbp': {'min': np.float64(2.0), 'max': np.float64(854.0)}, 'dbp': {'min': np.float64(2.0), 'max': np.float64(97100.0)}}
Ranges in triage_cleaned: {'heartrate': {'min': np.float64(14.0), 'max': np.float64(1228.0)}, 'resprate': {'min': np.float64(1.0), 'max': np.float64(189.0)}, 'o2sat': {'min': np.float64(2.0), 'max': np.float64(1004.0)}, 'sbp': {'min': np.float64(1.0), 'max': np.float64(9656.0)}, 'dbp': {'min': np.float64(4.0), 'max': np.float64(9102.0)}}


KeyError: 'stay_id_vitals'