In [None]:
# Install the third-party packages this notebook imports into the kernel's environment.
# NOTE(review): versions are unpinned, so results may drift between runs — consider pinning.
%pip install pandas
%pip install numpy
%pip install scikit-learn

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

In [None]:
# Load the two CSV inputs that sit next to the notebook.
subset = pd.read_csv("second_model_subset.csv")
features = pd.read_csv("second_models_features.csv")

In [None]:
# Folder (relative to the notebook) that holds the chest x-ray CSV files.
path = 'chest-xrays/'
segmented = pd.read_csv(f"{path}CXLSeg-segmented.csv")
xray = pd.read_csv(f"{path}CXLSeg-metadata.csv")



In [None]:
def date_format(date):
    """Turn a compact ``YYYYMMDD`` value into an ISO ``YYYY-MM-DD`` string.

    Parameters
    ----------
    date : int, float or str
        Date written as eight digits. Anything after a decimal point (for
        example the ``.0`` produced when the value is a float) is discarded.

    Returns
    -------
    str
        The date with dashes inserted after the year and after the month.
    """
    # Strip the fractional part first so 20180105.0 does not leak ".0" into
    # the day field.
    digits = str(date).split(".")[0]
    return f"{digits[:4]}-{digits[4:6]}-{digits[6:]}"

def time_format(time):
    """Turn a compact ``HHMMSS`` value into an ``HH:MM:SS`` string.

    Parameters
    ----------
    time : int, float or str
        Time of day written as up to six digits. Anything after a decimal
        point is discarded and missing leading zeros are restored.

    Returns
    -------
    str
        The zero-padded time with colons between hours, minutes and seconds.

    Raises
    ------
    ValueError
        If the part before the decimal point is longer than six characters.
    """
    whole = str(time).split(".")[0]
    # zfill pads on the left up to six characters and leaves longer strings
    # untouched, so over-long input is rejected explicitly below instead of
    # being padded forever in a loop.
    padded = whole.zfill(6)
    if len(padded) != 6:
        raise ValueError(
            f"time value {time!r} has more than six characters before the decimal point"
        )
    return f"{padded[:2]}:{padded[2:4]}:{padded[4:6]}"

def convert_datetime(input_date):
    """Parse an ISO-formatted date/time string into a ``datetime`` object."""
    parsed = datetime.fromisoformat(input_date)
    return parsed





In [None]:
# Spot-check a single raw value by position (row 32971) before the datetime conversion.
subset['suspected_infection_time'].iat[32971]

In [None]:
# Parse the three cohort timestamp columns into pandas datetimes.
for ts_col in ("admittime", "dischtime", "suspected_infection_time"):
    subset[ts_col] = pd.to_datetime(subset[ts_col])

# Build a single study timestamp from the separate date and time columns:
# format each part as text, join them with a space, then parse the result.
xray = xray.assign(
    formatted_date=xray["StudyDate"].apply(date_format),
    formatted_time=xray["StudyTime"].apply(time_format),
    studytime=lambda frame: (
        frame["formatted_date"] + " " + frame["formatted_time"]
    ).apply(convert_datetime),
)

In [None]:
# Boolean mask that is True where the suspected-infection timestamp is missing.
pd.isna(subset['suspected_infection_time'])

In [None]:


# Whole-day difference between the suspected-infection date and the admission date.
infection_day_gap = (
    subset['suspected_infection_time'].dt.normalize() - subset['admittime'].dt.normalize()
).dt.days
# Keep rows whose gap is non-negative, plus rows where the gap is undefined (NaN).
subset = subset[(infection_day_gap >= 0) | infection_day_gap.isna()]

In [None]:
# Days from admission date to suspected-infection date; -1 marks rows where
# the difference could not be computed.
subset['days'] = (
    (subset['suspected_infection_time'].dt.normalize() - subset['admittime'].dt.normalize())
    .dt.days
    .fillna(-1)
)
subset

In [None]:
# Keep only the x-ray columns needed downstream so the join stays small.
xray_merge = xray[["subject_id", "study_id", "ViewPosition", "studytime"]]
# Pair every cohort row with every study recorded for the same subject.
merging = subset.merge(xray_merge, on="subject_id")
# Keep the pairs whose study time falls inside the admission window (bounds inclusive).
matched_dates = merging[
    (merging["studytime"] >= merging["admittime"]) & (merging["studytime"] <= merging["dischtime"])
].reset_index(drop=True)
# Build the per-image table. .assign() returns a new frame, so no column is
# written into a slice of `segmented`; assigning columns on such a slice is what
# raises pandas' SettingWithCopyWarning.
segmented_merged = (
    segmented[["subject_id", "study_id", "dicom_id", "DicomPath", "No Finding"]]
    # Missing "No Finding" is filled with -1 and the sign is then flipped, so
    # Abnormal is 1 where "No Finding" was missing and the negated value otherwise.
    .assign(Abnormal=lambda frame: frame["No Finding"].fillna(-1) * -1)
    .drop(columns=["No Finding"])
)
# Attach the image rows to the matched studies and keep the modelling columns.
complete_merged = matched_dates.merge(segmented_merged, on=["subject_id", "study_id"])[
    [
        "subject_id", "hadm_id", "stay_id", "study_id",
        "admittime", "dischtime", "days", "studytime", "ViewPosition",
        "dicom_id", "DicomPath", "Abnormal", "los",
        "chronic_pulmonary_disease", "sepsis3",
    ]
]

complete_merged

In [None]:
# Re-read the feature table and discard rows that have no subject_id.
features = pd.read_csv('second_models_features.csv').dropna(subset=['subject_id'])

In [None]:
# sub = features[features['subject_id']==16192578.0]
# sub.sort_values(by = 'charttime', ascending = False)

# sub.merge(sub.groupby(['subject_id', 'hadm_id', 'stay_id'])[['charttime']].max().reset_index(), on = 'charttime').columns
# Number of columns in the feature table.
features.shape[1]

In [None]:
# recents = features.groupby(['subject_id', 'hadm_id', 'stay_id'])[['charttime']].max().reset_index()
# features.merge(recents, on = ['subject_id', 'hadm_id', 'stay_id', 'charttime'])
# For each (subject_id, hadm_id, stay_id) group keep the last row after sorting
# by charttime, then discard the old row labels.
recents = (
    features
    .sort_values(['subject_id', 'hadm_id', 'stay_id', 'charttime'])
    .groupby(['subject_id', 'hadm_id', 'stay_id'])
    .tail(1)
    .reset_index()
    .drop(columns='index')
)
recents.iloc[0]

In [None]:
# Columns averaged within each (subject_id, hadm_id, stay_id) group.
measurement_cols = [
    'heart_rate', 'sbp', 'sbp_ni', 'mbp', 'mbp_ni', 'resp_rate', 'temperature',
    'platelet', 'wbc', 'bands', 'lactate', 'inr', 'ptt', 'creatinine', 'bilirubin',
]
means = (
    features
    .groupby(['subject_id', 'hadm_id', 'stay_id'])[measurement_cols]
    .mean()
    .reset_index()
)
means.iloc[0]

In [None]:
# Fill the gaps in each stay's most recent row with that stay's mean for the
# same column. The frames are matched explicitly on the stay keys:
# combine_first() aligns on the row index, and both inputs carry a fresh
# 0..n-1 index, so it only pairs the right rows when both frames happen to
# list the stays in the same order and number.
stay_keys = ['subject_id', 'hadm_id', 'stay_id']
mean_cols = [col for col in means.columns if col not in stay_keys]
with_means = recents.merge(means, on=stay_keys, how='left', suffixes=('', '_stay_mean'))
for col in mean_cols:
    with_means[col] = with_means[col].fillna(with_means[f'{col}_stay_mean'])
feat_squeeze = with_means.drop(columns=[f'{col}_stay_mean' for col in mean_cols])

# Fraction of non-missing values per column (not rendered: only the last
# expression of a cell is displayed).
feat_squeeze.notna().mean()

feat_squeeze

In [None]:
# Print a summary of the feature table: column names, dtypes and non-null counts.
features.info()

In [None]:
# Attach the per-stay features to every matched image (images without features
# are kept), then keep the first row for each dicom_id.
full_data = (
    complete_merged
    .merge(feat_squeeze, on=['subject_id', 'hadm_id', 'stay_id'], how='left')
    .drop_duplicates(subset='dicom_id')
)


In [None]:
# Model inputs plus the 'days' column the target is derived from.
feats = [
    'Abnormal', 'bilirubin', 'creatinine', 'heart_rate', 'inr', 'mbp',
    'platelet', 'ptt', 'resp_rate', 'sbp', 'wbc', 'days',
]

# Complete cases only: a row with a missing value in any selected column is dropped.
complete_cases = full_data[feats].dropna()
X = complete_cases.drop(columns='days')
# The day count is stringified as the class label; values strictly above 3
# collapse into the single label '3+'.
y = complete_cases['days'].apply(lambda day: '3+' if day > 3 else str(day))


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score



In [None]:



# One-hot encode the day bucket so every class becomes its own binary target column.
y = pd.get_dummies(full_data[feats].dropna()['days'].apply(lambda x: '3+' if x > 3 else str(x)))

# Split BEFORE scaling: fitting the scaler on all rows would let the test rows
# influence the mean/variance used to transform the training rows (data leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the scaling statistics from the training rows only, then apply them to both parts.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept so that any code still expecting the fully scaled matrix keeps working.
X_scaled = scaler.transform(X)

# One independent logistic regression per target column.
log_reg = LogisticRegression(max_iter=1000)
multi_target_model = MultiOutputClassifier(log_reg)
multi_target_model.fit(X_train, y_train)

# Predict on the held-out rows.
y_pred = multi_target_model.predict(X_test)

# With a multi-column indicator target, accuracy_score is subset accuracy:
# a row counts as correct only when every target column matches.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

In [None]:
# full_data.columns
# full_data[['subject_id', 'hadm_id', 'stay_id_x', 'study_id', 'admittime',
#        'dischtime', 'studytime', 'ViewPosition', 'dicom_id', 'charttime']]

In [None]:
# sample[sample['stay_id_x']==30000646]

# sample[sample['stay_id_x']==30000646]['study_id'].value_counts()

# (sample[sample['study_id']==55490538]['charttime']-sample[sample['study_id']==55490538]['studytime']).sort_values(ascending=False)

In [None]:
# sample['days'] = abs(sample['studytime']-sample['charttime'])

# sample

In [None]:
# for category, group in sample.groupby(['hadm_id', 'study_id']):
#     print(f"Category: {category}")
#     print(group, "\n")

# sample.merge(sample.groupby(['hadm_id', 'study_id'])[['days']].reset_index(), on = 'days')

In [None]:
# sample.groupby(['hadm_id', 'study_id'])[['days']].min().reset_index()