<a href="https://colab.research.google.com/github/humb3rt84/UT/blob/main/MLDL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Load the Required Data
import pandas as pd
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Mount Google Drive
drive.mount('/content/drive', force_remount=True)

# Load ICUSTAYS dataset
icustays_df = pd.read_csv('/content/drive/My Drive/MIMICfull/ICUSTAYS.csv')
print("ICUSTAYS Columns:")
print(icustays_df.columns)
print(icustays_df.head())

# Load ADMISSIONS dataset
admissions_df = pd.read_csv('/content/drive/My Drive/MIMICfull/ADMISSIONS.csv')
print("ADMISSIONS Columns:")
print(admissions_df.columns)
print(admissions_df.head())

# Load NOTEEVENTS dataset (loading only a subset for initial exploration)
noteevents_df = pd.read_csv('/content/drive/My Drive/MIMICfull/NOTEEVENTS.csv', low_memory=False)
print("NOTEEVENTS Columns:")
print(noteevents_df.columns)
print(noteevents_df.head())

Mounted at /content/drive
ICUSTAYS Columns:
Index(['row_id', 'subject_id', 'hadm_id', 'icustay_id', 'dbsource',
       'first_careunit', 'last_careunit', 'first_wardid', 'last_wardid',
       'intime', 'outtime', 'los'],
      dtype='object')
   row_id  subject_id  hadm_id  icustay_id dbsource first_careunit  \
0       1           2   163353      243653  carevue           NICU   
1       2           3   145834      211552  carevue           MICU   
2       4           5   178980      214757  carevue           NICU   
3       6           7   118037      278444  carevue           NICU   
4       7           7   118037      236754  carevue           NICU   

  last_careunit  first_wardid  last_wardid               intime  \
0          NICU            56           56  2138-07-17 21:20:07   
1          MICU            12           12  2101-10-20 19:10:11   
2          NICU            56           56  2103-02-02 06:04:24   
3          NICU            56           56  2121-05-23 15:35:29   
4

In [None]:
# Load ICUSTAYS dataset
# icustays_df = pd.read_csv('/content/drive/My Drive/MIMICfull/ICUSTAYS.csv')
num_records_icustays = icustays_df.shape[0]
print(f"Number of records loaded for icustays_df: {num_records_icustays}")

# Load ADMISSIONS dataset
# admissions_df = pd.read_csv('/content/drive/My Drive/MIMICfull/ADMISSIONS.csv')
num_records_admissions = admissions_df.shape[0]
print(f"Number of records loaded for admissions_df: {num_records_admissions}")

# Step 2: Merge ICUSTAYS with ADMISSIONS on subject_id and hadm_id
merged_icustays_admissions = pd.merge(icustays_df, admissions_df, on=['subject_id', 'hadm_id'], how='inner')
num_records_merged_icustays_admissions = merged_icustays_admissions.shape[0]
print(f"Number of records loaded for merged_icustays_admissions: {num_records_merged_icustays_admissions}")


Number of records loaded for icustays_df: 28391
Number of records loaded for admissions_df: 26836
Number of records loaded for merged_icustays_admissions: 28391


In [None]:
# Step 3: Filter CHARTEVENTS for Relevant ITEMIDs and Merge

# Load and filter CHARTEVENTS dataset more efficiently
filtered_chartevents = pd.read_csv('/content/drive/My Drive/MIMICsubset/CHARTEVENTS.csv', usecols=['subject_id', 'icustay_id', 'itemid', 'charttime', 'valuenum'])
num_records_filtered_chartevents = filtered_chartevents.shape[0]
print(f"Number of records loaded for admissions_df: {num_records_filtered_chartevents}")

# Ensure that subject_id and icustay_id are of the same type in both DataFrames
merged_icustays_admissions['subject_id'] = merged_icustays_admissions['subject_id'].astype(int)
merged_icustays_admissions['icustay_id'] = merged_icustays_admissions['icustay_id'].astype(float)
filtered_chartevents['subject_id'] = filtered_chartevents['subject_id'].astype(int)
filtered_chartevents['icustay_id'] = filtered_chartevents['icustay_id'].astype(float)

# Merge the filtered CHARTEVENTS with merged_icustays_admissions on subject_id and icustay_id
merged_with_chartevents = pd.merge(merged_icustays_admissions, filtered_chartevents, on=['subject_id', 'icustay_id'], how='inner')
num_records_merged_with_chartevents = merged_with_chartevents.shape[0]
print(f"Number of records loaded for merged_with_chartevents: {num_records_merged_with_chartevents}")


Number of records loaded for admissions_df: 758355
Number of records loaded for merged_with_chartevents: 359882


In [None]:
# Step 4.1: Aggregate Vital Signs
# Aggregate vital sign values for each ICU stay (e.g., average and max values)
aggregated_vitals = merged_with_chartevents.groupby('icustay_id').agg(
    mean_valuenum=('valuenum', 'mean'),
    max_valuenum=('valuenum', 'max'),
    min_valuenum=('valuenum', 'min')
).reset_index()
print("Aggregated Vital Signs:")
print(aggregated_vitals.head())

num_records_aggregated_vitals = aggregated_vitals.shape[0]
print(f"Number of records loaded for aggregated_vitals: {num_records_aggregated_vitals}")


Aggregated Vital Signs:
   icustay_id  mean_valuenum  max_valuenum  min_valuenum
0    201006.0      73.526690   2000.000000         -10.0
1    203766.0      87.808804   5948.720215         -17.0
2    204201.0      52.911634    285.000000           0.4
3    204881.0      64.937683    270.000000           0.3
4    206504.0      63.693601    246.000000           1.0
Number of records loaded for aggregated_vitals: 56


In [None]:
import re

# Define a list of keywords related to delirium that we want to identify in clinical notes
delirium_keywords = ["delirium", "confusion", "disorientation", "agitation", "hallucination", "altered mental status"]

# Create a function to check if any delirium-related keywords are present in the text
def identify_delirium(text):
    if isinstance(text, str):
        for keyword in delirium_keywords:
            if re.search(r'\b' + keyword + r'\b', text, re.IGNORECASE):
                return 1  # Label as 1 if any keyword is found
    return 0  # Label as 0 otherwise

# Apply the function to create a 'delirium_label' column
noteevents_df['delirium_label'] = noteevents_df['text'].apply(identify_delirium)
#print("NOTEEVENTS with Delirium Label:")
#print(noteevents_df[['subject_id', 'hadm_id', 'text', 'delirium_label']].head())

num_records_noteevents_df = noteevents_df.shape[0]
print(f"Number of records loaded for noteevents_df: {num_records_noteevents_df}")


Number of records loaded for noteevents_df: 880107


In [None]:
# Step 4.3: Merge Aggregated Vitals and Clinical Notes
# Merge the aggregated vitals with the merged ICUSTAYS and ADMISSIONS dataframe
merged_features = pd.merge(merged_icustays_admissions, aggregated_vitals, on='icustay_id', how='left')

# Merge with NOTEEVENTS to include delirium labels
final_merged_df = pd.merge(merged_features, noteevents_df[['subject_id', 'hadm_id', 'delirium_label']], on=['subject_id', 'hadm_id'], how='left')

num_records_final_merged_df = final_merged_df.shape[0]
print(f"Number of records loaded for final_merged_df: {num_records_final_merged_df}")

Number of records loaded for final_merged_df: 952027


In [None]:
# Step 5: Data Cleaning
import pandas as pd

# Assuming final_merged_df is properly defined as a DataFrame above this code

# Step 5.1: Impute missing vital signs with their respective means
final_merged_df.loc[:, 'mean_valuenum'] = final_merged_df['mean_valuenum'].fillna(final_merged_df['mean_valuenum'].mean())
final_merged_df.loc[:, 'max_valuenum'] = final_merged_df['max_valuenum'].fillna(final_merged_df['max_valuenum'].mean())
final_merged_df.loc[:, 'min_valuenum'] = final_merged_df['min_valuenum'].fillna(final_merged_df['min_valuenum'].mean())

# Step 5.2: Impute missing delirium labels with 0 (assuming no delirium)
final_merged_df.loc[:, 'delirium_label'] = final_merged_df['delirium_label'].fillna(0)

# Step 5.3: Impute categorical missing values with 'Unknown'
final_merged_df.loc[:, 'language'] = final_merged_df['language'].fillna('Unknown')
final_merged_df.loc[:, 'religion'] = final_merged_df['religion'].fillna('Unknown')
final_merged_df.loc[:, 'marital_status'] = final_merged_df['marital_status'].fillna('Unknown')

# Step 5.4: Fill missing deathtime values with 0 (indicating survival)
final_merged_df.loc[:, 'deathtime'] = final_merged_df['deathtime'].fillna(0)

# Step 5.5: Drop rows with missing 'outtime' or 'los' if they exist in DataFrame
missing_columns = [col for col in ['outtime', 'los'] if col in final_merged_df.columns]
if missing_columns:
    final_merged_df = final_merged_df.dropna(subset=missing_columns)

# Step 5.6: Impute missing 'diagnosis' values with 'Unknown'
final_merged_df.loc[:, 'diagnosis'] = final_merged_df['diagnosis'].fillna('Unknown')

# Step 5.7: Dropping unnecessary columns if they exist in DataFrame
unnecessary_columns = [col for col in ['edregtime', 'edouttime'] if col in final_merged_df.columns]
if unnecessary_columns:
    final_merged_df = final_merged_df.drop(columns=unnecessary_columns)

# Ensuring final_merged_df is still a DataFrame and not accidentally assigned to a method
assert isinstance(final_merged_df, pd.DataFrame), "final_merged_df must be a DataFrame"

print(final_merged_df.info())
print(final_merged_df.head())

delirium_count = final_merged_df[final_merged_df['delirium_label'] == 1].shape[0]
print(f"Number of rows with delirium = 1: {delirium_count}")


<class 'pandas.core.frame.DataFrame'>
Index: 951647 entries, 0 to 952026
Data columns (total 31 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   row_id_x              951647 non-null  int64  
 1   subject_id            951647 non-null  int64  
 2   hadm_id               951647 non-null  int64  
 3   icustay_id            951647 non-null  float64
 4   dbsource              951647 non-null  object 
 5   first_careunit        951647 non-null  object 
 6   last_careunit         951647 non-null  object 
 7   first_wardid          951647 non-null  int64  
 8   last_wardid           951647 non-null  int64  
 9   intime                951647 non-null  object 
 10  outtime               951647 non-null  object 
 11  los                   951647 non-null  float64
 12  row_id_y              951647 non-null  int64  
 13  admittime             951647 non-null  object 
 14  dischtime             951647 non-null  object 
 15  death

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Step 6: Extract Features for Modeling
# Extract relevant features for modeling, including the delirium label
modeling_features_df = final_merged_df[['icustay_id', 'subject_id', 'mean_valuenum', 'max_valuenum', 'min_valuenum', 'delirium_label']]

# Drop duplicates to ensure no redundant data
modeling_features_df = modeling_features_df.drop_duplicates(subset=['icustay_id'])

# Step 7: Data Splitting for Model Training
# Define features (X) and target (y)
X = modeling_features_df.drop(columns=['subject_id', 'icustay_id', 'delirium_label'])
y = modeling_features_df['delirium_label']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 8: Model Training - Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")
rf_model.fit(X_train, y_train)

# Step 9: Model Evaluation
# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy}")
print("Classification Report:")
print(classification_report(y_test, y_pred))



Model Accuracy: 0.01620574246961423
Classification Report:
              precision    recall  f1-score   support

         0.0       0.90      0.00      0.00      5593
         1.0       0.01      0.99      0.03        84

    accuracy                           0.02      5677
   macro avg       0.46      0.49      0.02      5677
weighted avg       0.89      0.02      0.00      5677



In [None]:
!pip install scikit-learn==1.2.2 imbalanced-learn==0.10.1



In [None]:
from imblearn.over_sampling import SMOTE

# Step 6: Extract Features for Modeling
# Extract relevant features for modeling, including the delirium label
modeling_features_df = final_merged_df[['icustay_id', 'subject_id', 'mean_valuenum', 'max_valuenum', 'min_valuenum', 'delirium_label']]

# Drop duplicates to ensure no redundant data
modeling_features_df = modeling_features_df.drop_duplicates(subset=['icustay_id'])

# Step 7: Data Splitting for Model Training
# Define features (X) and target (y)
X = modeling_features_df.drop(columns=['subject_id', 'icustay_id', 'delirium_label'])
y = modeling_features_df['delirium_label']

# Step 7.1: Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split the resampled data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Step 8: Model Training - Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Step 9: Model Evaluation
# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Model Accuracy: 0.5001340842048807
Classification Report:
              precision    recall  f1-score   support

         0.0       0.50      1.00      0.67      5577
         1.0       0.95      0.00      0.01      5610

    accuracy                           0.50     11187
   macro avg       0.72      0.50      0.34     11187
weighted avg       0.73      0.50      0.34     11187



In [None]:
!pip install xgboost
!pip install imbalanced-learn

Collecting xgboost
  Downloading xgboost-2.1.2-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)
Downloading xgboost-2.1.2-py3-none-manylinux_2_28_x86_64.whl (153.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.9/153.9 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl (199.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.0/199.0 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.23.4 xgboost-2.1.2
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.3/258.3 kB[0m 

In [None]:
!pip install xgboost
!pip install imbalanced-learn

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import RandomizedSearchCV

# Step 6: Extract Features for Modeling
# Use all available features except 'subject_id' and 'icustay_id' as they are identifiers
modeling_features_df = final_merged_df.drop(columns=['subject_id', 'icustay_id'])

# Drop duplicates to ensure no redundant data
modeling_features_df = modeling_features_df.drop_duplicates()

# Step 7: Encoding Categorical Features
# Convert categorical columns to numeric using one-hot encoding
modeling_features_df = pd.get_dummies(modeling_features_df, drop_first=True)

# Convert numeric columns to float32 to save memory
for col in modeling_features_df.select_dtypes(include=['float64']).columns:
    modeling_features_df[col] = modeling_features_df[col].astype('float32')

# Step 8: Data Splitting for Model Training
X = modeling_features_df.drop(columns=['delirium_label'])
y = modeling_features_df['delirium_label']

# Feature Selection to reduce dimensionality
k = 20  # Choose a suitable number of features based on available resources
selector = SelectKBest(score_func=f_classif, k=k)
X = selector.fit_transform(X, y)

# Applying SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Step 9: Hyperparameter Tuning - XGBoost Classifier
xgb_model = XGBClassifier(random_state=42, eval_metric='logloss')

# Define parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Use RandomizedSearchCV to reduce memory usage
random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_grid, n_iter=10, cv=3, n_jobs=-1, verbose=0, scoring='accuracy', random_state=42)
random_search.fit(X_train, y_train)

# Best parameters from random search
best_params = random_search.best_params_
print(f"Best Parameters: {best_params}")

# Step 10: Model Evaluation
# Train the best model with the entire training set
best_xgb_model = random_search.best_estimator_
y_pred = best_xgb_model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy}")
print("Classification Report:")
print(classification_report(y_test, y_pred))



Best Parameters: {'subsample': 1.0, 'n_estimators': 300, 'max_depth': 3, 'learning_rate': 0.1, 'colsample_bytree': 1.0}
Model Accuracy: 0.7722545390445972
Classification Report:
              precision    recall  f1-score   support

         0.0       0.74      0.83      0.79      5669
         1.0       0.81      0.71      0.76      5677

    accuracy                           0.77     11346
   macro avg       0.78      0.77      0.77     11346
weighted avg       0.78      0.77      0.77     11346

