In [5]:
import os
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [2]:
# Root directory that holds the preprocessed data and the trained-model artifacts.
# The KP_MEDINF_DATA_DIR environment variable, when set, overrides the original
# machine-specific default so the notebook can be run from another checkout.
data_path = os.environ.get(
    "KP_MEDINF_DATA_DIR",
    r"/home/jori152b/DIR/horse/jori152b-medinf/KP_MedInf/model_development/data",
)

In [11]:
# Results of the extended-feature 6H run. The .npy file stores a pickled Python
# object, hence allow_pickle=True; .item() unwraps the single stored object so
# it can be indexed by key afterwards.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
results_6h_file = os.path.join(
    data_path,
    'models/aki_stage_X_extended_6H.csv_20240915142850/results.npy',
)
xgb_results = np.load(results_6h_file, allow_pickle=True).item()

In [None]:
# Show the 'average_scores' entry stored for the extended 6H dataset.
average_scores_6h = xgb_results['aki_stage_X_extended_6H.csv']['average_scores']
print(average_scores_6h)

In [None]:
# Descriptive statistics of `df` as produced by DataFrame.describe().
# NOTE(review): `df` is not created by any earlier cell visible here -- it has
# to be present in the kernel already.
stats = df.describe()
print(stats)
# Save to CSV. The index of `stats` carries the statistic names ('count',
# 'mean', 'std', ...), so it must be written out; without it the rows of the
# saved file cannot be told apart.
stats.to_csv('data/analysis/data_preprocessed_extended_stats.csv')

In [None]:
# Per-measurement sampling statistics: how often each measurement column is
# populated and how far apart (in minutes) consecutive populated rows are within
# one ICU stay.
# NOTE(review): `df` is not created by any earlier cell visible here -- it has
# to be present in the kernel already.

# Initialize a dictionary to store results
results = {}

# List of columns to analyze (excluding the datetime and other non-measurement columns)
columns_to_analyze = df.columns.difference(['icustay_id', 'charttime', 'hadm_id', 'subject_id'])

# Gaps between consecutive measurements only make sense in chronological order,
# and `.dt` needs a datetime column. Work on a sorted copy with parsed
# timestamps so `df` itself is left untouched.
df_sorted = df.assign(charttime=pd.to_datetime(df['charttime'])).sort_values('charttime')

# Group by icustay_id (row order inside each group follows df_sorted)
grouped = df_sorted.groupby('icustay_id')

# Iterate through each column to analyze
for column in columns_to_analyze:
    frequencies = []
    mean_time_diffs = []
    std_time_diffs = []
    max_time_diffs = []
    min_time_diffs = []

    # Iterate through each group (each icustay_id)
    for icustay_id, group in grouped:
        # Timestamps of the rows where this measurement is actually present
        valid_times = group.loc[group[column].notna(), 'charttime']

        frequency = len(valid_times)
        if frequency > 1:
            time_diffs = valid_times.diff().dropna().dt.total_seconds() / 60  # in minutes

            mean_time_diff = time_diffs.mean()
            std_time_diff = time_diffs.std()  # NaN when the stay has a single gap
            max_time_diff = time_diffs.max()
            min_time_diff = time_diffs.min()
        else:
            # No gap can be computed from fewer than two measurements. NaN (not
            # None) keeps the aggregation Series numeric.
            mean_time_diff = std_time_diff = max_time_diff = min_time_diff = np.nan

        frequencies.append(frequency)
        mean_time_diffs.append(mean_time_diff)
        std_time_diffs.append(std_time_diff)
        max_time_diffs.append(max_time_diff)
        min_time_diffs.append(min_time_diff)

    # Aggregate the statistics across all icustay_id groups. NaN entries (stays
    # without a computable gap) are skipped by the pandas reductions; the
    # explicit float dtype also covers the case of no groups at all.
    overall_frequency = sum(frequencies)
    overall_mean_time_diff = pd.Series(mean_time_diffs, dtype=float).mean()
    overall_std_time_diff = pd.Series(std_time_diffs, dtype=float).mean()
    overall_max_time_diff = pd.Series(max_time_diffs, dtype=float).max()
    overall_min_time_diff = pd.Series(min_time_diffs, dtype=float).min()

    results[column] = {
        'frequency': overall_frequency,
        'mean_time_diff': overall_mean_time_diff,
        'std_time_diff': overall_std_time_diff,
        'max_time_diff': overall_max_time_diff,
        'min_time_diff': overall_min_time_diff
    }

# Convert results to a DataFrame for better visualization (one row per measurement)
results_df = pd.DataFrame(results).T
print(results_df)

# Optionally, save the results to a CSV file
results_df.to_csv('../data/analysis/measurement_statistics_by_icustay.csv')


In [None]:
# Drop all the columns in string format with one call instead of nine separate
# in-place drops. A single call is atomic: if any listed column is missing,
# a KeyError is raised and X is left unchanged rather than half-modified.
# NOTE(review): `X` is not created by any earlier cell visible here.
string_columns = [
    'first_hosp_stay',
    'first_icu_stay',
    'ethnicity',
    'admittime',
    'dischtime',
    'intime',
    'outtime',
    'dod',
    'charttime',
]
X.drop(columns=string_columns, inplace=True)


In [None]:
def _load_feature_matrix(relative_csv):
    """Read a feature CSV below `data_path` and prepare it for comparison.

    The 'charttime', 'hadm_id' and 'icustay_id' columns are dropped and every
    occurrence of "_mean" is removed from the remaining column names.
    """
    frame = pd.read_csv(os.path.join(data_path, relative_csv), sep=',')
    frame = frame.drop(columns=['charttime', 'hadm_id', 'icustay_id'])
    frame.columns = frame.columns.str.replace('_mean', '')
    return frame


X_original = _load_feature_matrix('preprocessed/X_original.csv')
print(X_original.columns)

X_extended = _load_feature_matrix('preprocessed/X_extended.csv')
print(X_extended.columns)



In [None]:
# Compare the column sets of the two feature matrices in both directions.
original_columns = set(X_original.columns)
extended_columns = set(X_extended.columns)

only_in_original = original_columns - extended_columns
only_in_extended = extended_columns - original_columns

print('difference between original and extended: ', only_in_original)
print('difference between extended and original: ', only_in_extended)

# Columns found only in the extended matrix are the newly added attributes.
new_attributes = list(only_in_extended)
print('new attributes: ', new_attributes)


In [None]:
import matplotlib.pyplot as plt

# Display label for every ethnicity grouping column (columns present only in X_original).
new_column_names = {
    'ethnicity_grouped_white': 'White',
    'ethnicity_grouped_unknown': 'Unknown',
    'ethnicity_grouped_native': 'Native',
    'ethnicity_grouped_black': 'Black',
    'ethnicity_grouped_hispanic': 'Hispanic',
    'ethnicity_grouped_other': 'Other',
    'ethnicity_grouped_asian': 'Asian'
}

# The attribute list is exactly the mapping's keys, in the same order.
original_only_attributes = list(new_column_names)

# Independent copy holding just the ethnicity columns and the target, with readable labels.
X_original_subset = (
    X_original[original_only_attributes + ['aki_stage']]
    .copy()
    .rename(columns=new_column_names)
)

# Correlation of each ethnicity column with aki_stage, strongest positive first.
correlation_with_target = X_original_subset.corr()['aki_stage']
correlations = correlation_with_target.drop('aki_stage').sort_values(ascending=False)

# Bar chart on a fresh 20x10 figure (the Series plot draws on the current axes).
plt.figure(figsize=(20,10))
correlations.plot.bar(color='grey')
plt.title('Correlation of Ethnicity with AKI Stage', fontsize=20)
plt.xlabel('Ethnicity', fontsize=16)
plt.ylabel('Correlation', fontsize=16)

# Slanted x tick labels; the tick_params call afterwards sets the label size
# of both axes to 19, replacing the 14 given here.
plt.xticks(rotation=45, ha='right', fontsize=14)
plt.yticks(fontsize=14)
plt.tick_params(axis='both', which='major', labelsize=19)

# Fit everything into the figure, then reserve extra room at the bottom for
# the rotated labels.
plt.tight_layout()
plt.subplots_adjust(bottom=0.2)

plt.show()

In [None]:
# Attributes that exist only in the extended feature matrix.
extended_only = set(X_extended.columns) - set(X_original.columns)
new_attributes = list(extended_only)

print(X_extended.columns)

# Keep only the new attributes plus the target column, as an independent copy.
selected_columns = new_attributes + ['aki_stage']
X = X_extended[selected_columns].copy()

# Correlation of every new attribute with aki_stage, strongest positive first.
correlation_with_target = X.corr()['aki_stage']
correlations = correlation_with_target.drop('aki_stage').sort_values(ascending=False)

# Bar chart on a fresh 20x10 figure.
# NOTE(review): the title says "all features" although only the new
# attributes are plotted -- confirm which wording is intended.
plt.figure(figsize=(20,10))
correlations.plot.bar(color='grey')
plt.ylabel('Correlation')
plt.title('Correlation of all features with the target variable')

# Slant the x tick labels so long attribute names stay readable.
plt.xticks(rotation=45, ha='right')

# Keep labels inside the figure area.
plt.tight_layout()

plt.show()

In [None]:
# Human-readable display name for each newly added attribute.
name_mapping = {
    'albumin': 'Albumin',
    'lactate': 'Lactate',
    'bands': 'Immature WBCs',
    'height_first': 'Initial Height',
    'inr_max': 'Maximum INR',
    'platelet': 'Platelet Count',
    'bilirubin': 'Bilirubin',
    'ptt': 'Partial Thromboplastin Time',
    'inr': 'INR',
    'weight_first': 'Initial Weight',
    'phosphate': 'Phosphate',
    'uric_acid': 'Uric Acid',
    'pt': 'Prothrombin Time',
    'calcium': 'Calcium'
}

# Renamed copy of the extended matrix; X_extended itself is not modified.
X_renamed = X_extended.rename(columns=name_mapping)

# Translate the attribute list the same way; names without a mapping stay as they are.
new_attributes_renamed = []
for attr in new_attributes:
    new_attributes_renamed.append(name_mapping.get(attr, attr))

# Restrict to the renamed new attributes plus the target column.
selected_columns = new_attributes_renamed + ['aki_stage']
X = X_renamed[selected_columns]

# Correlation of every new attribute with aki_stage, strongest positive first.
correlation_with_target = X.corr()['aki_stage']
correlations = correlation_with_target.drop('aki_stage').sort_values(ascending=False)

# Bar chart on a fresh 20x10 figure.
plt.figure(figsize=(20,10))
correlations.plot.bar(color='grey')
plt.ylabel('Correlation')
plt.title('Correlation of New Features with AKI Stage')

# Slanted x tick labels; the tick_params call afterwards sets the label size
# of both axes to 19, replacing the 14 given here.
plt.xticks(rotation=45, ha='right', fontsize=14)
plt.yticks(fontsize=14)
plt.tick_params(axis='both', which='major', labelsize=19)

# Keep labels inside the figure area.
plt.tight_layout()

plt.show()

In [None]:
import matplotlib.pyplot as plt
import io
import pandas as pd

# Logged ROC-AUC values (columns: Wall time, Step, Value), embedded as CSV text.
data = io.StringIO('''Wall time,Step,Value
1726512293.467726,1,0.7178572416305542
1726512303.369601,2,0.6912493109703064
1726512312.8535783,3,0.7513433694839478
1726512322.4488628,4,0.7845879793167114
1726512332.2577426,5,0.7722629904747009
1726512341.7285247,6,0.7896556258201599
1726512352.015021,7,0.7530167698860168
1726512361.736987,8,0.7810222506523132
1726512371.3558726,9,0.7355340719223022
1726512381.518271,10,0.7800203561782837
1726512391.6949794,11,0.8160804510116577
1726512401.555636,12,0.7953928112983704
1726512411.5994918,13,0.7935769557952881
1726512421.809925,14,0.8084826469421387
1726512431.9026322,15,0.8044590353965759
1726512441.8436167,16,0.8052487373352051
1726512452.1929953,17,0.7796595692634583
1726512462.5377321,18,0.7876774668693542
1726512473.255793,19,0.7989493608474731
1726512483.9579756,20,0.7957519888877869
1726512494.2151084,21,0.8039131760597229''')

# NOTE(review): this rebinds the name `df`, which other cells use for a
# different table -- be careful when running cells out of order.
df = pd.read_csv(data)

# Line plot of the score per step.
plt.figure(figsize=(12, 6))
plt.plot(df['Step'], df['Value'], color='gray', marker='o')

# Titles, axis labels and a light dashed grid.
plt.title('ROC-AUC Score over Epochs', fontsize=16, color='black')
plt.ylabel('ROC-AUC Score', fontsize=12, color='black')
plt.xlabel('Epoch', fontsize=12, color='black')
plt.grid(True, linestyle='--', alpha=0.7, color='lightgray')

# Restrict the y axis to [0.5, 1.0] so differences between scores are easier to see.
plt.ylim(0.5, 1.0)

# Light gray axes background and black ticks.
plt.gca().set_facecolor('#f0f0f0')
plt.tick_params(colors='black')

# Locate the best and the worst score together with the step at which each occurred.
scores = df['Value']
max_score = scores.max()
min_score = scores.min()
max_epoch = df.loc[scores.idxmax(), 'Step']
min_epoch = df.loc[scores.idxmin(), 'Step']

# One annotation per extreme: (label, annotated point, text offset in points, vertical alignment).
annotation_specs = [
    (f'Max: {max_score:.4f}', (max_epoch, max_score), (5, 5), 'bottom'),
    (f'Min: {min_score:.4f}', (min_epoch, min_score), (5, -5), 'top'),
]
for label, point, offset, vertical_alignment in annotation_specs:
    plt.annotate(label, xy=point, xytext=offset,
                 textcoords='offset points', ha='left', va=vertical_alignment,
                 bbox=dict(boxstyle='round,pad=0.5', fc='white', ec='gray', alpha=0.7),
                 arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))

# Final layout pass, then render.
plt.tight_layout()
plt.show()

In [7]:
# Cross-validation results for the XGBoost runs. The file lives below the
# shared data root, so the path is built from `data_path` like the other loads
# instead of repeating the absolute location. This rebinds `xgb_results`.
# NOTE(review): allow_pickle=True unpickles the file -- only load trusted files.
xgb_results = np.load(
    os.path.join(data_path, 'models/xgb_cross_validation_results.npy'),
    allow_pickle=True,
).item()

In [None]:
# Bare expression: shows the currently loaded results object as the cell output.
xgb_results

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Time resolutions for which both an "original" and an "extended" dataset are looked up.
time_points = ['1H', '2H', '4H', '6H', '8H', '12H', '24H']


def _average_scores(variant, metric):
    """Return the averaged `metric` of the `variant` datasets, one value per time point.

    `variant` is 'original' or 'extended'; `metric` is a key of the
    'average_scores' dict stored in `xgb_results` (e.g. 'val_brier').
    """
    return [
        xgb_results[f'aki_stage_X_{variant}_{time_point}.csv']['average_scores'][metric]
        for time_point in time_points
    ]


brier_scores_original = _average_scores('original', 'val_brier')
brier_scores_extended = _average_scores('extended', 'val_brier')
roc_auc_scores_original = _average_scores('original', 'val_roc_auc')
roc_auc_scores_extended = _average_scores('extended', 'val_roc_auc')
accuracy_scores_original = _average_scores('original', 'val_accuracy')
accuracy_scores_extended = _average_scores('extended', 'val_accuracy')

# The style sheet must be active before the figure is created; applied
# afterwards it does not restyle the existing figure and axes.
plt.style.use('grayscale')

# Create the plot: one panel per metric, sharing the time-point axis.
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(12, 10), sharex=True)

# (axes, original scores, extended scores, y label, panel title)
panels = [
    (ax1, brier_scores_original, brier_scores_extended, 'Brier Score', 'Average Brier Scores'),
    (ax2, roc_auc_scores_original, roc_auc_scores_extended, 'ROC-AUC Score', 'Average ROC-AUC Scores'),
    (ax3, accuracy_scores_original, accuracy_scores_extended, 'Accuracy Score', 'Average Accuracy Scores'),
]
for ax, original_scores, extended_scores, ylabel, title in panels:
    ax.plot(time_points, original_scores, marker='o', linestyle='-', color='black', label='Original')
    ax.plot(time_points, extended_scores, marker='s', linestyle='--', color='gray', label='Extended')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()
    ax.grid(True, linestyle=':', alpha=0.7)

# The x axis is shared, so only the bottom panel carries the axis label.
ax3.set_xlabel('Time Points')

plt.tight_layout()
plt.show()

In [None]:
# Read the original feature CSV again and show it as the cell output; the
# resulting frame is not bound to any name.
pd.read_csv(os.path.join(data_path, 'preprocessed/X_original.csv'), sep=',')

In [None]:
# Number of non-NaN entries per attribute (column) of X, shown as the cell output.
X.count(axis=0)

In [4]:
# Each file stores one pickled object inside a NumPy array; .item() unwraps it
# (the original comment describes the result as a dict).
# NOTE(review): both paths are relative to the current working directory.
lstm_archive = np.load('model_development/notebooks/data/results_LSTM.npy', allow_pickle=True)
results_lstm = lstm_archive.item()

xgb_archive = np.load('model_development/notebooks/data/results.npy', allow_pickle=True)
results_xgb = xgb_archive.item()

In [None]:
# Dump both result objects, LSTM first, then XGBoost.
for model_results in (results_lstm, results_xgb):
    print(model_results)

In [None]:
# Datasets come from the LSTM results; the XGBoost results are indexed with
# the same keys below, so they must contain every one of them.
datasets = list(results_lstm.keys())
metrics = ['test_accuracy', 'test_roc_auc', 'test_pr_auc']

# One panel per metric, stacked vertically.
fig, axs = plt.subplots(3, 1, figsize=(12, 18))
fig.suptitle('Comparison of LSTM and XGBoost Models', fontsize=16)

# Bar geometry is the same for every panel: one slot per dataset, two bars per slot.
x = np.arange(len(datasets))
width = 0.35

for ax, metric in zip(axs, metrics):
    lstm_values = [results_lstm[name][metric] for name in datasets]
    xgb_values = [results_xgb[name][metric] for name in datasets]

    # LSTM bar on the left half of the slot, XGBoost bar on the right half.
    ax.bar(x - width/2, lstm_values, width, label='LSTM')
    ax.bar(x + width/2, xgb_values, width, label='XGBoost')

    ax.set_ylabel(metric)
    ax.set_title(f'{metric.capitalize()} Comparison')
    ax.set_xticks(x)
    ax.set_xticklabels(datasets, rotation=45, ha='right')
    ax.legend()

    # Write each value just above its bar (LSTM labels first, then XGBoost).
    for shift, values in ((-width/2, lstm_values), (width/2, xgb_values)):
        for slot, value in enumerate(values):
            ax.text(slot + shift, value, f'{value:.3f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

In [6]:
# Load the pickled contents of data/optimal_features.npy (path relative to the
# working directory).
# NOTE(review): the same name is assigned again from data/feature_importances.npy
# in the following cell, so which file is in effect depends on execution order.
feature_selection_results = np.load('data/optimal_features.npy', allow_pickle=True)

In [8]:
# Load the pickled contents of data/feature_importances.npy (path relative to
# the working directory). This rebinds `feature_selection_results`, replacing
# whatever was loaded under that name before.
feature_selection_results = np.load('data/feature_importances.npy', allow_pickle=True)


In [None]:
# Print the currently loaded feature-selection results in full.
print(feature_selection_results)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# NOTE(review): rebinds `results`, a name another cell uses for different data.
results = feature_selection_results

# Collect, for every entry, its feature count and its two validation scores.
n_features = []
val_roc_auc = []
val_pr_auc = []
for result in results:
    n_features.append(result['n_features'])
    val_roc_auc.append(result['val_roc_auc'])
    val_pr_auc.append(result['val_pr_auc'])

# Two stacked panels: scores vs. feature count on top, importances below.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 16))

# Top panel: ROC AUC (blue) and PR AUC (red) against the number of features.
curves = (
    (val_roc_auc, 'b-o', 'ROC AUC'),
    (val_pr_auc, 'r-o', 'PR AUC'),
)
for scores, line_format, label in curves:
    ax1.plot(n_features, scores, line_format, label=label)
ax1.set(
    xlabel='Number of Features',
    ylabel='AUC Score',
    title='ROC AUC and PR AUC vs Number of Features',
)
ax1.legend()
ax1.grid(True)

# The best entry is the one with the highest validation ROC AUC
# (np.argmax returns the first one on ties).
best_model_index = np.argmax(val_roc_auc)
best_model = results[best_model_index]

# Features of the best entry as stored under 'sorted_importance'; each item
# holds the name at index 0 and the importance at index 1.
# NOTE(review): the previous comment said "top 15", but every stored feature
# is plotted -- no truncation happens here.
top_features = best_model['sorted_importance']
feature_names = []
feature_importance = []
for feature in top_features:
    feature_names.append(feature[0])
    feature_importance.append(feature[1])

# Bottom panel: horizontal bars, first stored feature at the top.
bar_positions = range(len(feature_names))
ax2.barh(bar_positions, feature_importance, align='center')
ax2.set_yticks(bar_positions)
ax2.set_yticklabels(feature_names)
ax2.invert_yaxis()  # Labels read top-to-bottom
ax2.set_xlabel('Feature Importance')
ax2.set_title(f'Top Features (Best model with {best_model["n_features"]} features)')

plt.tight_layout()
plt.show()