## This notebook focuses on the second part of the PatientOmics framework:

The aim is to organizing clinical time-series data for analysis and visualization, inwhich we perform the following tasks:

1. Execute a script to generate the initial matrix containing clinical features for 116 patients, specifically on the first day of hospitalization.

2. Visualize the correlation among laboratory features for potential long COVID patients (N = 116) on their first day of hospitalization, by:
        a. Grouping features based on their laboratory families.
        b. Sorting features by their average similarity distances.

3. Create a cluster map to visualize data patterns among potential long COVID patients (N = 116).

4. Structuring this notebook to add the data preparation functions to transform a pandas dataframe into X and y numpy arrays that can be used to create a TSDataset.
    - https://timeseriesai.github.io/tsai/data.preparation.html



https://github.com/tslearn-team/tslearn

https://github.com/timeseriesAI/tsai

https://github.com/timeseriesAI/tsai/blob/main/tutorial_nbs/15_PatchTST_a_new_transformer_for_LTSF.ipynb

## Get the clinical features for all patients at one time point (e.g. 1st day of hospitalization)

In [3]:
## Get the clinical features for all patients at one time point (e.g. 1st day of hospitalization)

import os
import pandas as pd

# Read the list of unique patient IDs
dir_path = '/home/jagh/Documents/01_UB/MultiOmiX/patientomics/data/05_data_exploration/01_preprocessing_116_PLCP/'
filename = os.path.join(dir_path, '116_plcp_mv_pseudoid_pid.csv')
df = pd.read_csv(filename, sep=',', header=0)

# Initialize an empty list to store patient DataFrames
patient_dfs = []

## Set the day to be selected
days_col = 7  

# Iterate through the patient IDs
for patient_id in df['pseudoid_pid']:
    dir_lab_path = '/home/jagh/Documents/01_UB/MultiOmiX/patientomics/data/06_clinical_data/lab_data_features/'
    filename = os.path.join(dir_lab_path, f'patient_{patient_id}.csv')

    try:
        # Read the CSV file into a DataFrame for each patient
        df_lab = pd.read_csv(filename, sep=',', header=0)

        # Get the row features for a specific day in 'days' column (e.g., 'days' == 0)
        df_lab_selected_day = df_lab[df_lab['days'] == days_col].copy()

        # Add a column 'patient_id' into the df_lab_selected_day
        df_lab_selected_day['patient_id'] = patient_id

        # Append the patient DataFrame to the list
        patient_dfs.append(df_lab_selected_day)

    except Exception as e:
    # except FileNotFoundError:
        print(f"Error reading patient {patient_id}: {str(e)}")
        continue

# Concatenate all patient DataFrames into one
collection_df = pd.concat(patient_dfs, ignore_index=True)

# Save the resulting DataFrame as a CSV file
output_csv_path = dir_path + '116_plcp_lab_markers_day-' + str(days_col) +'.csv'
collection_df.to_csv(output_csv_path, index=False)

## Baseline method for data imputation for missing values from a range of days

In [2]:
import os
import pandas as pd

def impute_missing_values(df_lab_selected_day, df_lab_days, columns_to_impute):
    """Function for data imputation using substitution method"""

    imputed_values = []  # Store imputed values
    
    for column in columns_to_impute:
        if not df_lab_selected_day[column].isna().all():
            for index, row in df_lab_selected_day.iterrows():
                if pd.isna(row[column]):
                    # Find the nearest available value from days after the missing day
                    next_day_values = df_lab_days[df_lab_days['days'] > row['days']].sort_values(by='days')
                    if not next_day_values.empty:
                        next_day_value = next_day_values.iloc[0][column]
                        df_lab_selected_day.at[index, column] = next_day_value
                        imputed_values.append(next_day_value)
                    else:
                        # If no values are available from days after, use the nearest available value from days before
                        prev_day_values = df_lab_days[df_lab_days['days'] < row['days']].sort_values(by='days', ascending=False)
                        if not prev_day_values.empty:
                            prev_day_value = prev_day_values.iloc[0][column]
                            df_lab_selected_day.at[index, column] = prev_day_value
                            imputed_values.append(prev_day_value)
    
    return df_lab_selected_day, imputed_values

# Read the list of unique patient IDs
dir_path = '/home/jagh/Documents/01_UB/MultiOmiX/patientomics/data/05_data_exploration/01_preprocessing_116_PLCP/'
filename = os.path.join(dir_path, '116_plcp_mv_pseudoid_pid.csv')
df = pd.read_csv(filename, sep=',', header=0)

# Initialize an empty list to store patient DataFrames
patient_dfs = []

# Set the day range to be selected
init_day = 0
end_day = 20

# Initialize a list to store all imputed values
all_imputed_values = []

# Iterate through the patient IDs
for patient_id in df['pseudoid_pid']:
    dir_lab_path = '/home/jagh/Documents/01_UB/MultiOmiX/patientomics/data/06_clinical_data/lab_data_features/'
    filename = os.path.join(dir_lab_path, f'patient_{patient_id}.csv')

    try:
        # Read the CSV file into a DataFrame for each patient
        df_lab = pd.read_csv(filename, sep=',', header=0)

        # Get the row features from 'df_lab' for the selected day range
        df_lab_days = df_lab[df_lab['days'].between(init_day, end_day)].copy()

        # Get the columns to impute from the dataframe df_lab_days
        columns_to_impute = df_lab_days.columns[2:-1].tolist()

        # Iterate through the selected days
        for selected_day in range(init_day, end_day + 1):
            # Get the row features for a specific day in 'days' column
            df_lab_day_selected = df_lab_days[df_lab_days['days'] == selected_day].copy()

            # Apply data imputation for the selected day
            df_lab_day_selected, imputed_values = impute_missing_values(df_lab_day_selected, df_lab_days, columns_to_impute)
            all_imputed_values.extend(imputed_values)  # Append imputed values to the list

            # Add a column 'patient_id' into the df_lab_day_selected
            df_lab_day_selected['patient_id'] = patient_id

            # Append the patient DataFrame to the list
            patient_dfs.append(df_lab_day_selected)

    except Exception as e:
        print(f"Error reading patient {patient_id}: {str(e)}")
        continue

# Concatenate all patient DataFrames into one
collection_df = pd.concat(patient_dfs, ignore_index=True)

# Save the resulting DataFrame as a CSV file
output_csv_path = os.path.join(dir_path, f'116_plcp_lab_markers_day_Imputed-{init_day}_to_{end_day}.csv')
collection_df.to_csv(output_csv_path, index=False)

# Save the imputed values to a separate file if needed
imputed_values_file = os.path.join(dir_path, f'116_plcp_lab_imputed_values.csv')
pd.Series(all_imputed_values).to_csv(imputed_values_file, index=False, header=['Imputed_Values'])



#######################################################################################################################
######################################################################################################################

## Script for creating TSDataset based on hospitalization days

In [13]:
import os
import pandas as pd

def impute_missing_values(df_lab_selected_day, df_lab_days, columns_to_impute):
    """Function for data imputation using substitution method"""
    
    imputed_values = []  # Store imputed values
    
    for column in columns_to_impute:
        if not df_lab_selected_day[column].isna().all():
            for index, row in df_lab_selected_day.iterrows():
                if pd.isna(row[column]):
                    # Find the nearest available value from days after the missing day
                    next_day_values = df_lab_days[df_lab_days['days'] > row['days']].sort_values(by='days')
                    if not next_day_values.empty:
                        next_day_value = next_day_values.iloc[0][column]
                        df_lab_selected_day.at[index, column] = next_day_value
                        imputed_values.append(next_day_value)
                    else:
                        # If no values are available from days after, use the nearest available value from days before
                        prev_day_values = df_lab_days[df_lab_days['days'] < row['days']].sort_values(by='days', ascending=False)
                        if not prev_day_values.empty:
                            prev_day_value = prev_day_values.iloc[0][column]
                            df_lab_selected_day.at[index, column] = prev_day_value
                            imputed_values.append(prev_day_value)
    
    return df_lab_selected_day, imputed_values

# Read the list of unique patient IDs
dir_path = '/home/jagh/Documents/01_UB/MultiOmiX/patientomics/data/05_data_exploration/01_preprocessing_116_PLCP/03_dataset/'
# filename = os.path.join(dir_path, 'longcovid_patients_pseudoid_pid-train.csv')
# filename = os.path.join(dir_path, 'deceased_patients_pseudoid_pid-train.csv')

# filename = os.path.join(dir_path, 'longcovid_patients_pseudoid_pid-valid.csv')
# filename = os.path.join(dir_path, 'deceased_patients_pseudoid_pid-valid.csv')

# filename = os.path.join(dir_path, 'longcovid_patients_pseudoid_pid-test.csv')
filename = os.path.join(dir_path, 'deceased_patients_pseudoid_pid-test.csv')


df = pd.read_csv(filename, sep=',', header=0)

# Initialize an empty list to store patient DataFrames
patient_dfs = []

# Set the day range to be selected
init_day = 0
end_day = 15

# Initialize a list to store all imputed values
all_imputed_values = []

# Iterate through the patient IDs
for patient_id in df['pseudoid_pid']:
    dir_lab_path = '/home/jagh/Documents/01_UB/MultiOmiX/patientomics/data/06_clinical_data/lab_data_features/'
    filename = os.path.join(dir_lab_path, f'patient_{patient_id}.csv')

    try:
        # Read the CSV file into a DataFrame for each patient
        df_lab = pd.read_csv(filename, sep=',', header=0)

        # Get the row features from 'df_lab' for the selected day range
        df_lab_days = df_lab[df_lab['days'].between(init_day, end_day)].copy()

        # Get the columns to impute from the dataframe df_lab_days
        columns_to_impute = df_lab_days.columns[2:-1].tolist()

        # Iterate through the selected days
        for selected_day in range(init_day, end_day + 1):
            # Get the row features for a specific day in 'days' column
            df_lab_day_selected = df_lab_days[df_lab_days['days'] == selected_day].copy()

            if df_lab_day_selected.empty:
                # Create an empty row with the selected_day and patient_id
                empty_data = {'days': [selected_day], 'patient_id': [patient_id]}
                for col in columns_to_impute:
                    empty_data[col] = [None]
                df_lab_day_selected = pd.DataFrame(empty_data)

            # Apply data imputation for the selected day
            df_lab_day_selected, imputed_values = impute_missing_values(df_lab_day_selected, df_lab_days, columns_to_impute)
            all_imputed_values.extend(imputed_values)  # Append imputed values to the list

            # Add a column 'patient_id' into the df_lab_day_selected
            df_lab_day_selected['patient_id'] = patient_id

            # Append the patient DataFrame to the list
            patient_dfs.append(df_lab_day_selected)

    except Exception as e:
        print(f"Error reading patient {patient_id}: {str(e)}")
        continue

# Concatenate all patient DataFrames into one
collection_df = pd.concat(patient_dfs, ignore_index=True)

# Save the resulting DataFrame as a CSV file
# output_csv_path = os.path.join(dir_path, f'tsdataset_plcp_train-{init_day}_to_{end_day}.csv')
# output_csv_path = os.path.join(dir_path, f'tsdataset_dcp_train-{init_day}_to_{end_day}.csv')

# output_csv_path = os.path.join(dir_path, f'tsdataset_plcp_valid-{init_day}_to_{end_day}.csv')
# output_csv_path = os.path.join(dir_path, f'tsdataset_dcp_valid-{init_day}_to_{end_day}.csv')

# output_csv_path = os.path.join(dir_path, f'tsdataset_plcp_test-{init_day}_to_{end_day}.csv')
output_csv_path = os.path.join(dir_path, f'tsdataset_dcp_test-{init_day}_to_{end_day}.csv')
collection_df.to_csv(output_csv_path, index=False)

## Load the multivariate-time-point matrix and group lab features by their laboratory family


In [14]:
import os
import pandas as pd

## Load the list of laboratory features
dir_path = '/home/jagh/Documents/01_UB/MultiOmiX/patientomics/data/05_data_exploration/01_preprocessing_116_PLCP/03_dataset/'
filename = os.path.join(dir_path, '00_lab_parameter_grouping.csv')
df_lab_features = pd.read_csv(filename, sep=',', header=0)


## Read the list of unique patient IDs
dir_path = '/home/jagh/Documents/01_UB/MultiOmiX/patientomics/data/05_data_exploration/01_preprocessing_116_PLCP/03_dataset/'
# filename = os.path.join(dir_path, 'tsdataset_plcp_train-0_to_15.csv')
# filename = os.path.join(dir_path, 'tsdataset_dcp_train-0_to_15.csv')

# filename = os.path.join(dir_path, 'tsdataset_plcp_valid-0_to_15.csv')
# filename = os.path.join(dir_path, 'tsdataset_dcp_valid-0_to_15.csv')

# filename = os.path.join(dir_path, 'tsdataset_plcp_test-0_to_15.csv')
filename = os.path.join(dir_path, 'tsdataset_dcp_test-0_to_15.csv')

df = pd.read_csv(filename, sep=',', header=0)


## Set the 'df' columns in the order of the 'df_lab_features'
df = df.reindex(columns=df_lab_features['lab_parameter'].tolist())
# print(df.head(10))


################################################
### Transform the 'df' into a time-series dataset
df_copy = df.copy() 

## Transpose the 'df[:, 2:-1]' to get the laboratory features as rows per patient and set the 'patient_id' column as row index
df_transposed = df_copy.melt(id_vars=['patient_id'], value_vars=df_copy.columns[2:-1], var_name='lab_parameter', value_name='lab_value')

## Set all 'lab_value' of the same lab_parameter in on row per patient in a sparse dataframe
df_transposed = df_transposed.groupby(['patient_id', 'lab_parameter'])['lab_value'].apply(list).reset_index()
print(df_transposed.head(10))

## save the resulting DataFrame as a CSV file
# output_csv_path = os.path.join(dir_path, f'tsdataset_plcp_train-0_to_15_ts.csv')
# output_csv_path = os.path.join(dir_path, f'tsdataset_dcp_train-0_to_15_ts.csv')

# output_csv_path = os.path.join(dir_path, f'tsdataset_plcp_valid-0_to_15_ts.csv')
# output_csv_path = os.path.join(dir_path, f'tsdataset_dcp_valid-0_to_15_ts.csv')


# output_csv_path = os.path.join(dir_path, f'tsdataset_plcp_test-0_to_15_ts.csv')
output_csv_path = os.path.join(dir_path, f'tsdataset_dcp_test-0_to_15_ts.csv')

df_transposed.to_csv(output_csv_path, index=False)

   patient_id        lab_parameter  \
0     1000766                 ALAT   
1     1000766                 ASAT   
2     1000766            Basophile   
3     1000766      Bicarbonat Std.   
4     1000766      Bicarbonat akt.   
5     1000766  C-reaktives Protein   
6     1000766          Eosinophile   
7     1000766         Erythrozyten   
8     1000766    Fibrinogen Clauss   
9     1000766            Harnstoff   

                                           lab_value  
0  [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...  
1  [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...  
2  [0.01, 0.01, nan, 0.02, nan, 0.02, 0.03, nan, ...  
3  [nan, 21.8, nan, nan, nan, nan, 17.6, nan, 20....  
4  [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...  
5  [88.0, 98.0, nan, 102.0, nan, 149.0, 103.0, na...  
6  [0.01, 0.01, nan, 0.02, nan, 0.05, 0.04, nan, ...  
7  [3.38, 3.26, nan, 3.24, nan, 3.24, 3.03, nan, ...  
8  [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...  
9  [nan, 24.5, nan, nan, nan, 2