# Configuration

In [1]:
# Set the subject to work with (use the actual #, not the index)
subject_number = 1

# Set the percentage of subject data to use for train and test sets.
subject_data_percentage = 0.1

# Sequence Length (in seconds)
sequence_length = 30

# Set the predictive horizon (in seconds)
predictive_horizon = 5

assert subject_number > 0, "Subject number must be greater than 0."
assert subject_data_percentage > 0, "Subject data percentage must be greater than 0."
assert predictive_horizon > 0, "Predictive horizon must be greater than 0."
assert sequence_length > 0, "Sequence length must be greater than 0."

# Setup

In [2]:
# Imports
import numpy as np
import pandas as pd
import glob
from sklearn.preprocessing import StandardScaler
from scipy import stats
from sklearn.model_selection import train_test_split

In [3]:
# Constants
sequence_length = sequence_length * 100
predictive_horizon = predictive_horizon * 100

# Data Prep

### Import Data

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Read in files
files = glob.glob('/content/drive/MyDrive/Data/ProcessedData_Subject*.csv')

data_dfs = []

for filepath in files:
    df = pd.read_csv(filepath)

    # Extract subject number from path
    current_subject_number = filepath.split('/')[-1].split('_Subject')[1].split('.')[0].lstrip('0')
    df['Subject Number'] = current_subject_number
    data_dfs.append(df)

    if str(current_subject_number) == str(subject_number): # Early exit condition since we're only running this for 1 subject
        break

# Check
#data_dfs[0].head()

In [6]:
selected_columns = ['Time [s]', 'Pressure [cmH2O]', 'Flow [L/s]', 'V_tidal [L]', 'Subject Number']

# Create a new list to hold the DataFrames with only the selected columns
selected_data_dfs = []

for df in data_dfs:
    # Select only the specified columns
    selected_df = df[selected_columns]
    selected_data_dfs.append(selected_df)

# Check the first few rows of the first selected DataFrame
print(selected_data_dfs[0].head())


       Time [s]  Pressure [cmH2O]  Flow [L/s]  V_tidal [L] Subject Number
0 -1.000000e-02         -3.400773    0.713827     0.299421              1
1  2.275957e-15         -3.400773    0.713827     0.306559              1
2  1.000000e-02         -3.282765    0.659553     0.313426              1
3  2.000000e-02         -3.400773    0.739471     0.320421              1
4  3.000000e-02         -3.325677    0.739471     0.327816              1


### Clean Data

In [7]:
subject_df = selected_data_dfs[subject_number - 1]  # Assign the DataFrame for the chosen subject

def remove_outliers(df, columns):
    z_scores = np.abs(stats.zscore(df[columns]))
    filtered_entries = (z_scores < 4).all(axis=1)  # Using a threshold of 4
    return df[filtered_entries]

def standardize(df, columns):
    scaler = StandardScaler()
    df.loc[:, columns] = scaler.fit_transform(df[columns])
    return df

# Fill missing values with forward fill
subject_df.ffill(inplace=True)

# Define columns to process, excluding 'Time [s]' and 'Subject Number'
columns_to_process = subject_df.columns.drop(['Time [s]', 'Subject Number']).tolist()

# Remove outliers and standardize
subject_df_clean = remove_outliers(subject_df, columns_to_process)
subject_df_standardized = standardize(subject_df_clean, columns_to_process)

# Update subject_df with the processed DataFrame
subject_df = subject_df_standardized

# To check the first few rows of the processed DataFrame
print(subject_df.head())

       Time [s]  Pressure [cmH2O]  Flow [L/s]  V_tidal [L] Subject Number
0 -1.000000e-02         -2.399169    0.879874    -0.774893              1
1  2.275957e-15         -2.399169    0.879874    -0.770238              1
2  1.000000e-02         -2.353036    0.794601    -0.765760              1
3  2.000000e-02         -2.399169    0.920165    -0.761199              1
4  3.000000e-02         -2.369812    0.920165    -0.756377              1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subject_df.ffill(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, columns] = scaler.fit_transform(df[columns])


### Prepare Train / Test Sets

In [8]:
size = int(len(subject_df) * subject_data_percentage)

# Set a seed for reproducibility
seed = 42
np.random.seed(seed)

# Ensure the random start does not make the slice exceed the DataFrame's length
max_start_index = len(subject_df) - size

# Choose a random start point (skipping the first 10 seconds of data)
start_index = np.random.randint(1000, max_start_index)

subset_df = subject_df.iloc[start_index:start_index + size]

train_df, test_df = train_test_split(subset_df, test_size=0.2)

train_df.reset_index()
test_df.reset_index()

# Checks
print("Original subject size: ", len(subject_df))
print("Subset size: ", len(subset_df))
print("Train set size: ", len(train_df))
print("Test set size: ", len(test_df))

Original subject size:  117960
Subset size:  11796
Train set size:  9436
Test set size:  2360


# Generate Sequences

In [9]:
sequence_arrays = []
target_arrays = []

feat_cols = columns_to_process # TODO: Not sure if we want to do predictions on all of these columns
target_cols = ['Flow [L/s]']

for i in range(0, len(train_df) - sequence_length - predictive_horizon):
    sequence_arrays.append(train_df.iloc[i:i + sequence_length][columns_to_process].values)
    target_arrays.append(train_df.iloc[i + sequence_length + predictive_horizon][target_cols].iloc[0])

# Convert to numpy arrays and floats
sequence_arrays = np.array(sequence_arrays, dtype = object).astype(np.float32)
target_arrays = np.array(target_arrays, dtype = object).astype(np.float32)

# Check
sequence_arrays.shape, target_arrays.shape

((5936, 3000, 3), (5936,))

In [10]:
# Then you'd generate the spectrogram for each sequence / feature.
# It'd be great if this could happen only when needed to make the next prediction
# so that we don't have to store all the spectrograms in memory at once.

# Generate Spectrograms

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import spectrogram

fs = 100  # Sampling frequency

output_directory = "/content/drive/MyDrive/Data/Spectrograms"
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

for seq_index, sequence in enumerate(sequence_arrays):
    plt.figure(figsize=(14, 18))

    for feature_index in range(sequence.shape[1]):
        feature_sequence = sequence[:, feature_index]

        # Computing the spectrogram for the current feature in the sequence
        f, t, Sxx = spectrogram(feature_sequence, fs)

        ax = plt.subplot(sequence.shape[1], 1, feature_index + 1)
        plt.pcolormesh(t, f, 10 * np.log10(Sxx), shading='gouraud')
        ax.set_ylabel('Frequency [Hz]')
        ax.set_yscale('log')
        ax.set_ylim(0.5, 50)  # Adjust based on your data's frequency content
        ax.set_xlabel('Time [s]')
        ax.set_title(f'Spectrogram of Sequence {seq_index + 1}, Feature {feature_index + 1}')

    plt.suptitle(f'Spectrograms for Sequence {seq_index + 1}', fontsize=16, y=1.02)
    plt.tight_layout()

    # Saving the figure with all feature spectrograms for the current sequence
    image_name = f"Spectrograms_Sequence_{seq_index + 1}.png"
    plt.savefig(os.path.join(output_directory, image_name))
    plt.close()


In [1]:
import tensorflow as tf

class SpectrogramDataGenerator(tf.keras.utils.Sequence):
    def __init__(self, spectrogram_paths, labels, batch_size=32, img_size=(128, 128), n_channels=1, shuffle=True):
        self.spectrogram_paths = spectrogram_paths
        self.labels = labels
        self.batch_size = batch_size
        self.img_size = img_size
        self.n_channels = n_channels
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        return int(np.floor(len(self.spectrogram_paths) / self.batch_size))

    def __getitem__(self, index):
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        spectrogram_paths_temp = [self.spectrogram_paths[k] for k in indexes]
        X, y = self.__data_generation(spectrogram_paths_temp)
        return X, y

    def on_epoch_end(self):
        self.indexes = np.arange(len(self.spectrogram_paths))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, spectrogram_paths_temp):
        X = np.empty((self.batch_size, *self.img_size, self.n_channels))
        y = np.empty((self.batch_size), dtype=int)

        for i, spectrogram_path in enumerate(spectrogram_paths_temp):
            img = tf.keras.preprocessing.image.load_img(spectrogram_path, target_size=self.img_size, color_mode="grayscale")
            X[i,] = tf.keras.preprocessing.image.img_to_array(img) / 255.
            y[i] = self.labels[spectrogram_path]

        return X, tf.keras.utils.to_categorical(y, num_classes=len(np.unique(list(self.labels.values()))))


In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, LSTM, Dense, TimeDistributed, Dropout

def create_cnn_lstm_model(input_shape, num_classes):
    model = Sequential()
    model.add(TimeDistributed(Conv2D(32, (3, 3), activation='relu'), input_shape=input_shape))
    model.add(TimeDistributed(MaxPooling2D(2, 2)))
    model.add(TimeDistributed(Flatten()))

    model.add(LSTM(100))
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


In [6]:
import os
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import TimeDistributed, Conv2D, MaxPooling2D, Flatten, LSTM, Dense
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Assuming spectrograms are saved and labels are prepared
output_directory = "/content/drive/MyDrive/Data/Spectrograms"

# Prepare your data generator for training
train_datagen = ImageDataGenerator(rescale=1./255)
train_generator = train_datagen.flow_from_directory(
    output_directory,
    target_size=(128, 128),  # Assuming spectrogram size of 128x128
    color_mode='grayscale',
    batch_size=32,
    class_mode='categorical'  # Assuming more than 2 classes
)

# CNN-LSTM Model
model = Sequential([
    TimeDistributed(Conv2D(32, (3, 3), activation='relu'), input_shape=(None, 128, 128, 1)),
    TimeDistributed(MaxPooling2D(2, 2)),
    TimeDistributed(Flatten()),
    LSTM(64),
    Dense(64, activation='relu'),
    Dense(train_generator.num_classes, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(train_generator, epochs=10)


Found 0 images belonging to 0 classes.


ValueError: Asked to retrieve element 0, but the Sequence has length 0