In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta
from missingpy import MissForest #impute missing value
from sklearn.preprocessing import MinMaxScaler #standardized data
import random
import matplotlib.pyplot as plt

#deep learning package
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

import tensorflow as tf
from tensorflow.keras.layers import Conv2D, Input
from tensorflow.keras.models import Model

import keras
from keras import layers

import preprocessing



In [None]:
fitbit_survey2 = pd.read_csv('fitbit_survey2.csv')
select_survey = pd.read_csv('select_survey.csv')

fitbit_survey2.drop(columns = ['Unnamed: 0'], inplace = True)
select_survey.drop(columns = ['Unnamed: 0'], inplace = True)

Padding

In [10]:
mental_health1 =  select_survey['mental_health_1'].values
mental_health2 =  select_survey['mental_health_2'].values

ids = fitbit_survey2['Id'].unique()
max_short_day = 0
max_long_day = 0
for id in ids:
    individual_data_short = fitbit_survey2[(fitbit_survey2['Id'] == id) & (fitbit_survey2['survey_date'] < 2)]
    individual_data_long = fitbit_survey2[(fitbit_survey2['Id'] == id) & (fitbit_survey2['survey_date'] < 4)]
    if len(individual_data_short) > max_short_day:
        max_short_day = len(individual_data_short)

    if len(individual_data_long) > max_long_day:
        max_long_day = len(individual_data_long)

# List to store padded matrices for each ID
padded_matrices = []

for id_val in ids:
    # Filter data for the current `id_val` and specified condition
    id_data = fitbit_survey2[(fitbit_survey2['Id'] == id_val) & (fitbit_survey2['survey_date'] < 2)]

    max_rows = max_short_day
    col = id_data.shape[1]
    
    # Convert to matrix (numpy array) without 'Id' and 'survey_date' columns
    id_matrix = id_data.drop(columns=['Id', 'survey_date']).values
    
    # Pad each matrix to the target shape (max_rows, max_cols)
    padded_id_matrix = np.pad(id_matrix, ((0, max_rows - id_matrix.shape[0]), (0, col - id_matrix.shape[1])), mode='constant', constant_values=0) 
    #pad the dataset with the same length with 0 and the padding value will be ignored by masking
    
    # Append the padded matrix to the list
    padded_matrices.append(padded_id_matrix) #add 2D matrix to 3D

# Stack all matrices into a single 3D array (number of IDs, max_rows, max_cols)
final_padded_matrix = np.stack(padded_matrices, axis=0) 


# List to store padded matrices for each ID
padded_matrices2 = []

for id_val in ids:
    # Filter data for the current `id_val` and specified condition
    id_data = fitbit_survey2[(fitbit_survey2['Id'] == id_val) & (fitbit_survey2['survey_date'] < 4)]

    max_rows = max_long_day
    col = id_data.shape[1]
    
    # Convert to matrix (numpy array) without 'Id' and 'survey_date' columns
    id_matrix = id_data.drop(columns=['Id', 'survey_date']).values
    
    # Pad each matrix to the target shape (max_rows, max_cols)
    padded_id_matrix = np.pad(id_matrix, ((0, max_rows - id_matrix.shape[0]), (0, col - id_matrix.shape[1])), mode='constant', constant_values=0) 
    #pad the dataset with the same length with 0 and the padding value will be ignored by masking
    
    # Append the padded matrix to the list
    padded_matrices2.append(padded_id_matrix) #add 2D matrix to 3D

# Stack all matrices into a single 3D array (number of IDs, max_rows, max_cols)
final_padded_matrix2 = np.stack(padded_matrices2, axis=0) 

Prepare matrix for training

In [11]:
X_short_data = final_padded_matrix.copy()
X_long_data = final_padded_matrix2.copy()
y_short_data = mental_health1.copy()
y_long_data = mental_health2.copy()

In [12]:
#standardize the data
scaler = MinMaxScaler()

X_reshpaed = X_short_data.reshape(-1, X_short_data.shape[2])
X_standardized = scaler.fit_transform(X_reshpaed)
X_short_standardized = X_standardized.reshape(X_short_data.shape)

X_reshpaed2 = X_long_data.reshape(-1, X_long_data.shape[2])
X_standardized2 = scaler.fit_transform(X_reshpaed2)
X_short_standardized2 = X_standardized2.reshape(X_long_data.shape) #

In [13]:
def train_test_split(dataX, datay, shuffle = True, train_percentage = 0.7, val_percentage = 0.15):
    """
    parameters:
    dataX: the feature data
    datay: the labels
    shuffle: whether to shuffle the data before splitting
    train_percentage: proportion of data to use for the training set
    val_percentage: proportion of data to use for the validation set

    returns
    data for training, testing, and validating
    """
    if shuffle:
        random_indices = np.arange(len(dataX))
        np.random.shuffle(random_indices)
        dataX = dataX[random_indices]
        datay = datay[random_indices]
    
    # Compute split indices
    train_end = int(len(dataX) * train_percentage)
    val_end = train_end + int(len(dataX) * val_percentage)
    
    # Split the data
    train_X, train_y = dataX[:train_end], datay[:train_end]
    val_X, val_y = dataX[train_end:val_end], datay[train_end:val_end]
    test_X, test_y = dataX[val_end:], datay[val_end:]
    
    return train_X, train_y, val_X, val_y, test_X, test_y 

In [14]:
#split data to training, testing, and validating
train_X_data, train_y_data, val_X_data, val_y_data, test_X_data, test_y_data = train_test_split(X_short_standardized, y_short_data) 

Tune hyperparameters for CNN-LSTM model

In [None]:
from model import CNN_LSTM

In [16]:
# Hyperparameter space
hyperparameter_space = {
    'hidden_size': [4, 8, 16, 32, 64],
    'num_layers': [1, 2, 3, 4],
    'learning_rate': [0.01, 0.001, 0.0001],
    'batch_size': [4, 8, 16],
}

# hyperparameter_space = {
#     'embed_size': [16, 32, 64],
#     'conv_input': [8, 16, 32],
#     'input_size': [8, 16, 32],
#     'hidden_size': [32, 64, 128],
#     'num_layers': [1, 2, 3],
#     'batch_size': [16, 32, 64],
#     'learning_rate': [0.01, 0.001, 0.0001]
#     #'num_epochs': [10, 20, 30] #do I need to tune this hyperparameter
# }

#transfer data to torch data
# Convert training data to PyTorch tensors
train_X_new = torch.tensor(train_X_data, dtype=torch.float32)  # Convert train_X to float32 tensor
train_y_new = torch.tensor(train_y_data, dtype=torch.float32)  # Convert train_y to float32 tensor

# Convert testing data to PyTorch tensors
val_X_new = torch.tensor(val_X_data, dtype=torch.float32)    # Convert test_X to float32 tensor
val_y_new = torch.tensor(val_y_data, dtype=torch.float32)    # Convert test_y to float32 tensor

# Generate synthetic data
train_X = train_X_new  # (samples, sequence length, features)
train_y = train_y_new        # Corresponding labels
train_seq_lengths = torch.randint(1, 110, (100,))  # Sequence lengths

val_X = val_X_new
val_y = val_y_new
val_seq_lengths = torch.randint(1, 110, (20,))

# Number of random search trials
num_trials = 10
best_val_loss = float('inf')
best_hyperparams = {}

# Random search loop
for _ in range(num_trials):
    # Randomly sample hyperparameters
    hidden_size = random.choice(hyperparameter_space['hidden_size'])
    num_layers = random.choice(hyperparameter_space['num_layers'])
    learning_rate = random.choice(hyperparameter_space['learning_rate'])
    batch_size = random.choice(hyperparameter_space['batch_size'])

    # Create model with sampled hyperparameters
    model = CNN_LSTM(input_size = 39, hidden_size=hidden_size, num_layers=num_layers, output_size=1)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.BCEWithLogitsLoss()  # For binary classification
    
    # Training loop with current hyperparameters
    num_epochs = 30
    train_losses = []
    test_losses = []

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        # Shuffle training data
        random_indices = np.random.permutation(len(train_X))
        train_X, train_y, train_seq_lengths = train_X[random_indices], train_y[random_indices], train_seq_lengths[random_indices]

        for i in range(0, len(train_X), batch_size):
            batch_X = train_X[i:i + batch_size]
            batch_y = train_y[i:i + batch_size]
            batch_seq_lengths = train_seq_lengths[i:i + batch_size]

            optimizer.zero_grad()
            output = model(batch_X, batch_seq_lengths)
            loss = criterion(output, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_X)
        train_losses.append(avg_loss)

        # Validation step
        model.eval()
        total_test_loss = 0
        with torch.no_grad():
            for i in range(len(val_X)):
                test_X1 = val_X[i].unsqueeze(0)
                test_y1 = val_y[i].unsqueeze(0)
                test_seq_length = val_seq_lengths[i].unsqueeze(0)
                test_output = model(test_X1, test_seq_length)
                test_loss = criterion(test_output, test_y1)
                total_test_loss += test_loss.item()

        avg_test_loss = total_test_loss / len(val_X)
        test_losses.append(avg_test_loss)

    # Check if current hyperparameters are the best
    if avg_test_loss < best_val_loss:
        best_val_loss = avg_test_loss
        best_hyperparams = {
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'learning_rate': learning_rate,
            'batch_size': batch_size
        }

# Print best results
print(f"Best Validation Loss: {best_val_loss}")
print("Best Hyperparameters:", best_hyperparams)



KeyboardInterrupt



Test the model

In [None]:
# Define the CNN+LSTM model class
class CNN_LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(CNN_LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.conv = nn.Conv1d(input_size, output_size, kernel_size=2) #, activation = 'relu' # Set conv input to match feature size (39)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x, seq_lengths):
        # Pass through Conv1d layer
        x = x.permute(0, 2, 1)  # Rearrange for Conv1d: (batch_size, features, seq_len)
        x = self.conv(x)
        
        # Pack the sequence to handle variable lengths
        x = x.permute(0, 2, 1)  # Rearrange back for LSTM: (batch_size, seq_len, features)
        packed_input = pack_padded_sequence(x, seq_lengths, batch_first=True, enforce_sorted=False)
        
        #print(packed_input.shape)
        # Initialize hidden and cell states
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        print(h0.shape)
        print(c0.shape)
        # Pass through LSTM
        packed_output, _ = self.lstm(packed_input, (h0, c0))
        
        # Unpack the sequence
        out, _ = pad_packed_sequence(packed_output, batch_first=True)
        
        # Get the output of the last valid time step
        out = out[torch.arange(out.size(0)), seq_lengths - 1]
        
        # Pass through the fully connected layer
        out = self.fc(out)

        # Apply softmax
        #out = torch.sigmoid(out, dim=1)
        
        return out
        

In [61]:
# Define other parameters and hyperparameters
hidden_size = 32
num_layers = 3
batch_size = 8
learning_rate = 0.01
num_epochs = 30
filters = 64
input_size = 39 #(train_X_new.shape[1], train_X_new.shape[2])  # Feature size, input size is the matrix or only te number of features
output_size = 1  #set to 1 (for a single outcput with a sigmoid activation)

In [71]:
# transfer data to torch data
# Convert training data to PyTorch tensors
train_X_new = torch.tensor(train_X_data, dtype=torch.float32)  # Convert train_X to float32 tensor
train_y_new = torch.tensor(train_y_data, dtype=torch.float32)  # Convert train_y to float32 tensor
train_seq_lengths = torch.tensor([len(seq) for seq in train_X_data])  # Calculate sequence lengths for training data

# Convert testing data to PyTorch tensors
test_X_new = torch.tensor(test_X_data, dtype=torch.float32)    # Convert test_X to float32 tensor
test_y_new = torch.tensor(test_y_data, dtype=torch.float32)    # Convert test_y to float32 tensor
test_seq_lengths = torch.tensor([len(seq) for seq in test_X_data])  # Calculate sequence lengths for test data
# Create the CNN_LSTM model instance
model = CNN_LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, output_size=output_size)

# Define the optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=learning_rate) #betas=(0.5, 0.999)
criterion = nn.BCEWithLogitsLoss()  # For binary classification

train_losses = []
test_losses = []

print("Start Training")

# Training loop
for epoch in range(num_epochs):
    # Shuffle the training data
    random_indices = np.random.permutation(len(train_X_new))
    train_X_shuffled  = train_X_new[random_indices]
    train_y_shuffled  = train_y_new[random_indices]
    train_seq_lengths_shuffled = train_seq_lengths[random_indices]

    model.train()
    total_loss = 0

    for i in range(len(train_X_shuffled)):
        # Select a single sample (batch size = 1)
        train_X1 = train_X_shuffled[i].unsqueeze(0)  # Shape (1, seq_len, features)
        train_y1 = train_y_shuffled[i].unsqueeze(0)  # Shape (1, output_size)
        seq_length = train_seq_lengths_shuffled[i].unsqueeze(0)  # Shape (1, )

        # Forward pass
        optimizer.zero_grad()
        output = model(train_X1, seq_length)#.squeeze() #squeeze to remove extra dimensions
        loss = criterion(output, train_y1)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_X)
    train_losses.append(avg_loss)

    # Validation
    model.eval()
    total_test_loss = 0

    with torch.no_grad():
        for i in range(len(test_X_new)):
            test_X1 = test_X_new[i].unsqueeze(0)
            test_y1 = test_X_new[i].unsqueeze(0)
            test_seq_length = test_seq_lengths[i].unsqueeze(0) 

            test_output = model(test_X1, test_seq_length)#.squeeze()
            test_loss = criterion(test_output, test_y1)
            total_test_loss += test_loss.item()

    avg_test_loss = total_test_loss / len(test_X_new)
    test_losses.append(avg_test_loss)

    # Log the progress
    if epoch % 1 == 0:
        print(f"Epoch [{epoch + 1}/{num_epochs}], Train Loss: {avg_loss:.4f}, Test Loss: {avg_test_loss:.4f}")


Start Training


AttributeError: 'PackedSequence' object has no attribute 'shape'

In [66]:
train_X1.shape

torch.Size([1, 109, 39])

In [67]:
seq_length.shape

torch.Size([1])