# Configuration

In [53]:
# Subject to work with (use the actual subject #, not a zero-based index)
subject_number = 1

# Fraction of the subject's rows to use for the train and test sets,
# e.g. 0.1 for 10%. Must be strictly between 0 and 1.
subject_data_percentage = 0.1

# Sequence length (in seconds)
sequence_length = 30

# Predictive horizon (in seconds)
predictive_horizon = 5

# Fail fast on settings the rest of the notebook cannot work with.
assert subject_number > 0, "Subject number must be greater than 0."
assert subject_data_percentage > 0, "Subject data percentage must be greater than 0."
# With a fraction of 1 or more the subset is as long as the whole recording,
# which leaves no valid random start row when the subset is drawn further down.
assert subject_data_percentage < 1, "Subject data percentage must be less than 1 (it is a fraction, e.g. 0.1 for 10%)."
assert predictive_horizon > 0, "Predictive horizon must be greater than 0."
assert sequence_length > 0, "Sequence length must be greater than 0."

# Setup

In [54]:
# Imports
import numpy as np
import pandas as pd
import glob
from sklearn.preprocessing import StandardScaler
from scipy import stats
from sklearn.model_selection import train_test_split

In [55]:
# Constants
# Convert the configured durations from seconds into row counts (x100).
# NOTE: the values are rescaled in place, so re-running this cell without first
# re-running the configuration cell multiplies them by 100 again.
predictive_horizon *= 100
sequence_length *= 100

# Data Prep

### Import Data

In [56]:
# Read in files
def subject_id_from_path(filepath):
    """Return the subject number embedded in a data file name, without leading zeros.

    Example: 'data/ProcessedData_Subject03.csv' -> '3'.
    """
    return filepath.split('/')[-1].split('_Subject')[1].split('.')[0].lstrip('0')


def subject_sort_key(filepath):
    """Sort key that orders files by numeric subject number.

    File names whose subject id is not a plain number sort after all numeric ones.
    The path itself is the tie-breaker so the order is fully deterministic.
    """
    subject_id = subject_id_from_path(filepath)
    if subject_id.isdecimal():
        return (0, int(subject_id), filepath)
    return (1, 0, filepath)


# glob.glob returns matches in arbitrary order. Sorting numerically by subject
# means the frames are appended as subject 1, 2, 3, ... so a frame's position
# in data_dfs is predictable (and '10' does not sort before '2').
files = sorted(glob.glob('data/ProcessedData_Subject*.csv'), key=subject_sort_key)

data_dfs = []
subject_found = False

for filepath in files:
    df = pd.read_csv(filepath)

    # Tag every row with the subject number extracted from the path
    current_subject_number = subject_id_from_path(filepath)
    df['Subject Number'] = current_subject_number
    data_dfs.append(df)

    if str(current_subject_number) == str(subject_number): # Early exit condition since we're only running this for 1 subject
        subject_found = True
        break

# Files were read but none of them belongs to the requested subject: stop here
# rather than letting later cells silently work on another subject's data.
if data_dfs and not subject_found:
    raise ValueError(
        f"No data file found for subject {subject_number} "
        f"(looked at {len(files)} file(s) matching 'data/ProcessedData_Subject*.csv')."
    )

# Check
#data_dfs[0].head()

### Clean Data

In [57]:
# Pick the requested subject by its 'Subject Number' tag rather than by list
# position: where a subject sits in data_dfs depends on the order the files
# were read in, so a positional lookup can silently return another subject.
subject_df = next(
    (
        frame for frame in data_dfs
        if len(frame) > 0 and str(frame['Subject Number'].iloc[0]) == str(subject_number)
    ),
    None,
)
if subject_df is None:
    raise ValueError(f"Subject {subject_number} was not found among the loaded data files.")

def remove_outliers(df, columns, threshold=4):
    """Drop rows that contain an outlier in any of the given columns.

    A value is an outlier when its absolute z-score, computed per column, is
    greater than or equal to `threshold`.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.
    columns : list of str
        Numeric columns to check.
    threshold : float, default 4
        Absolute z-score at or above which a value counts as an outlier.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows of `df` without outliers.

    Notes
    -----
    Missing values are ignored when computing each column's mean and standard
    deviation and are never flagged as outliers. A constant column has an
    undefined z-score, so it flags nothing. Without this, one NaN or one
    constant column would turn every z-score into NaN and drop all rows.
    """
    values = df[columns].to_numpy(dtype=float)

    # Division by a zero standard deviation and comparisons against NaN are
    # expected here; they produce NaN / False, which is what we want.
    with np.errstate(invalid='ignore', divide='ignore'):
        z_scores = np.abs(stats.zscore(values, axis=0, nan_policy='omit'))
        is_outlier = (z_scores >= threshold).any(axis=1)

    # .copy() hands the caller an independent frame, so later in-place
    # assignments do not raise SettingWithCopyWarning.
    return df.loc[~is_outlier].copy()

def standardize(df, columns):
    """Return a copy of `df` with the given columns rescaled by StandardScaler.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.
    columns : list of str
        Numeric columns to standardize; all other columns are passed through.

    Returns
    -------
    pandas.DataFrame
        A new frame in which `columns` hold the scaler's output.
    """
    # Work on a copy: writing into the caller's frame mutates shared state and,
    # when that frame is a filtered slice, raises SettingWithCopyWarning.
    standardized = df.copy()
    scaler = StandardScaler()
    standardized[columns] = scaler.fit_transform(standardized[columns])
    return standardized

# Fill missing values with forward fill.
# A new frame is created so the frame stored in data_dfs is left untouched.
subject_df = subject_df.ffill()

# Automatically define columns to process by excluding 'Time [s]' and 'Subject Number'.
# Use subject_df here, not `df`: `df` is only the loop variable left over from
# the file-reading cell and is not guaranteed to be the selected subject.
columns_to_process = subject_df.columns.drop(['Time [s]', 'Subject Number']).tolist()

subject_df_clean = remove_outliers(subject_df, columns_to_process)

subject_df_standardized = standardize(subject_df_clean, columns_to_process)

subject_df = subject_df_standardized

# Check
#subject_df.head()

### Prepare Train / Test Sets

In [58]:
# Rows to skip at the start of the recording (the first 10 seconds of data)
skip_rows = 1000

size = int(len(subject_df) * subject_data_percentage)

# Set a seed for reproducibility
seed = 42
np.random.seed(seed)

# Ensure the random start does not make the slice exceed the DataFrame's length
max_start_index = len(subject_df) - size
if max_start_index <= skip_rows:
    raise ValueError(
        f"Not enough data: a subset of {size} rows cannot start after row {skip_rows} "
        f"in a recording of {len(subject_df)} rows. Lower subject_data_percentage."
    )

# Choose a random start point (skipping the first 10 seconds of data)
start_index = np.random.randint(skip_rows, max_start_index)

subset_df = subject_df.iloc[start_index:start_index + size]

# shuffle=False keeps the rows in their original (time) order: the sequences
# built later are windows of consecutive rows, which only make sense on
# unshuffled data. It also keeps the test rows strictly after the train rows.
train_df, test_df = train_test_split(subset_df, test_size=0.2, shuffle=False)

# reset_index returns a new frame, so the result has to be assigned back.
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Checks
print("Original subject size: ", len(subject_df))
print("Subset size: ", len(subset_df))
print("Train set size: ", len(train_df))
print("Test set size: ", len(test_df))

Original subject size:  117578
Subset size:  11757
Train set size:  9405
Test set size:  2352


# Generate Sequences

In [60]:
feat_cols = columns_to_process # TODO: Not sure if we want to do predictions on all of these columns
target_cols = ['Flow [L/s]']

# Pull the data out of pandas once; slicing a NumPy array inside the loop is
# far cheaper than slicing the DataFrame on every iteration.
feature_values = train_df[feat_cols].to_numpy(dtype=np.float32)
target_values = train_df[target_cols[0]].to_numpy(dtype=np.float32)

# One sequence per start row that still has a full window plus a target row after it.
target_offset = sequence_length + predictive_horizon
n_sequences = max(len(train_df) - target_offset, 0)

# Allocate the float32 result directly instead of going through a list and an
# intermediate object array, which needs many times more memory.
sequence_arrays = np.empty((n_sequences, sequence_length, len(feat_cols)), dtype=np.float32)
for i in range(n_sequences):
    sequence_arrays[i] = feature_values[i:i + sequence_length]

# The target for the window starting at row i is the row at i + target_offset.
# NOTE(review): that is predictive_horizon + 1 rows after the window's last row
# (i + sequence_length - 1); confirm this is the intended horizon.
target_arrays = target_values[target_offset:target_offset + n_sequences]

# Check
sequence_arrays.shape, target_arrays.shape

((5905, 3000, 9), (5905,))

In [None]:
# TODO: Next, generate the spectrogram for each sequence / feature.
# Ideally this happens lazily, i.e. only when a spectrogram is needed to make the next prediction,
# so that we don't have to store all the spectrograms in memory at once.