In [5]:
import os
import pandas as pd

if os.getcwd().endswith('notebooks'):
    os.chdir('..')

def load_data_from_directory(directory_path):
    """Loads CSV files from a given directory, adding a 'subject' column based on filename."""
    dataframes = []
    for filename in os.listdir(directory_path):
        if filename.endswith(".csv"):
            subject = int(filename.split('-')[0])
            file_path = os.path.join(directory_path, filename)
            df = pd.read_csv(file_path)
            df['subject'] = subject
            dataframes.append(df)
    return pd.concat(dataframes, ignore_index=True)

def split_data(df, train_ratio=0.8):
    """Splits the data into training and validation sets based on a given ratio."""
    train_rows = int(train_ratio * len(df))
    df_train = df.iloc[:train_rows, :]
    df_val = df.iloc[train_rows:, :]
    return df_train, df_val

def process_data(data):
    """Processes the data by modifying timestamp, filling missing values, and interpolating."""
    data['5minute_intervals_timestamp'] = pd.to_datetime('1970-01-01') + pd.to_timedelta(data['5minute_intervals_timestamp'] * 5, unit='m')
    cleaned_data = data.drop(columns=['finger', 'hr'])
    cleaned_data['carbInput'] = cleaned_data['carbInput'].fillna(0)
    cleaned_data['bolus'] = cleaned_data['bolus'].fillna(0)
    cleaned_data['basal'] = cleaned_data.groupby('subject')['basal'].transform(lambda x: x.fillna(x.median()))
    cleaned_data['gsr'] = cleaned_data['gsr'].fillna(0)
    cleaned_data['cbg'] = cleaned_data['cbg'].interpolate('linear')
    return cleaned_data

def save_data(df, filename):
    """Saves a DataFrame to a CSV file."""
    df.to_csv(filename, index=False)

def main():
    # Load and process training and validation data
    train_data = load_data_from_directory('data/train')
    train_df, val_df = split_data(train_data)

    train_processed = process_data(train_df)
    val_processed = process_data(val_df)

    save_data(train_processed, 'data/train_processed.csv')
    save_data(val_processed, 'data/val_processed.csv')

    # Load and process test data
    test_data = load_data_from_directory('data/test')
    test_processed = process_data(test_data)

    save_data(test_processed, 'data/test_processed.csv')

    # Print shapes as a quick check
    print("Training DataFrame shape:", train_processed.shape)
    print("Validation DataFrame shape:", val_processed.shape)
    print("Test DataFrame shape:", test_processed.shape)

# Run the main function if the script is executed
if __name__ == "__main__":
    main()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['5minute_intervals_timestamp'] = pd.to_datetime('1970-01-01') + pd.to_timedelta(data['5minute_intervals_timestamp'] * 5, unit='m')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['5minute_intervals_timestamp'] = pd.to_datetime('1970-01-01') + pd.to_timedelta(data['5minute_intervals_timestamp'] * 5, unit='m')


Training DataFrame shape: (122431, 8)
Validation DataFrame shape: (30608, 8)
Test DataFrame shape: (35909, 8)
