<a href="https://colab.research.google.com/github/kellerflint/IoT-Final-Project/blob/angelb/data_prep_indivual.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Data preparation: fill missing values, remove outliers, and standardize

In [None]:
import pandas as pd

In [None]:
import os
import glob
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler

def remove_outliers(df, columns, threshold=4):
    """Drop rows in which any of ``columns`` holds a z-score outlier.

    Z-scores are computed per column. A row is removed when the absolute
    z-score of at least one of the selected columns is greater than or
    equal to ``threshold``.

    Undefined z-scores (NaN) are never treated as outliers. They occur for
    missing values and for columns whose values are all identical (zero
    standard deviation). Without this rule a single constant column or a
    single missing value would discard every row of the frame.

    Args:
        df: Input DataFrame. It is not modified.
        columns: List of numeric column names to check.
        threshold: Absolute z-score at or above which a value counts as an
            outlier. Defaults to 4 (a relaxed cut-off).

    Returns:
        A new DataFrame (independent copy) containing only the rows that
        passed the check, with the original index labels preserved.
    """
    if df.empty or len(columns) == 0:
        # Nothing to evaluate; every row is kept.
        return df.copy()

    values = df[columns].to_numpy(dtype=float)

    # nan_policy='omit' computes mean/std from the non-missing values only,
    # so one NaN does not turn the whole column's z-scores into NaN.
    z_scores = np.abs(np.asarray(stats.zscore(values, axis=0, nan_policy='omit')))

    # Comparisons against NaN are False, so undefined z-scores (missing
    # values, constant columns) are not flagged as outliers.
    is_outlier = z_scores >= threshold
    keep = ~is_outlier.any(axis=1)

    # Return a real copy so later column assignment on the result does not
    # raise SettingWithCopyWarning or touch the caller's frame.
    return df[keep].copy()

def standardize(df, columns):
    """Return a copy of ``df`` with ``columns`` scaled by ``StandardScaler``.

    A fresh ``StandardScaler`` is fitted on the selected columns of this
    frame only, and its transformed output replaces those columns. Columns
    not listed are left untouched.

    Args:
        df: Input DataFrame. It is not modified; the scaling is applied to
            a copy, so passing a filtered slice of another frame is safe.
        columns: List of numeric column names to standardize.

    Returns:
        A new DataFrame with the same rows and columns as ``df`` in which
        the selected columns hold the scaler's output. An empty frame or an
        empty column list is returned as an unchanged copy.
    """
    result = df.copy()

    if result.empty or len(columns) == 0:
        # The scaler cannot be fitted on zero samples or zero features.
        return result

    scaler = StandardScaler()
    result[columns] = scaler.fit_transform(result[columns])
    return result

# Batch pre-processing: for every per-subject CSV, fill gaps, drop outlier
# rows, standardize the signal columns and write the result to a new folder.

# Original file pattern (relative to the current working directory)
file_pattern = 'Data/ProcessedData_Subject*.csv'
# glob returns matches in arbitrary order; sort for reproducible runs and logs
files = sorted(glob.glob(file_pattern))

if not files:
    print(f'No input files matched pattern: {file_pattern}')

# New directory where the processed files will be saved
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
new_directory = r"C:\Users\eterp\OneDrive\Documents\USD\IoT\StandardData"
# to_csv does not create missing parent directories, so make sure it exists
os.makedirs(new_directory, exist_ok=True)

for filepath in files:
    df = pd.read_csv(filepath)

    # Fill missing values with forward fill. DataFrame.ffill() replaces the
    # deprecated fillna(method='ffill'); values before the first valid
    # observation in a column stay missing.
    df = df.ffill()

    # Automatically define columns to process by excluding 'Time [s]'
    columns_to_process = df.columns.drop('Time [s]').tolist()

    df_clean = remove_outliers(df, columns_to_process)

    # Nothing to standardize or save if every row was rejected as an outlier
    if df_clean.empty:
        print(f'No data left after outlier removal in file: {filepath}. Skipping standardization and saving.')
        continue

    df_standardized = standardize(df_clean, columns_to_process)

    # Define the new filename based on the original filename
    original_filename = os.path.basename(filepath)
    new_filename = original_filename.replace('ProcessedData_', 'ProcessedStandardizedData_')

    # Construct the new file path
    new_filepath = os.path.join(new_directory, new_filename)

    # Save the processed file to the new directory
    df_standardized.to_csv(new_filepath, index=False)
    print(f'Processed and saved: {new_filepath}')

In [None]:
import glob
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import os

# Plot every standardized per-subject file: one figure per file, one stacked
# subplot per variable, all drawn against the 'Time [s]' column.

# Folder holding the standardized CSV files, and the pattern selecting them
directory_path = r"C:\Users\eterp\OneDrive\Documents\USD\IoT\StandardData"
file_pattern = os.path.join(directory_path, 'ProcessedStandardizedData_Subject*.csv')
files = glob.glob(file_pattern)

# Columns to draw, in top-to-bottom subplot order
variables_to_plot = [
    'Pressure [cmH2O]',
    'Flow [L/s]',
    'V_tidal [L]',
    'Chest [mm]',
    'Abd [mm]',
    'Inspiratory Indicies',
    'Time (Aeration Data)_[s]',
    'Global Aeration',
    'Inspiratory Indicies (Aeration Data)',
]

for filepath in files:
    df = pd.read_csv(filepath)

    # Round-trip the time column through timedelta so it ends up as a
    # float number of seconds
    elapsed = pd.to_timedelta(df['Time [s]'], unit='s')
    df['Time [s]'] = elapsed.dt.total_seconds()

    # One tall figure per file
    plt.figure(figsize=(14, 18))

    # The grid always reserves a row for every requested variable, so a
    # missing column leaves its slot blank
    n_rows = len(variables_to_plot)

    for i, variable in enumerate(variables_to_plot, start=1):
        # Report and skip variables this file does not contain
        if variable not in df.columns:
            print(f"The column {variable} is not present in the file {filepath}.")
            continue

        ax = plt.subplot(n_rows, 1, i)
        ax.plot(df['Time [s]'], df[variable])
        ax.set_title(f'{variable} over Time')
        ax.set_xlabel('Time [s]')
        ax.set_ylabel(variable)

        # Plain (non-scientific, no-offset) tick labels on the time axis
        ax.ticklabel_format(style='plain', axis='x')
        x_formatter = ticker.ScalarFormatter(useOffset=False)
        ax.xaxis.set_major_formatter(x_formatter)
        ax.xaxis.get_major_formatter().set_scientific(False)

    # Keep titles and axis labels of neighbouring subplots from overlapping
    plt.tight_layout()

    # Render the figure for this file
    plt.show()