In [None]:
import pandas as pd
import re
import numpy as np
# Notebook-wide pandas display settings:
# show every column of the (wide) WRIC frames, but truncate long frames so
# that displaying them does not flood the cell output.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 5)


In [None]:
def check_code(code, manual, R1_metadata, R2_metadata):
    """Determine the per-room codes that are used as output filename prefixes.

    Parameters
    ----------
    code : str
        "id" uses the "Subject ID" metadata field, "id+comment" uses
        "Subject ID" and "Comments" joined with an underscore, and "manual"
        uses the values given in ``manual``.
    manual : sequence or None
        Two entries, the first for the subject in room 1 and the second for
        the subject in room 2, e.g. ``['1234_visit1', '5678_visit1']``.
        It is also used when ``code`` is not "id"/"id+comment" but ``manual``
        is provided.
    R1_metadata, R2_metadata : pandas.DataFrame
        Single-row metadata frames of room 1 and room 2.

    Returns
    -------
    tuple
        ``(code_1, code_2)`` for room 1 and room 2.

    Raises
    ------
    ValueError
        If ``code`` is not a valid option, or if a manual code is requested
        but ``manual`` does not provide two entries.
    """
    if code == "id":
        code_1 = R1_metadata["Subject ID"].iloc[0]
        code_2 = R2_metadata["Subject ID"].iloc[0]
    elif code == "id+comment":
        code_1 = R1_metadata["Subject ID"].iloc[0] + '_' + R1_metadata["Comments"].iloc[0]
        code_2 = R2_metadata["Subject ID"].iloc[0] + '_' + R2_metadata["Comments"].iloc[0]
    elif code == "manual" or manual is not None:
        manual_error = "You have tried to enter a manual code (this is the filename that the metadata and data will be saved as). Please make sure your manual code is a list, e.g: ['1234_visit1', '5678_visit1'], where the first entry is for subject in room 1 and the second entry for subject in room 2."
        # A plain string is indexable too, but would silently yield its first
        # two characters as codes, so it is rejected explicitly.
        if isinstance(manual, str):
            raise ValueError(manual_error)
        try:
            code_1 = manual[0]
            code_2 = manual[1]
        except (TypeError, IndexError, KeyError) as e:
            # None, scalars, or sequences with fewer than two entries end up
            # here; fail loudly instead of returning undefined codes.
            raise ValueError(manual_error) from e
    else:
        raise ValueError("The value for the code parameter is not valid. Please choose id, id+comment or manual. Default is id.")

    return code_1, code_2

In [None]:
def extract_meta_data(lines, code, manual, save_csv, path_to_save):
    """Build one single-row metadata DataFrame per room from the file header.

    The four lines ``lines[3:7]`` are split on tabs. The first and third of
    them provide the field names (their first entry is skipped), the second
    and fourth provide the matching values.
    NOTE(review): presumably the skipped first entry is a room label --
    confirm against the WRIC file format.

    Parameters
    ----------
    lines : list of str
        All lines of the WRIC text file.
    code, manual
        Passed on to ``check_code`` to derive the filename codes.
    save_csv : bool
        If truthy, each metadata frame is written to
        ``<code>_WRIC_metadata.csv``.
    path_to_save : str or None
        Directory prefix for the CSV files; if falsy the files are written
        relative to the current working directory.

    Returns
    -------
    tuple
        ``(code_1, code_2, R1_metadata, R2_metadata)``.
    """
    split_rows = []
    for raw_line in lines[3:7]:
        split_rows.append(raw_line.strip().split('\t'))

    # Key rows (0 and 2) are paired with the value rows that follow them.
    room1_fields = dict(zip(split_rows[0][1:], split_rows[1]))
    room2_fields = dict(zip(split_rows[2][1:], split_rows[3]))

    R1_metadata = pd.DataFrame([room1_fields])
    R2_metadata = pd.DataFrame([room2_fields])

    code_1, code_2 = check_code(code, manual, R1_metadata, R2_metadata)

    if not save_csv:
        return code_1, code_2, R1_metadata, R2_metadata

    # Same target names as "<path>/<code>_WRIC_metadata.csv", or just the
    # bare filename when no path is given.
    folder_prefix = f'{path_to_save}/' if path_to_save else ''
    target_room1 = f'{folder_prefix}{code_1}_WRIC_metadata.csv'
    target_room2 = f'{folder_prefix}{code_2}_WRIC_metadata.csv'
    R1_metadata.to_csv(target_room1, index=False)
    R2_metadata.to_csv(target_room2, index=False)

    return code_1, code_2, R1_metadata, R2_metadata

In [None]:
def open_file(filepath):
    """Read a WRIC text file and return its lines.

    Parameters
    ----------
    filepath : str
        Path to the ``.txt`` file.

    Returns
    -------
    list of str
        All lines of the file (line endings included).

    Raises
    ------
    TypeError
        If ``filepath`` does not end in ``.txt`` (case-insensitive).
    FileNotFoundError
        If ``filepath`` does not lead to a file.
    ValueError
        If the file is empty or its first line does not start with
        "OmniCal software".
    """
    if not filepath.lower().endswith('.txt'):
        raise TypeError("The file must be a .txt file.")

    try:
        with open(filepath, "r") as file:
            lines = file.readlines()
    except FileNotFoundError as e:
        # Re-raise with a friendlier message instead of printing and then
        # failing later on an undefined ``lines`` variable.
        raise FileNotFoundError(
            f"The filepath you provided does not lead to a file: {filepath}"
        ) from e

    if not lines or not lines[0].startswith("OmniCal software"):
        raise ValueError("The provided file is not the WRIC data file.")

    return lines

In [None]:
def create_wric_df(filepath, lines, save_csv, code_1, code_2, path_to_save):
    """Load the measurement table of a WRIC file and split it by room.

    Parameters
    ----------
    filepath : str
        Path to the WRIC text file (read again with ``pandas.read_csv``).
    lines : list of str
        Lines of the same file, used to locate where the data table starts.
    save_csv : bool
        If truthy, each room frame is written to ``<code>_WRIC_data.csv``.
    code_1, code_2
        Filename prefixes for room 1 and room 2.
    path_to_save : str or None
        Directory prefix for the CSV files; if falsy the files are written
        relative to the current working directory.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_room1, df_room2)`` holding the columns whose name contains
        "R1" and "R2" respectively.

    Raises
    ------
    ValueError
        If no line starts with "Room 1 Set 1", if the number of columns does
        not match the expected layout, or if the Date/Time columns of the
        measurement sets disagree within a row.
    """
    # The table (header row included) starts on the line after the
    # "Room 1 Set 1" marker.
    data_start_index = next(
        (i + 1 for i, line in enumerate(lines) if line.startswith("Room 1 Set 1")),
        None,
    )
    if data_start_index is None:
        raise ValueError(
            "Could not find the 'Room 1 Set 1' line that marks the start of the data table."
        )

    df = pd.read_csv(filepath, sep="\t", skiprows=data_start_index)
    # Completely empty columns (separators between the Room/Set groups) are removed.
    df = df.dropna(axis=1, how='all')

    # Column layout: the same 16 fields repeated for R1_S1, R2_S1, R1_S2, R2_S2.
    columns = [
        "Date", "Time", "VO2", "VCO2", "RER", "FiO2", "FeO2", "FiCO2", "FeCO2",
        "Flow", "Activity Monitor", "Energy Expenditure (kJ/min)", "Energy Expenditure (kcal/min)",
        "Pressure Ambient", "Temperature", "Relative Humidity"
    ]
    df.columns = [
        f"{room}_{set_num}_{col}"
        for set_num in ['S1', 'S2']
        for room in ['R1', 'R2']
        for col in columns
    ]

    # All Date columns and all Time columns must agree within each row.
    date_columns = df.filter(like='Date')
    time_columns = df.filter(like='Time')
    if not (date_columns.nunique(axis=1).eq(1).all() and time_columns.nunique(axis=1).eq(1).all()):
        raise ValueError("Date or Time columns do not match in some rows")

    # Combine the first Date/Time pair into one datetime column and drop the
    # now redundant per-set Date/Time columns.
    datetime_values = pd.to_datetime(
        date_columns.iloc[:, 0] + ' ' + time_columns.iloc[:, 0],
        format='%m/%d/%y %H:%M:%S',
    )
    measurements = df.drop(columns=[*date_columns.columns, *time_columns.columns])
    df = datetime_values.to_frame(name='datetime').join(measurements)

    # Split dataset by room.
    # NOTE(review): the 'datetime' column contains neither "R1" nor "R2", so it
    # is not part of the returned frames -- confirm whether it should be kept.
    df_room1 = df.filter(like='R1')
    df_room2 = df.filter(like='R2')

    if save_csv:
        room1_filename = f'{path_to_save}/{code_1}_WRIC_data.csv' if path_to_save else f'{code_1}_WRIC_data.csv'
        room2_filename = f'{path_to_save}/{code_2}_WRIC_data.csv' if path_to_save else f'{code_2}_WRIC_data.csv'
        df_room1.to_csv(room1_filename, index=False)
        df_room2.to_csv(room2_filename, index=False)

    return df_room1, df_room2


In [None]:
def preprocess_WRIC_file(filepath, code = "id", manual = None, save_csv = True, path_to_save = None):
    """Run the full preprocessing of one WRIC text file.

    The file is read with ``open_file``, the metadata of both rooms is
    extracted with ``extract_meta_data`` and the measurement table is loaded
    and split by room with ``create_wric_df``.

    Parameters
    ----------
    filepath : str
        Path to the WRIC ``.txt`` file.
    code : str
        How the output filename codes are derived ("id", "id+comment" or
        "manual").
    manual : sequence or None
        Manual codes for room 1 and room 2.
    save_csv : bool
        Forwarded to the metadata and data steps, which write CSV files when
        it is truthy.
    path_to_save : str or None
        Directory prefix for the CSV files.

    Returns
    -------
    tuple
        ``(R1_metadata, R2_metadata, df_room1, df_room2)``.
    """
    raw_lines = open_file(filepath)

    metadata_result = extract_meta_data(raw_lines, code, manual, save_csv, path_to_save)
    code_1, code_2, R1_metadata, R2_metadata = metadata_result

    room_frames = create_wric_df(filepath, raw_lines, save_csv, code_1, code_2, path_to_save)
    df_room1, df_room2 = room_frames

    # TODO: Change the return value to return the two separate files
    return R1_metadata, R2_metadata, df_room1, df_room2
    

In [None]:
# TODO: Check and raise error if big discrepancies between measurement set 1 and 2 exist
# TODO: Find reasonable threshold, where discrepancies are too big? - stay simple, no extravagant reports

def check_discrepancies(df, threshold):
    """Print a comparison of measurement set 1 and set 2 for every parameter.

    Columns whose name contains an environment parameter ("Pressure Ambient",
    "Temperature", "Relative Humidity") are ignored. Every remaining
    ``*_S1_*`` column is paired with the ``*_S2_*`` column of the same name,
    and the variance of the element-wise difference (S1 - S2) is compared to
    ``threshold``. The printed messages call this value "mean variance".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame of one room containing both the S1 and S2 columns.
    threshold : float
        Variance above which a column pair is reported as "above threshold".

    Returns
    -------
    None
        The report is printed, one line per column pair.
    """
    # Filter out environment parameters (columns containing these keywords)
    env_params = ['Pressure Ambient', 'Temperature', 'Relative Humidity']
    # TODO: Maybe do it only for energy expenditure (there Simon uses a delta of less than 1% I think - rest I do not know)
    candidate_columns = [
        col for col in df.columns
        if not any(param in col for param in env_params)
    ]
    s1_columns = [col for col in candidate_columns if '_S1_' in col]

    discrepancies = []

    for s1_col in s1_columns:
        # Pair by name instead of by position, so that a different column
        # order or a missing S2 column cannot silently mispair parameters.
        s2_col = s1_col.replace('_S1_', '_S2_', 1)
        if s2_col not in candidate_columns:
            discrepancies.append(f"{s1_col} has no matching column {s2_col}; skipped")
            continue

        variance = np.var(df[s1_col].to_numpy() - df[s2_col].to_numpy())
        if variance > threshold:
            discrepancies.append(f"{s1_col} and {s2_col} have mean variance {variance:.4f} above threshold {threshold}")
        else:
            discrepancies.append(f"{s1_col} and {s2_col} have mean variance {variance:.4f} below threshold {threshold}")

    # Output the report
    if discrepancies:
        for discrepancy in discrepancies:
            print(discrepancy)
    else:
        print("No discrepancies found.")



In [None]:
# Example usage
# Assuming df_room1 is the dataframe with both S1 and S2 columns
# threshold = 0.5  # Set the desired threshold for the variance of the S1/S2 differences
# check_discrepancies(df_room1, threshold)



In [None]:
# TODO: Create a unified dataframe by combining both measurements with various methods
# TODO: Find best practice as default option - Simon: mean

### Example Usage

In [None]:
# The Windows path is written as a raw string so that its backslashes are not
# interpreted as (invalid) escape sequences.
# With path_to_save=None the CSV files are written to the current working directory.
R1_metadata, R2_metadata, df_room1, df_room2 = preprocess_WRIC_file(
    r"C:\Documents\WRIC_example_data\Results_1m_copy_anonymised.txt",
    code="id+comment",
    path_to_save=None,  # e.g. path_to_save=r"C:\Documents\WRIC_example_data"
)
display(df_room1)
check_discrepancies(df_room1, threshold=0.05)

# TODO: Check thresholds, discrepancies etc. for the various parameters (why is it so high for kcal, but not for kJ?) -> double-check the calculation of the mean variance