# Import Python libraries

In [1]:
import pandas as pd
import sys
import sklearn.neighbors._base
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from missingpy import MissForest
from datetime import datetime, timedelta
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Load and process survey data

In [2]:
global_id_counter = 100  # starting with 100 to avoid overlapping with batch 1 users

def process_survey_data(survey_data):
    """
    Process the survey data by:
    1) Formatting the timestamp information to merge with other datasets.
    2) Renaming the column names align with other batches
    3) Assigning user ID information to each survey entry
    4) Retain relevant features
    
    Parameters:
        survey_data: The unprocessed survey data
        
    Returns:
        survey_data: The processed survey data.
    """
    global global_id_counter
    
    experiment_date_mapping = {
        datetime(2023, 9, 18).strftime("%Y-%m-%d"):1,
        datetime(2023, 9, 19).strftime("%Y-%m-%d"):1,
        datetime(2023, 9, 20).strftime("%Y-%m-%d"):2,
        datetime(2023, 9, 21).strftime("%Y-%m-%d"):2,
        datetime(2023, 9, 22).strftime("%Y-%m-%d"):3,
        datetime(2023, 9, 25).strftime("%Y-%m-%d"):3,
        datetime(2023, 9, 26).strftime("%Y-%m-%d"):4,
        datetime(2023, 9, 27).strftime("%Y-%m-%d"):4,
        datetime(2023, 9, 28).strftime("%Y-%m-%d"):5,
        datetime(2023, 9, 29).strftime("%Y-%m-%d"):5,
    }
    
    # format timestamp information to the nearest minute
    survey_data['Timestamp'] = pd.to_datetime(
        survey_data['Completion time'], 
        format="%Y-%m-%d %H:%M:%S"
    ).dt.round('T')

    # rename column names
    name_mapping = {
        'Right now, would you prefer overall environment to be?':'Thermal Preference',
        'Do you think current thermal environment (i.e., temperature, humidity, & airflow) acceptable?':'Thermal Acceptability',
        'How would you prefer air movement now?':'Air Movement Preference'
    }
    survey_data.rename(columns=name_mapping, inplace=True)
    
    # assign user id based on 2-day experiments
    survey_data['Date'] = survey_data['Timestamp'].apply(lambda x: x.date())
    survey_data['Experiment Day'] = survey_data['Date'].apply(lambda x: experiment_date_mapping[x.strftime("%Y-%m-%d")])
#     print(survey_data['Date'].unique())
#     print(survey_data['Experiment Day'].unique())
    unique_days = list(survey_data['Experiment Day'].unique())
    day_id_mapping = {}
    for day in unique_days:
        day_id_mapping[day] = global_id_counter
        global_id_counter += 1
    survey_data['User Id'] = survey_data['Experiment Day'].apply(lambda x: day_id_mapping[x])
    survey_data.drop(columns=['Date', 'Experiment Day'], inplace=True)
    
    # standardised labels for thermal acceptability and air movement preferece
    survey_data['Thermal Acceptability'] = survey_data["Thermal Acceptability"].replace("Clearly Acceptable", "Acceptable")
    survey_data['Thermal Acceptability'] = survey_data["Thermal Acceptability"].replace("Just Acceptable", "Acceptable")
    survey_data['Thermal Acceptability'] = survey_data["Thermal Acceptability"].replace("Clearly Unacceptable", "Unacceptable")
    survey_data['Thermal Acceptability'] = survey_data["Thermal Acceptability"].replace("Just Unacceptable", "Unacceptable")
    survey_data['Air Movement Preference'] = survey_data["Air Movement Preference"].replace("More Air Movement", "More")
    survey_data['Air Movement Preference'] = survey_data["Air Movement Preference"].replace("Less Air Movement", "Less")

    # retain relevant features
    survey_data = survey_data[[
        'Timestamp', 
        'Thermal Preference', 
        'Thermal Acceptability',
        'Air Movement Preference',
        'User Id'
    ]]
    
    # Drop rows with missing labels
    survey_data = survey_data.dropna().reset_index(drop=True)
    
    # sort survey data based on timestamp information in ascending order
    survey_data = survey_data.sort_values(by='Timestamp').reset_index(drop=True)
    
    return survey_data

In [3]:
survey_files = [
    'Survey#1.xlsx',
    'Survey#2.xlsx',
    'Survey#3.xlsx',
    'Survey#4.xlsx',
    'Survey#5.xlsx',
    'Survey#6.xlsx'
]

all_survey_data = pd.DataFrame()

for file in survey_files:
    survey_data = pd.read_excel(
        f'data/yue_data/Survey/{file}', 
        sheet_name='Form1'
    )
    processed_survey_data = process_survey_data(survey_data)
    all_survey_data = pd.concat(
        [all_survey_data, processed_survey_data], 
        ignore_index=True
    )

all_survey_data.head()

Unnamed: 0,Timestamp,Thermal Preference,Thermal Acceptability,Air Movement Preference,User Id
0,2023-09-18 09:30:00,No Change,Acceptable,No Change,100
1,2023-09-18 09:32:00,Cooler,Acceptable,More,100
2,2023-09-18 09:33:00,Cooler,Acceptable,More,100
3,2023-09-18 09:33:00,Cooler,Acceptable,More,100
4,2023-09-18 09:34:00,No Change,Acceptable,No Change,100


In [4]:
all_survey_data['User Id'].unique()

array([100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
       113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
       126])

# Load and process outdoor weather data

In [5]:
def process_weather_data(weather_data):
    """
    Process the weather data by:
    1) Formatting the timestamp information to merge with other datasets.
    2) Renaming the column names align with other batches
    
    Parameters:
        weather_data: The unprocessed weather data
        
    Returns:
        weather_data: The processed weather data.
    """
    # format timestamp information
    weather_data['Timestamp'] = pd.to_datetime(
        weather_data['Date'], 
        format="%m/%d/%y %H:%M:%S"
    ).dt.round('T')

    # rename column names
    name_mapping = {
        'Temperature (S-THC 21535782:21555952-1), *C, ideaslab(Aug23)':'Outdoor Temp',
        'RH (S-THC 21535782:21555952-2), %, ideaslab(Aug23)':'Outdoor Humidity',
    }
    weather_data.rename(
        columns=name_mapping, 
        inplace=True
    )
    
    # retain relevant features and rows based on study period
    weather_data = weather_data[[
        'Timestamp', 
        'Outdoor Temp', 
        'Outdoor Humidity',
    ]]
    
    survey_start_date = datetime(2023, 9, 18)
    survey_end_date = datetime(2023, 9, 29)
    weather_data = weather_data[
        (weather_data['Timestamp'] >= survey_start_date) & 
        (weather_data['Timestamp'] <= survey_end_date)
    ].reset_index(drop=True)
    
    # sort weather data based on timestamp information in ascending order
    weather_data = weather_data.sort_values(by='Timestamp').reset_index(drop=True)
    
    return weather_data

In [6]:
weather_data = pd.read_csv(f'data/yue_data/Environment measurement/Weather data in UTC timezone.csv')
weather_data = process_weather_data(weather_data)
weather_data.head()

Unnamed: 0,Timestamp,Outdoor Temp,Outdoor Humidity
0,2023-09-18 00:00:00,28.647,79.135
1,2023-09-18 00:05:00,28.797,78.471
2,2023-09-18 00:10:00,28.99,77.998
3,2023-09-18 00:15:00,29.118,77.398
4,2023-09-18 00:20:00,28.454,79.117


# Load and process globe temperature data

In [7]:
def process_globetemp_data(globetemp_door, globetemp_window):
    """
    Process the globe temperature data by:
    1) Formatting the timestamp information to merge with other datasets.
    2) Renaming the column names
    3) Merging the globe temperature data collected at the door and window based on 
    timestamp information rounded to the nearest minute
    4) Find the average globe temperature reading between the door and window sensors.
    
    Parameters:
        globetemp_door: The unprocessed globe temperature data collected at the door.
        globetemp_window: The unprocessed globe temperature data collected at the window.
        
    Returns:
        globetemp_data: The processed globe temperature data.
    """
    # format timestamp information and round it to nearest minute
    globetemp_door['Timestamp'] = pd.to_datetime(
        globetemp_door['Time'], 
        format="%Y/%m/%d %H:%M:%S.%f"
    ).dt.round('T')
    
    globetemp_window['Timestamp'] = pd.to_datetime(
        globetemp_door['Time'], 
        format="%Y/%m/%d %H:%M:%S.%f"
    ).dt.round('T')
    
    # rename column names
    globetemp_door.rename(
        columns={'CH1-1[C]':'Globe Temperature - Door'}, 
        inplace=True
    )
    
    globetemp_window.rename(
        columns={'CH1-1[C]':'Globe Temperature - Window'}, 
        inplace=True
    )
    
    # merge globe temperature data at door and windows
    globetemp_data = globetemp_door[['Timestamp', 'Globe Temperature - Door']].merge(
        globetemp_window[['Timestamp', 'Globe Temperature - Window']], 
        how='outer'
    )

    # find average globe temperature
    globetemp_data['Globe Temperature'] = globetemp_data['Globe Temperature - Door'].fillna(globetemp_data['Globe Temperature - Window']).mean()
    
    # retain relevant features and rows based on study period
    globetemp_data = globetemp_data[[
        'Timestamp', 
        'Globe Temperature', 
    ]]
    
    survey_start_date = datetime(2023, 9, 18)
    survey_end_date = datetime(2023, 9, 29)
    globetemp_data = globetemp_data[
        (globetemp_data['Timestamp'] >= survey_start_date) & 
        (globetemp_data['Timestamp'] <= survey_end_date)
    ].reset_index(drop=True)
    
    # sort weather data based on timestamp information in ascending order
    globetemp_data = globetemp_data.sort_values(by='Timestamp').reset_index(drop=True)
    
    return globetemp_data
    

In [8]:
globetemp_door1 = pd.read_csv(
    f'data/yue_data/Environment measurement/Globe temp/LR8515_180345475_180345475_2023-09-13_115253000_Door.csv', 
    skiprows=10
)
globetemp_door2 = pd.read_csv(
    f'data/yue_data/Environment measurement/Globe temp/LR8515_180345475_180345475_2023-09-28_191022000_Door.csv', 
    skiprows=10
)
globetemp_door = pd.concat(
    [globetemp_door1, globetemp_door2], 
    ignore_index=True
)
globetemp_window = pd.read_csv(
    f'data/yue_data/Environment measurement/Globe temp/LR8515_180349862_180349862_2023-09-13_120259000_Window.csv', 
    skiprows=10
)

globetemp_data = process_globetemp_data(globetemp_door, globetemp_window)
globetemp_data.head()

Unnamed: 0,Timestamp,Globe Temperature
0,2023-09-18 00:00:00,28.119314
1,2023-09-18 00:01:00,28.119314
2,2023-09-18 00:02:00,28.119314
3,2023-09-18 00:03:00,28.119314
4,2023-09-18 00:04:00,28.119314


# Load and process air temperature, RH and air speed data

In [9]:
def correct_timestamp(time_interval, start_time, delay):
    """
    Corrects the timestamp information by adding the time_interval to start_time and 
    accounting for delays.
    
    Parameters:
        time_interval: Time that have passed since start_time
        start_time: Time where data collection begins.
        delay: The amount of time delay in seconds that needs to be accounted for in the timestamp.
        
    Returns:
        final_timestamp
    """
    hour, minutes, sec = time_interval.split('.')[1].split(':')
    final_timestamp = start_time + timedelta(hours=int(hour), 
                                             minutes=int(minutes), 
                                             seconds=int(sec)) + timedelta(seconds=int(delay))
    return final_timestamp
    
def process_indoor_env_data(filename, start_time, delay):
    """
    Process the indoor environmental data by:
    1) Formatting the timestamp information and accounting for time delays.
    2) Find the average air temperature, humidity and air velocity data across 6 tables

    Parameters:
       filename: File name of the sensor data
       start_time: A datetime object containing the start date and time
       delay: The amount of time delay in seconds that needs to be accounted for in the timestamp.

    Returns:
        env_data: Dataframe containing the processed indoor environmental data with timestamp information.
    """
    # Load the raw indoor environmental data file
    env_data = pd.read_csv(
        f'data/yue_data/Environment measurement/Air temp, RH, air speed/{filename}', 
        skiprows=9
    )
    
    # format timestamp information and account for time delays. rounded to nearest minute
    env_data['Timestamp'] = env_data['Data header Date&Time '].apply(lambda x: correct_timestamp(x, start_time, delay))
    env_data['Timestamp'] = env_data['Timestamp'].dt.round('T')
    
    # find average air temp, humidity and air velocity across 6 tables
    env_data['Indoor Temp'] = env_data[['HUB01-01CH-T',
                                        'HUB01-02CH-T',
                                        'HUB01-03CH-T',
                                        'HUB01-04CH-T',
                                        'HUB01-05CH-T',
                                        'HUB01-06CH-T']].mean(skipna=True, axis=1)
    env_data['Indoor Humidity'] = env_data[['HUB01-01CH-H',
                                            'HUB01-02CH-H',
                                            'HUB01-03CH-H',
                                            'HUB01-04CH-H',
                                            'HUB01-05CH-H',
                                            'HUB01-06CH-H']].mean(skipna=True, axis=1)
    env_data['Air Velocity'] = env_data[['HUB01-01CH-V',
                                         'HUB01-02CH-V',
                                         'HUB01-03CH-V',
                                         'HUB01-04CH-V',
                                         'HUB01-05CH-V',
                                         'HUB01-06CH-V']].mean(skipna=True, axis=1)
    
    # retain relevant features and rows based on study period
    env_data = env_data[[
        'Timestamp', 
        'Indoor Temp',
        'Indoor Humidity',
        'Air Velocity',
    ]]
    
    # sort environmental data based on timestamp information in ascending order
    env_data = env_data.sort_values(by='Timestamp').reset_index(drop=True)
    
    return env_data
    
    

In [10]:
indoor_env_files = {
    '20230918_30 sec slower.CSV':{'Start Time': datetime(2023, 9, 18, 9, 0, 43, 0), 'Delay (s)': 30},
    '20230919_2 sec slower.CSV':{'Start Time': datetime(2023, 9, 19, 8, 50, 48, 0), 'Delay (s)': 2},
    '20230920_7 sec slower.CSV':{'Start Time': datetime(2023, 9, 20, 8, 42, 41, 0), 'Delay (s)': 7},
    '20230921_5 sec faster.CSV':{'Start Time': datetime(2023, 9, 21, 8, 54, 11, 0), 'Delay (s)': -5},
    '20230922.CSV':{'Start Time': datetime(2023, 9, 22, 8, 45, 47, 0), 'Delay (s)': 0},
    '20230925.CSV':{'Start Time': datetime(2023, 9, 25, 8, 35, 4, 0), 'Delay (s)': 0},
    '20230926_5 sec slower.CSV':{'Start Time': datetime(2023, 9, 26, 8, 50, 49, 0), 'Delay (s)': 5},
    '20230927.CSV':{'Start Time': datetime(2023, 9, 27, 8, 46, 57, 0), 'Delay (s)': 0},
    '20230928.CSV':{'Start Time': datetime(2023, 9, 28, 8, 43, 57, 0), 'Delay (s)': 0},
    '20230929.CSV':{'Start Time': datetime(2023, 9, 29, 8, 46, 23, 0), 'Delay (s)': 0},
}

indoor_env_data = pd.DataFrame()

for file in indoor_env_files.keys():
    processed_env_data = process_indoor_env_data(
        filename=file, 
        start_time=indoor_env_files[file]['Start Time'], 
        delay=indoor_env_files[file]['Delay (s)']
    )
    indoor_env_data = pd.concat([indoor_env_data, processed_env_data], ignore_index=True)
    
indoor_env_data.head()

Unnamed: 0,Timestamp,Indoor Temp,Indoor Humidity,Air Velocity
0,2023-09-18 09:01:00,26.75,69.266667,0.505
1,2023-09-18 09:01:00,26.683333,69.183333,0.576667
2,2023-09-18 09:02:00,26.75,69.1,0.473333
3,2023-09-18 09:02:00,26.75,68.983333,0.585
4,2023-09-18 09:02:00,26.733333,68.916667,0.628333


# Load and process mode data

In [11]:
def generate_minute_interval(data):
    """
    Generate a DataFrame with one-minute intervals with the corresponding "Mode" values based on
    the start time and end time defined in "data".

    Args:
        data: The input DataFrame containing "Start Time," "End Time," and "Mode" columns.

    Returns:
        minute_df: A new DataFrame with one-minute intervals and corresponding "Mode" values.

    
    """
    # Ensure the "Start Time" and "End Time" columns are in datetime format
    data['Start Time'] = pd.to_datetime(
        data['Start Time'], 
        format='%d/%m/%y %H:%M'
    )
    data['End Time'] = pd.to_datetime(
        data['End Time'],
        format='%d/%m/%y %H:%M'
    )

    # Create a list of minute-level timestamps between "Start Time" and "End Time"
    minute_intervals = []
    for index, row in data.iterrows():
        start_time = row['Start Time']
        end_time = row['End Time']
        mode = row['Mode']
        
        current_time = start_time
        while current_time <= end_time:
            minute_intervals.append((current_time, mode))
            current_time += pd.Timedelta(minutes=1)

    # Create a new DataFrame with minute intervals and corresponding "Mode" values
    minute_df = pd.DataFrame(minute_intervals, columns=['Timestamp', 'Mode'])
    
    return minute_df

In [12]:
mode_data = pd.read_csv('data/yue_data/experimental_mode.csv')
mode_data = generate_minute_interval(mode_data)
mode_data.head()

Unnamed: 0,Timestamp,Mode
0,2023-09-18 09:00:00,AC
1,2023-09-18 09:01:00,AC
2,2023-09-18 09:02:00,AC
3,2023-09-18 09:03:00,AC
4,2023-09-18 09:04:00,AC


# Merge survey, environmental and outdoor weather data

In [13]:
all_survey_data['Timestamp_5min'] = all_survey_data['Timestamp'].dt.round('5min')
merged_data = pd.merge(
    all_survey_data,
    weather_data, 
    left_on='Timestamp_5min', 
    right_on='Timestamp', 
    how='left', suffixes=('', '_delete')
)
merged_data.drop(columns=['Timestamp_5min', 'Timestamp_delete'], inplace=True)
merged_data = pd.merge(
    merged_data,
    globetemp_data, 
    on='Timestamp', 
    how='left', 
)
merged_data = pd.merge(
    merged_data,
    indoor_env_data, 
    on='Timestamp', 
    how='left', 
)
merged_data = pd.merge(
    merged_data,
    mode_data, 
    on='Timestamp', 
    how='left', 
)

# Reorder columns
merged_data = merged_data[[
    'Timestamp',
    'Mode',
    'Indoor Temp',
    'Indoor Humidity',
    'Air Velocity',
    'Globe Temperature',
    'Outdoor Temp',
    'Outdoor Humidity',
    'Thermal Preference',
    'Thermal Acceptability',
    'Air Movement Preference',
    'User Id'
]]

merged_data.head()

Unnamed: 0,Timestamp,Mode,Indoor Temp,Indoor Humidity,Air Velocity,Globe Temperature,Outdoor Temp,Outdoor Humidity,Thermal Preference,Thermal Acceptability,Air Movement Preference,User Id
0,2023-09-18 09:30:00,NV,26.316667,68.683333,0.718333,28.119314,29.805,70.019,No Change,Acceptable,No Change,100
1,2023-09-18 09:30:00,NV,26.333333,68.716667,0.721667,28.119314,29.805,70.019,No Change,Acceptable,No Change,100
2,2023-09-18 09:30:00,NV,26.35,68.733333,0.615,28.119314,29.805,70.019,No Change,Acceptable,No Change,100
3,2023-09-18 09:30:00,NV,26.333333,68.766667,0.658333,28.119314,29.805,70.019,No Change,Acceptable,No Change,100
4,2023-09-18 09:30:00,NV,26.3,68.8,0.721667,28.119314,29.805,70.019,No Change,Acceptable,No Change,100


In [14]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58839 entries, 0 to 58838
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   Timestamp                58839 non-null  datetime64[ns]
 1   Mode                     58785 non-null  object        
 2   Indoor Temp              58824 non-null  float64       
 3   Indoor Humidity          58824 non-null  float64       
 4   Air Velocity             58824 non-null  float64       
 5   Globe Temperature        53349 non-null  float64       
 6   Outdoor Temp             53349 non-null  float64       
 7   Outdoor Humidity         53349 non-null  float64       
 8   Thermal Preference       58839 non-null  object        
 9   Thermal Acceptability    58839 non-null  object        
 10  Air Movement Preference  58839 non-null  object        
 11  User Id                  58839 non-null  int64         
dtypes: datetime64[ns](1), float64(6)

# Perform imputation for missing data using MissForest

In [15]:
def impute_missing_data(data):    
    original_col_sequence = list(data.columns)
    imputer = MissForest(criterion='squared_error')
    numerical_data = data.drop(columns=['Timestamp',
                                        'Mode',
                                        'Thermal Preference',
                                        'Thermal Acceptability',
                                        'Air Movement Preference', 
                                        'User Id'])
    numerical_data_cols = list(numerical_data.columns)
    imputed_numerical_data = imputer.fit_transform(numerical_data)
    imputed_numerical_data = pd.DataFrame(imputed_numerical_data, 
                                          columns=numerical_data_cols)
    imputed_data = pd.concat([data[['Timestamp',
                                    'Mode',
                                    'Thermal Preference',
                                    'Thermal Acceptability',
                                    'Air Movement Preference', 
                                    'User Id']], 
                              imputed_numerical_data], 
                             axis=1)
    imputed_data = imputed_data[original_col_sequence]

    assert imputed_data.shape == data.shape
    assert imputed_data["Timestamp"].tolist() == data["Timestamp"].tolist()

    return imputed_data

merged_data = impute_missing_data(merged_data)

Iteration: 0
Iteration: 1
Iteration: 2
Iteration: 3
Iteration: 4


In [16]:
# drop rows with missing mode labels and duplicated rows, and sort entries based on timestamp
merged_data = merged_data.dropna(subset=['Mode']).reset_index(drop=True)
merged_data = merged_data.drop_duplicates().reset_index(drop=True)
merged_data = merged_data.sort_values(by='Timestamp').reset_index(drop=True)
merged_data.drop(columns=['Timestamp'], inplace=True)

In [17]:
print(merged_data['Thermal Preference'].unique())
print(merged_data['Air Movement Preference'].unique())
print(merged_data['Thermal Acceptability'].unique())
print(merged_data['Mode'].unique())
print(merged_data.shape)
merged_data.head()

['No Change' 'Cooler' 'Warmer']
['No Change' 'More' 'Less']
['Acceptable' 'Unacceptable']
['AC' 'NV']
(58419, 11)


Unnamed: 0,Mode,Indoor Temp,Indoor Humidity,Air Velocity,Globe Temperature,Outdoor Temp,Outdoor Humidity,Thermal Preference,Thermal Acceptability,Air Movement Preference,User Id
0,AC,25.866667,64.083333,1.066667,28.119314,29.805,70.051,No Change,Acceptable,No Change,110
1,AC,25.85,64.133333,0.856667,28.119314,29.805,70.051,No Change,Acceptable,No Change,110
2,AC,25.883333,64.283333,0.833333,28.119314,29.805,70.051,No Change,Acceptable,No Change,110
3,AC,25.85,64.25,0.763333,28.119314,29.805,70.051,No Change,Acceptable,No Change,110
4,AC,25.933333,64.2,1.106667,28.119314,29.805,70.051,No Change,Acceptable,No Change,110


# Save processed data

In [18]:
# thermal preference data
thermal_pref_data = merged_data.drop(columns=['Thermal Acceptability','Air Movement Preference'])
thermal_pref_data = thermal_pref_data.dropna(subset=['Thermal Preference']).reset_index(drop=True)
assert thermal_pref_data.isnull().values.any() == False
thermal_pref_data.to_csv('data/bca_thermalpref_unsampled_batch2_data.csv', index=False)

In [19]:
# thermal acceptability data
thermal_acc_data = merged_data.drop(columns=['Thermal Preference','Air Movement Preference'])
thermal_acc_data = thermal_acc_data.dropna(subset=['Thermal Acceptability']).reset_index(drop=True)
assert thermal_acc_data.isnull().values.any() == False
thermal_acc_data.to_csv('data/bca_thermalacc_unsampled_batch2_data.csv', index=False)

In [20]:
# air movement preference data
air_pref_data = merged_data.drop(columns=['Thermal Acceptability','Thermal Preference'])
air_pref_data = air_pref_data.dropna(subset=['Air Movement Preference']).reset_index(drop=True)
assert air_pref_data.isnull().values.any() == False
air_pref_data.to_csv('data/bca_airpref_unsampled_batch2_data.csv', index=False)