# Import libraries

In [1]:
import os
import pandas as pd
import random
random.seed(1)
import argparse
from copy import deepcopy
from alipy import ToolBox
from alipy.index import IndexCollection
from tqdm import tqdm
from xgboost import XGBClassifier

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# define model parameters for AL
parser = argparse.ArgumentParser(description='Active learning for thermal preference')
parser.add_argument('--num_bagging', type=int, default=5,
                    help='Number of models trained per query when building the committee.')
# nargs='+' collects one feature name per token; the previous `type=list`
# would have split a single command-line string into individual characters.
parser.add_argument('--input_features', type=str, nargs='+',
                    default=['Mode',
                             'Indoor Temp',
                             'Indoor Humidity',
                             'Air Velocity',
                             'Globe Temperature',
                             'Outdoor Temp',
                             'Outdoor Humidity'],
                    help='Names of the columns used as model inputs.')
# An explicit empty argument list keeps argparse from reading the kernel's own argv.
args = parser.parse_args(args=[])

# Load BCA and ASHRAE datasets

In [2]:
# BCA (target domain) comfort votes, one file per prediction target.
bca_thermalpref, bca_thermalacc, bca_airpref = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/bca_thermalpref_unsampled_data.csv',
        'data/bca_thermalacc_unsampled_data.csv',
        'data/bca_airpref_unsampled_data.csv',
    )
)

# ASHRAE (source domain) comfort votes, one file per prediction target.
ashrae_thermalpref, ashrae_thermalacc, ashrae_airpref = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/ashrae_thermalpref_sampled_data.csv',
        'data/ashrae_thermalacc_sampled_data.csv',
        'data/ashrae_airpref_sampled_data.csv',
    )
)

In [3]:
# Preview the first rows of the BCA thermal preference data.
bca_thermalpref.head()

Unnamed: 0,Mode,Indoor Temp,Indoor Humidity,Air Velocity,Globe Temperature,Outdoor Temp,Outdoor Humidity,Thermal Preference,User Id,Condition
0,AC,26.224357,78.84082,0.53,27.35,83.0,73.0,No Change,1.0,2.0
1,AC,26.224357,78.84082,0.53,27.35,83.0,73.0,Cooler,2.0,2.0
2,AC,26.224357,78.84082,0.53,27.35,83.0,73.0,No Change,4.0,2.0
3,AC,26.224357,78.84082,0.53,27.35,83.0,73.0,Cooler,5.0,2.0
4,AC,26.224357,78.84082,0.53,27.35,83.0,73.0,Cooler,6.0,2.0


In [4]:
# Preview the first rows of the BCA thermal acceptability data.
bca_thermalacc.head()

Unnamed: 0,Mode,Indoor Temp,Indoor Humidity,Air Velocity,Globe Temperature,Outdoor Temp,Outdoor Humidity,Thermal Acceptability,User Id,Condition
0,AC,26.224357,78.84082,0.53,27.35,83.0,73.0,Acceptable,1.0,2.0
1,AC,26.224357,78.84082,0.53,27.35,83.0,73.0,Acceptable,2.0,2.0
2,AC,26.224357,78.84082,0.53,27.35,83.0,73.0,Acceptable,4.0,2.0
3,AC,26.224357,78.84082,0.53,27.35,83.0,73.0,Unacceptable,5.0,2.0
4,AC,26.224357,78.84082,0.53,27.35,83.0,73.0,Acceptable,6.0,2.0


In [5]:
# Preview the first rows of the BCA air movement preference data.
bca_airpref.head()

Unnamed: 0,Mode,Indoor Temp,Indoor Humidity,Air Velocity,Globe Temperature,Outdoor Temp,Outdoor Humidity,Air Movement Preference,User Id,Condition
0,AC,26.224357,78.84082,0.53,27.35,83.0,73.0,No Change,1.0,2.0
1,AC,26.224357,78.84082,0.53,27.35,83.0,73.0,No Change,2.0,2.0
2,AC,26.224357,78.84082,0.53,27.35,83.0,73.0,No Change,4.0,2.0
3,AC,26.224357,78.84082,0.53,27.35,83.0,73.0,More,5.0,2.0
4,AC,26.224357,78.84082,0.53,27.35,83.0,73.0,More,6.0,2.0


In [6]:
# List the distinct "Condition" values; this column is later used to segment each user's data.
bca_thermalpref['Condition'].unique()

array([ 2.,  1.,  4., 16., 17., 19., 18.,  3.])

In [7]:
# List the distinct "Mode" values; this column is later used to segment each user's data.
bca_thermalpref['Mode'].unique()

array(['AC', 'NV'], dtype=object)

# Perform data sampling for training and validation purposes

## Randomly sample 10 labelled instances per participant from BCA dataset for validation purposes

In [8]:
def train_test_split(data, target_col, test_size=10, random_state=None):
    """
    Splits the thermal comfort data into training and test sets by randomly
    holding out `test_size` labelled instances of each unique user for
    validation, while the user's remaining labelled instances are used for training.
    Users who have fewer than `test_size` labelled instances are dropped.

    If both `data/bca_{target_col}_train_data.csv` and
    `data/bca_{target_col}_test_data.csv` already exist, they are loaded and
    returned as-is and no new split is drawn.

    Parameters:
        data: The dataframe containing thermal comfort data collected from BCA.
            Must contain a 'User Id' column.
        target_col: The substring indicating the target column (used in the file names).
        test_size: The number of labelled instances held out per user (default 10).
        random_state: Optional seed or numpy RandomState for reproducible shuffling.
            When None, pandas falls back to numpy's global random state
            (note that `random.seed` does not affect it).

    Return:
        train_data: The training dataset
        test_data: The test dataset

    Raises:
        ValueError: If `test_size` is smaller than 1.
    """
    import numpy as np  # local import: numpy is not imported at the top of the notebook

    if test_size < 1:
        # iloc[-0:] would select the whole frame, so a non-positive size is never meaningful
        raise ValueError(f'test_size must be at least 1, got {test_size}.')

    train_path = f'data/bca_{target_col}_train_data.csv'
    test_path = f'data/bca_{target_col}_test_data.csv'
    if os.path.exists(train_path) and os.path.exists(test_path):
        return pd.read_csv(train_path), pd.read_csv(test_path)

    # A single generator is shared by all shuffles so that every user gets a different permutation.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    train_parts = []
    test_parts = []
    for user in data['User Id'].unique():
        user_df = data[data['User Id'] == user]
        user_df = user_df.sample(frac=1, random_state=rng).reset_index(drop=True)

        if len(user_df) < test_size:
            continue

        test_parts.append(user_df.iloc[-test_size:])
        train_parts.append(user_df.iloc[:-test_size])

    def _combine(parts):
        # Concatenate once (instead of growing a frame inside the loop) and shuffle the rows.
        parts = [part for part in parts if not part.empty]
        if not parts:
            return data.iloc[0:0].reset_index(drop=True)
        combined = pd.concat(parts, ignore_index=True)
        return combined.sample(frac=1, random_state=rng).reset_index(drop=True)

    train_data = _combine(train_parts)
    test_data = _combine(test_parts)

    return train_data, test_data

In [9]:
# Split each BCA dataset (thermal preference, thermal acceptability and
# air movement preference) into its training and test partitions.
thermalpref_train, thermalpref_test = train_test_split(bca_thermalpref, target_col='thermalpref')
thermalacc_train, thermalacc_test = train_test_split(bca_thermalacc, target_col='thermalacc')
airpref_train, airpref_test = train_test_split(bca_airpref, target_col='airpref')

# Write every partition to disk so that later runs can load the same split.
for partition, csv_path in (
    (thermalpref_train, 'data/bca_thermalpref_train_data.csv'),
    (thermalpref_test, 'data/bca_thermalpref_test_data.csv'),
    (thermalacc_train, 'data/bca_thermalacc_train_data.csv'),
    (thermalacc_test, 'data/bca_thermalacc_test_data.csv'),
    (airpref_train, 'data/bca_airpref_train_data.csv'),
    (airpref_test, 'data/bca_airpref_test_data.csv'),
):
    partition.to_csv(csv_path, index=False)

## Randomly sample 1 labelled instance per Condition for each participant from BCA dataset for training purposes

In [10]:
def sample_n_per_user(data, target_col, segment_condition, sample_size=1):
    """
    Randomly samples n labelled instances from each segment condition for each user for target domain
    retraining and saves the sampled data as a CSV file.

    The output is written to
    `data/bca_{target}_train_{segment_condition.lower()}_random_data.csv`; if that file
    already exists nothing is sampled or written. Segments holding fewer than
    `sample_size` instances are skipped.

    Parameters:
        data: The dataframe containing comfort data collected from BCA (target domain).
            Must contain a 'User Id' column and the `segment_condition` column.
        target_col: The name of the target column; one of 'Thermal Preference',
            'Thermal Acceptability' or 'Air Movement Preference'.
        segment_condition: The field indicating how the user data will be segmented for sampling.
        sample_size: An integer indicating the number of labelled instances to sample
        from each subsegment per user.

    Raises:
        KeyError: If `target_col` is not one of the supported target columns.
        ValueError: If `sample_size` is smaller than 1.
    """
    name_mapping = {
        'Thermal Preference':'thermalpref',
        'Thermal Acceptability':'thermalacc',
        'Air Movement Preference':'airpref',
    }

    output_path = f'data/bca_{name_mapping[target_col]}_train_{segment_condition.lower()}_random_data.csv'

    if sample_size < 1:
        # iloc[-0:] would silently keep the whole segment instead of sampling nothing
        raise ValueError(f'sample_size must be at least 1, got {sample_size}.')

    if os.path.exists(output_path):
        return None

    sampled_parts = []

    for user in data['User Id'].unique():
        user_data = data[data['User Id'] == user].reset_index(drop=True)

        for segment in user_data[segment_condition].unique():
            segmented_data = user_data[user_data[segment_condition] == segment]
            segmented_data = segmented_data.sample(frac=1).reset_index(drop=True)

            if len(segmented_data) < sample_size:
                continue

            # the segment was shuffled above, so its tail is a random sample
            sampled_parts.append(segmented_data.iloc[-sample_size:])

    if sampled_parts:
        # concatenate once rather than growing a dataframe inside the loop
        sampled_data = pd.concat(sampled_parts, ignore_index=True)
        sampled_data = sampled_data.sample(frac=1).reset_index(drop=True)
    else:
        sampled_data = data.iloc[0:0]

    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    # index=False matches the other CSV exports and avoids a spurious index column on reload
    sampled_data.to_csv(output_path, index=False)

    return None

In [11]:
# Sample target domain training data based on "Condition" for the thermal preference,
# thermal acceptability and air movement preference datasets (in that order).
for train_frame, target_name in (
    (thermalpref_train, 'Thermal Preference'),
    (thermalacc_train, 'Thermal Acceptability'),
    (airpref_train, 'Air Movement Preference'),
):
    sample_n_per_user(
        train_frame,
        target_col=target_name,
        segment_condition='Condition'
    )

## Randomly sample 1 labelled instance per mode for each participant from BCA dataset for training purposes

In [12]:
# Sample target domain training data based on "Mode" for the thermal preference,
# thermal acceptability and air movement preference datasets (in that order).
for train_frame, target_name in (
    (thermalpref_train, 'Thermal Preference'),
    (thermalacc_train, 'Thermal Acceptability'),
    (airpref_train, 'Air Movement Preference'),
):
    sample_n_per_user(
        train_frame,
        target_col=target_name,
        segment_condition='Mode'
    )

## Select 1 labelled instance per mode for each participant from BCA dataset using AL approaches for training purposes

In [13]:
# Preview the first rows of the thermal preference training split.
thermalpref_train.head()

Unnamed: 0,Mode,Indoor Temp,Indoor Humidity,Air Velocity,Globe Temperature,Outdoor Temp,Outdoor Humidity,Thermal Preference,User Id,Condition
0,NV,28.823541,75.06594,0.09,29.65,90.0,57.0,Cooler,15.0,16.0
1,NV,29.856638,75.988383,0.14,30.45,93.0,56.0,Cooler,39.0,16.0
2,AC,25.198411,67.889935,0.12,26.65,89.0,60.0,No Change,11.0,2.0
3,NV,28.704657,74.409585,0.12,29.65,92.0,55.0,Cooler,55.0,16.0
4,NV,29.341269,74.28266,0.25,29.95,92.0,56.0,Cooler,34.0,19.0


In [14]:
# Integer codes used to label-encode the categorical "Mode" column and each
# target column before the data is passed to the models.
mode_mapping = {'AC':0, 'NV':1}
thermalpref_mapping = {'No Change':0, 'Warmer':1, 'Cooler':2}
thermalacc_mapping = {'Acceptable':0, 'Unacceptable':1}
airpref_mapping = {'No Change':0, 'More':1, 'Less':2}

def _label_encode(frame, target_col, target_mapping):
    """Return a label-encoded copy of `frame` with a fresh 0..n-1 index; the input is left untouched."""
    encoded = frame.copy().reset_index(drop=True)
    encoded['Mode'] = encoded['Mode'].apply(lambda x: mode_mapping[x])
    encoded[target_col] = encoded[target_col].apply(lambda x: target_mapping[x])
    return encoded


def _fit_committee_member(X, y, ashrae_data, target_col, bca_pool, ashrae_pool):
    """Fit one XGBoost classifier on the given BCA rows combined with the given ASHRAE rows."""
    X_train = pd.concat(
        [X.iloc[bca_pool], ashrae_data.loc[ashrae_pool, args.input_features]],
        ignore_index=True
    )
    y_train = pd.concat(
        [y.iloc[bca_pool], ashrae_data.loc[ashrae_pool, target_col]],
        ignore_index=True
    )
    xgb_model = XGBClassifier(
        tree_method="hist",
        use_label_encoder=False,
        enable_categorical=True
    )
    xgb_model.fit(X=X_train, y=y_train)
    return xgb_model


def activelearning_sampling(bca_data, ashrae_data, target_col, bagging_fraction=0.8):
    """
    Applies the QBC algorithm to sample the most informative training instance from each user
    (one per "Mode" value of that user) for transfer learning.

    For every user/mode segment, `args.num_bagging` XGBoost models are trained on random
    subsets of the labelled pool (all ASHRAE rows plus the BCA rows selected so far). The
    candidate with the highest vote entropy across the models' predictions is added to the pool.

    Neither input dataframe is modified; label encoding is applied to internal copies.

    Parameters:
        bca_data: The dataframe containing the labelled training instances from BCA dataset.
        ashrae_data: The dataframe containing the labelled training instances from ASHRAE dataset.
        target_col: The name of the target column; one of 'Thermal Preference',
            'Thermal Acceptability' or 'Air Movement Preference'.
        bagging_fraction: The fraction of each labelled pool drawn (without replacement)
            to train each committee member (default 0.8).

    Returns:
        bca_labelled_pool: The list of positional row indices (into `bca_data`) of the
        training instances selected using AL for all users, in selection order.

    Raises:
        ValueError: If `target_col` is not supported.
        KeyError: If a "Mode" or target value has no entry in the label mappings.
    """
    target_mappings = {
        'Thermal Preference': thermalpref_mapping,
        'Thermal Acceptability': thermalacc_mapping,
        'Air Movement Preference': airpref_mapping,
    }
    if target_col not in target_mappings:
        raise ValueError(f'{target_col} is not supported.')

    # perform label encoding of categorical columns on copies so the caller's frames stay
    # intact; the index reset makes index labels and iloc positions interchangeable below
    bca_data = _label_encode(bca_data, target_col, target_mappings[target_col])
    ashrae_data = _label_encode(ashrae_data, target_col, target_mappings[target_col])

    # initialise selection strategy
    X = bca_data[args.input_features]
    y = bca_data[target_col]
    alibox = ToolBox(
        X=X,
        y=y,
        query_type='AllLabels'
    )
    selector = alibox.get_query_strategy(strategy_name='QueryInstanceQBC')

    # the BCA pool starts empty, whereas every ASHRAE row is treated as labelled
    bca_labelled_pool = []
    ashrae_labelled_pool = ashrae_data.index.tolist()

    # initialise AL loop
    for user_id in tqdm(bca_data['User Id'].unique()):
        user_data = bca_data[bca_data['User Id'] == user_id]

        for mode in user_data['Mode'].unique():
            mode_idx = user_data[user_data['Mode'] == mode].index.tolist()
            X_candidates = X.iloc[mode_idx]

            # collect the predictions of models trained on different subsets of labelled pool
            committee_votes = []
            for _ in range(args.num_bagging):
                bca_sampled_pool = random.sample(
                    bca_labelled_pool, int(len(bca_labelled_pool) * bagging_fraction)
                )
                ashrae_sampled_pool = random.sample(
                    ashrae_labelled_pool, int(len(ashrae_labelled_pool) * bagging_fraction)
                )

                try:
                    xgb_model = _fit_committee_member(
                        X, y, ashrae_data, target_col, bca_sampled_pool, ashrae_sampled_pool
                    )
                except Exception:
                    # Fall back to the full BCA pool when fitting on the subsample fails
                    # (presumably because the subsample misses a class - TODO confirm).
                    # `Exception` rather than a bare except, so Ctrl-C still interrupts the run.
                    xgb_model = _fit_committee_member(
                        X, y, ashrae_data, target_col, bca_labelled_pool, ashrae_sampled_pool
                    )

                committee_votes.append(xgb_model.predict(X_candidates))

            # pick the candidate on which the committee disagrees the most
            vote_entropy = selector.calc_vote_entropy(committee_votes)
            selected_idx = mode_idx[vote_entropy.index(max(vote_entropy))]

            # update labelled pool
            bca_labelled_pool.append(selected_idx)

    return bca_labelled_pool
    

In [15]:
# Display the first rows of the thermal preference training split again (same call as in the cell before the AL function definition).
thermalpref_train.head()

Unnamed: 0,Mode,Indoor Temp,Indoor Humidity,Air Velocity,Globe Temperature,Outdoor Temp,Outdoor Humidity,Thermal Preference,User Id,Condition
0,NV,28.823541,75.06594,0.09,29.65,90.0,57.0,Cooler,15.0,16.0
1,NV,29.856638,75.988383,0.14,30.45,93.0,56.0,Cooler,39.0,16.0
2,AC,25.198411,67.889935,0.12,26.65,89.0,60.0,No Change,11.0,2.0
3,NV,28.704657,74.409585,0.12,29.65,92.0,55.0,Cooler,55.0,16.0
4,NV,29.341269,74.28266,0.25,29.95,92.0,56.0,Cooler,34.0,19.0


In [16]:
# Each block below is skipped when its output CSV already exists, because the AL
# selection is slow. Both dataframes are passed as deep copies so that the sampling
# routine cannot alter the frames held by this notebook.

# sample target domain training data using AL for thermal preference dataset
if not os.path.exists('data/bca_thermalpref_train_mode_al_data.csv'):
    thermalpref_train = thermalpref_train.reset_index(drop=True)
    thermalpref_sample_idx = activelearning_sampling(
        deepcopy(thermalpref_train),
        deepcopy(ashrae_thermalpref),
        target_col='Thermal Preference'
    )
    thermalpref_sample_al = thermalpref_train.iloc[thermalpref_sample_idx]
    thermalpref_sample_al.to_csv('data/bca_thermalpref_train_mode_al_data.csv', index=False)

# sample target domain training data using AL for thermal acceptability dataset
if not os.path.exists('data/bca_thermalacc_train_mode_al_data.csv'):
    thermalacc_train = thermalacc_train.reset_index(drop=True)
    thermalacc_sample_idx = activelearning_sampling(
        deepcopy(thermalacc_train),
        deepcopy(ashrae_thermalacc),
        target_col='Thermal Acceptability'
    )
    thermalacc_sample_al = thermalacc_train.iloc[thermalacc_sample_idx]
    thermalacc_sample_al.to_csv('data/bca_thermalacc_train_mode_al_data.csv', index=False)

# sample target domain training data using AL for air movement preference dataset
if not os.path.exists('data/bca_airpref_train_mode_al_data.csv'):
    airpref_train = airpref_train.reset_index(drop=True)
    airpref_sample_idx = activelearning_sampling(
        deepcopy(airpref_train),
        deepcopy(ashrae_airpref),
        target_col='Air Movement Preference'
    )
    airpref_sample_al = airpref_train.iloc[airpref_sample_idx]
    airpref_sample_al.to_csv('data/bca_airpref_train_mode_al_data.csv', index=False)

100%|██████████| 57/57 [11:46<00:00, 12.39s/it]
100%|██████████| 57/57 [03:31<00:00,  3.71s/it]
100%|██████████| 57/57 [08:04<00:00,  8.50s/it]
