In [1]:
# Pickle, numpy, pandas
import pickle
import numpy as np
import pandas as pd

# Sklearn
from sklearn import linear_model 
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder


# Loading and Cleaning

In [2]:
# Load the 80-participant dataset (the earlier 65-participant file was
# "../datasets/TCS_65_participants_outsideData.csv").
# Columns we won't be using for now; drop() raises KeyError if any is missing,
# exactly like the individual `del` statements it replaces.
unused_columns = [
    'timestamp',
    'Day',
    'Time Broken',
    'Day_Time',
    'Text(Day_Time)',
    'Total Seconds',
    'Diff Seconds',
    'Temperature',
    'ThermalComfort',
    'TopClothing',
    'BottomClothing',
    'OuterLayerClothing',
    'ActivityDescription',
    'Thermal Comfort TA',
]

df = (
    pd.read_csv("../datasets/TCS_80_participants_outsideData.csv", delimiter=",")
    .drop(columns=unused_columns)
    # reset_index returns a new frame; the result must be kept (the previous
    # bare `df.reset_index()` call was silently discarded).
    .reset_index(drop=True)
    # Missing values become 0; later cells treat 0 as "missing" again.
    .fillna(0)
)


In [3]:
as_many_participants = True

if as_many_participants:
    # Participants removed entirely:
    #    9 - no variability in zone temperature (error in room thermostat)
    #   13 - no zone temperature
    #   38 - no subject exists, the number was skipped
    # Participants without wearable data (no skin temperature) are kept; their
    # skin temperature is filled in by the heuristic further down.
    # .copy() makes the filtered frame independent so the .loc writes below
    # operate on a real frame rather than on a slice of the loaded data.
    df = df[~df['Participant_No'].isin([9, 13, 38])].copy()

    # randomize dataset (survey rows only)
    # NOTE(review): no random seed is set, so the shuffle and the heuristic
    # noise below differ between runs.
    df_lm = df[df['class'] == 'SurveyData']
    df_lm = df_lm.sample(frac=1).reset_index(drop=True)

    # encode categorical variables
    for col in df_lm.columns.values:
        le = LabelEncoder()
        if df_lm[col].dtypes == 'object':
            df_lm[col] = df_lm[col].astype(str)
            le.fit(df_lm[col])
            df_lm[col] = le.transform(df_lm[col])

    df_lm = df_lm.drop(columns=['Participant_No', 'class', 'MinuteCalories'])

    # Every column except the last is used as a feature; the last column is
    # expected to be the comfort label.
    X = np.array(df_lm.iloc[:, 0:df_lm.shape[1] - 1])
    # scale data (computed for optional use; the model below is fit on raw X)
    scaled_X = preprocessing.scale(X)

    y = np.array(df_lm['Discrete Thermal Comfort_TA'])

    lm = linear_model.LinearRegression()
    lm.fit(X, y)

    # zip() stops at the shorter input, so the trailing label column gets no coefficient.
    df_lm_coef = pd.DataFrame(list(zip(df_lm.columns, lm.coef_)),
                              columns=['features', 'estimatedCoefficients'])

    # Heuristic: skin temperature = room temperature + k
    # Zeros stand for missing values (see the earlier fillna(0)), so they are
    # turned back into NaN before estimating k. np.nan is used because the
    # np.NaN alias no longer exists in NumPy 2.0.
    heuristic_df = df.replace(0, np.nan)
    skin_minus_room = heuristic_df['SkinTemperature'] - heuristic_df['Temperature (Fahrenheit)']
    k_mean = skin_minus_room.mean()
    k_std = skin_minus_room.std()

    # for each participant that doesn't have skin temperature, calculate the heuristic
    for p in [10, 21, 26, 27, 28, 35, 36, 37, 39, 47, 48, 53, 60, 64, 73]:
        # filter current participant
        curr_p = df[df['Participant_No'] == p]

        # skin temp = room temp + k, with one independent draw of k per row
        # (vectorized; avoids building 1-element arrays row by row).
        heuristic_skin_temp = (curr_p['Temperature (Fahrenheit)']
                               + np.random.normal(k_mean, k_std, len(curr_p)))

        # update the values in the dataframe
        df.loc[curr_p.index, 'SkinTemperature'] = heuristic_skin_temp

    # Write the imputed dataset once, after all participants are updated.
    df.to_csv("../datasets/TCS_80_participants_outsideData_heuristicST.csv")

num_participants = df['Participant_No'].unique()
print("\n")
print("Number of participants: {}".format(len(num_participants)))




Number of participants: 77


In [4]:
# Show the fitted regression coefficients, largest first
df_lm_coef.sort_values(by='estimatedCoefficients', ascending=False)
    

Unnamed: 0,features,estimatedCoefficients
0,Temperature (Fahrenheit),0.08507353
6,Shoulder Circumference(cm),0.01276668
4,Activity,0.01196414
10,Humidity_outside,0.001863464
9,Temperature_outside,0.0002291761
1,Gsr,6.064483e-07
5,Height(cm),-0.0004656155
7,Weight(lbs),-0.002106426
2,SkinTemperature,-0.002836217
8,Gender,-0.05259315


In [None]:
# Number of distinct participants within each gender group
print(df.groupby('Gender')['Participant_No'].agg('nunique'))
print("\n")
# The label distribution can be inspected with:
# print(df['Discrete Thermal Comfort_TA'].value_counts())


# Create band and survey datasets

In [None]:
# Split the data into two datasets by instance type; the 'class' column is
# dropped from each because it is constant within a split.
df_survey = df[df['class'] == 'SurveyData'].drop(columns='class')
df_band = df[df['class'] == 'BandData'].drop(columns='class')

print("Band dataset size: {}".format(df_band.shape))
print("Survey dataset size: {}".format(df_survey.shape))



def f_to_c(x):
    """Convert a temperature from degrees Fahrenheit to degrees Celsius.

    Returns the converted value as a Python float.
    """
    return float((x - 32) * 5.0 / 9.0)
# Descriptive statistics of the survey data, with Celsius versions of the
# three temperature columns added alongside the original ones.
df_description = df_survey.assign(
    SkinT_C=df_survey['SkinTemperature'].apply(f_to_c),
    Temperature_C=df_survey['Temperature (Fahrenheit)'].apply(f_to_c),
    Temperature_outside_C=df_survey['Temperature_outside'].apply(f_to_c),
)

print(df_description.describe(include='all'))

# Distribution of the comfort label among survey rows
print(df_survey['Discrete Thermal Comfort_TA'].value_counts())


# Feature Selection

In [None]:
df_encoded = df_survey.copy()

# Label-encode every object-typed column; all other columns are left untouched.
for col in df_encoded.columns:
    le = LabelEncoder()
    if df_encoded[col].dtypes != 'object':
        continue
    # Values are cast to str first so mixed-type columns encode consistently.
    df_encoded[col] = le.fit_transform(df_encoded[col].astype(str))


In [None]:
# Feature set 1: the encoded survey data without Gsr, Activity and MinuteCalories
df_feature1 = df_encoded.drop(columns=['Gsr', 'Activity', 'MinuteCalories'])

# The last column is the comfort label, so it is excluded from the listing and the count.
print("Feature Set 1: ", df_feature1.columns.values[:-1])
print( "Number of features in this set: ", df_feature1.shape[1] - 1)


In [None]:
# Feature set 2: feature set 1 without the body measurements
df_feature2 = df_feature1.drop(
    columns=['Height(cm)', 'Shoulder Circumference(cm)', 'Weight(lbs)']
)

# The last column is the comfort label, so it is excluded from the listing and the count.
print("Feature Set 2: ", df_feature2.columns.values[:-1])
print( "Number of features in this set: ", df_feature2.shape[1] - 1)


In [None]:
# Feature set 3: feature set 1 without skin temperature, clothing insulation and gender
df_feature3 = df_feature1.drop(
    columns=['SkinTemperature', 'ClothingInsulation', 'Gender']
)

# The last column is the comfort label, so it is excluded from the listing and the count.
print("Feature Set 3: ", df_feature3.columns.values[:-1])
print( "Number of features in this set: ", df_feature3.shape[1] - 1)


In [None]:
# Feature set 4: feature set 3 without the body measurements
df_feature4 = df_feature3.drop(
    columns=['Height(cm)', 'Shoulder Circumference(cm)', 'Weight(lbs)']
)

# The last column is the comfort label, so it is excluded from the listing and the count.
print("Feature Set 4: ", df_feature4.columns.values[:-1])
print( "Number of features in this set: ", df_feature4.shape[1] - 1)


In [None]:
# Feature set 5: feature set 4 without the outside weather columns
df_feature5 =  df_feature4.copy()
del df_feature5['Temperature_outside']
del df_feature5['Humidity_outside']

# The printed label now matches this set (it previously read "Feature Set 3").
print("Feature Set 5: ", df_feature5.columns.values[:-1]) # minus the last element, the comfort label
print( "Number of features in this set: ", df_feature5.shape[1] - 1) # minus 1 for the comfort label


# Save dataframes as pickle

In [None]:
# Persist every feature set as df_feature<N>.pkl in the working directory,
# using pandas' own pickling rather than the pickle module directly.
dataframes = [df_feature1, df_feature2, df_feature3, df_feature4, df_feature5]

for i, frame in enumerate(dataframes, start=1):
    fileName = "df_feature" + str(i) + ".pkl"
    frame.to_pickle(fileName)
