In [12]:
import pickle
import pycaret
import pandas as pd
import seaborn as sns
from pycaret.classification import *

-----------------------------
# functions

In [13]:
def train_test_split_per_user(data, train_size=0.7):
    users = list(set(data.id))
    users = sorted(users, reverse=True)  # fix randomness
    total_users = len(users)
    slice = int(train_size * total_users)
    users_train = users[:slice]
    users_test = users[slice:]
    return data[data.id.isin(users_train)], data[data.id.isin(users_test)]

# Apply Recursive Feature Elimination and return the new dataframes
def feature_elimination(x_train_df, x_test_df, y_train_df):
    # Convert numeric to float
    x_train_df = x_train_df.astype(np.float_)
    x_test_df = x_test_df.astype(np.float_)

    # RFE
    estimator = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, 
                                           random_state=0)
    selector = RFE(estimator, n_features_to_select=10, step=1)
    selector = selector.fit(x_train_df, y_train_df)
    x_train_df = selector.transform(x_train_df)
    x_test_df = selector.transform(x_test_df)
    
    # rename columns
    x_train_df = pd.DataFrame(x_train_df)
    x_train_df.columns = selector.get_feature_names_out()
    x_test_df = pd.DataFrame(x_test_df)
    x_test_df.columns = selector.get_feature_names_out()

    return x_train_df, x_test_df

-----------------
# Predicting semas negative feelings

In [14]:
df=pd.read_pickle('../data/unified_dataframe/data_unprocessed.pkl')
df.head()

Unnamed: 0,id,date,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,stress_score,sleep_points_percentage,exertion_points_percentage,responsiveness_points_percentage,daily_temperature_variation,badgeType,calories,filteredDemographicVO2Max,distance,activityType,bpm,lightly_active_minutes,moderately_active_minutes,very_active_minutes,sedentary_minutes,mindfulness_session,scl_avg,resting_hr,sleep_duration,minutesToFallAsleep,minutesAsleep,minutesAwake,minutesAfterWakeup,sleep_efficiency,sleep_deep_ratio,sleep_wake_ratio,sleep_light_ratio,sleep_rem_ratio,steps,minutes_in_default_zone_1,minutes_below_default_zone_1,minutes_in_default_zone_2,minutes_in_default_zone_3,age,gender,bmi,heart_rate_alert,startTime,endTime,label_ttm_stage,label_breq_self_determination,label_sema_negative_feelings,label_ipip_extraversion_category,label_ipip_agreeableness_category,label_ipip_conscientiousness_category,label_ipip_stability_category,label_ipip_intellect_category,label_stai_stress_category,label_panas_negative_affect
0,621e2e8e67b776a24055b564,2021-05-24,34.137687,57.432,89.603,,14.8,78.0,0.833333,0.675,0.866667,-1.788325,,2351.59,62.7921,6517.5,[Walk],71.701565,149.0,24.0,33.0,713.0,False,,62.07307,31260000.0,0.0,445.0,76.0,0.0,93.0,1.243243,0.987013,0.921642,1.341772,8833.0,83.0,1349.0,0.0,0.0,<30,MALE,<19,,2021-05-24T00:40:00.000,2021-05-24T09:21:00.000,Maintenance,intrinsic_regulation,,LOW,LOW,HIGH,HIGH,AVERAGE,,
1,621e328667b776a240281372,2021-05-24,,,,,,,,,,,,2619.85,41.38447,6568.9,[Walk],83.919698,132.0,7.0,30.0,1271.0,False,,,,,,,,,,,,,8550.0,278.0,766.0,29.0,1.0,>=30,MALE,>=30,,,,Maintenance,intrinsic_regulation,,HIGH,AVERAGE,HIGH,LOW,LOW,,
2,621e326767b776a24012e179,2021-05-24,,46.12,53.968,95.3,14.4,80.0,0.666667,0.925,0.766667,-4.129593,,2204.5,52.76058,11283.7,[Walk],68.275766,278.0,41.0,67.0,616.0,False,,52.516339,,,,,,,,,,,16992.0,131.0,1225.0,0.0,0.0,<30,FEMALE,<19,,,,Maintenance,,,,,,,,,
3,621e332267b776a24092a584,2021-05-24,,,,,,,,,,,,3792.13,53.53183,16776.8,[Walk],78.842893,190.0,79.0,108.0,1063.0,False,,60.977217,,,,,,,,,,,21284.0,175.0,1130.0,9.0,0.0,<30,MALE,21.0,,,,Maintenance,identified_regulation,,LOW,LOW,LOW,AVERAGE,HIGH,,
4,621e333567b776a240a0c217,2021-05-24,,,,,,,,,,,[LIFETIME_DISTANCE],,,,,,,,,,False,,,,,,,,,,,,,,,,,,<30,MALE,21.0,,,,Contemplation,intrinsic_regulation,,AVERAGE,HIGH,AVERAGE,LOW,LOW,,


In [15]:
df.columns

Index(['id', 'date', 'nightly_temperature', 'nremhr', 'rmssd', 'spo2',
       'full_sleep_breathing_rate', 'stress_score', 'sleep_points_percentage',
       'exertion_points_percentage', 'responsiveness_points_percentage',
       'daily_temperature_variation', 'badgeType', 'calories',
       'filteredDemographicVO2Max', 'distance', 'activityType', 'bpm',
       'lightly_active_minutes', 'moderately_active_minutes',
       'very_active_minutes', 'sedentary_minutes', 'mindfulness_session',
       'scl_avg', 'resting_hr', 'sleep_duration', 'minutesToFallAsleep',
       'minutesAsleep', 'minutesAwake', 'minutesAfterWakeup',
       'sleep_efficiency', 'sleep_deep_ratio', 'sleep_wake_ratio',
       'sleep_light_ratio', 'sleep_rem_ratio', 'steps',
       'minutes_in_default_zone_1', 'minutes_below_default_zone_1',
       'minutes_in_default_zone_2', 'minutes_in_default_zone_3', 'age',
       'gender', 'bmi', 'heart_rate_alert', 'startTime', 'endTime',
       'label_ttm_stage', 'label_breq_sel

In [16]:
sema = df.drop(columns=['date','startTime', 'endTime', 'label_ttm_stage', 
                        'label_breq_self_determination','label_ipip_extraversion_category',
                        'label_ipip_agreeableness_category',
                        'label_ipip_conscientiousness_category',
                        'label_ipip_stability_category', 'label_ipip_intellect_category',
                        'label_stai_stress_category', 'label_panas_negative_affect'])

In [17]:
train_data, test_data = train_test_split_per_user(sema)

In [18]:
fold_groups = train_data.id
train_data = train_data.drop(columns=['id'])
test_data = test_data.drop(columns=['id'])

In [19]:
s = setup(data=train_data, target='label_sema_negative_feelings', silent=True, session_id=123, 
          fold_strategy='groupkfold', fold=3,  fold_groups=fold_groups, test_data=test_data)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,label_sema_negative_feelings
2,Target Type,Binary
3,Label Encoded,"0.0: 0, 1.0: 1"
4,Original Data,"(5199, 43)"
5,Missing Values,True
6,Numeric Features,33
7,Categorical Features,9
8,Ordinal Features,False
9,High Cardinality Features,False


In [20]:
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.5459,0.5164,0.3278,0.5017,0.3808,0.0603,0.0645,0.54
lr,Logistic Regression,0.5456,0.5204,0.0106,0.1667,0.0199,0.0023,0.0061,1.4167
dummy,Dummy Classifier,0.5456,0.5,0.0,0.0,0.0,0.0,0.0,0.0533
lightgbm,Light Gradient Boosting Machine,0.5282,0.5099,0.3588,0.4741,0.4022,0.0298,0.0305,0.5533
rf,Random Forest Classifier,0.5185,0.5034,0.3982,0.4678,0.4236,0.0224,0.0231,0.24
knn,K Neighbors Classifier,0.514,0.5155,0.4433,0.4629,0.4509,0.0181,0.0179,0.9233
ada,Ada Boost Classifier,0.5105,0.495,0.3051,0.4424,0.3523,-0.011,-0.0127,0.1967
svm,SVM - Linear Kernel,0.5099,0.0,0.3333,0.1488,0.2058,0.0,0.0,0.09
ridge,Ridge Classifier,0.5087,0.0,0.3729,0.4527,0.4062,-0.0049,-0.0046,0.06
nb,Naive Bayes,0.5084,0.5266,0.5546,0.4666,0.5039,0.026,0.0268,1.03


In [21]:
df=pd.read_pickle('../data/unified_dataframe/data_preprocessed.pkl')

sema = df.drop(columns=['date','startTime', 'endTime', 'label_ttm_stage', 
                        'label_breq_self_determination','label_ipip_extraversion_category',
                        'label_ipip_agreeableness_category',
                        'label_ipip_conscientiousness_category',
                        'label_ipip_stability_category', 'label_ipip_intellect_category',
                        'label_stai_stress_category', 'label_panas_negative_affect'])

train_data, test_data = train_test_split_per_user(sema)

fold_groups = train_data.id
train_data = train_data.drop(columns=['id'])
test_data = test_data.drop(columns=['id'])

In [22]:
s = setup(data=train_data, target='label_sema_negative_feelings', silent=True, session_id=123, 
          fold_strategy='groupkfold', fold=3,  fold_groups=fold_groups, test_data=test_data)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,label_sema_negative_feelings
2,Target Type,Binary
3,Label Encoded,"0.0: 0, 1.0: 1"
4,Original Data,"(5199, 74)"
5,Missing Values,True
6,Numeric Features,44
7,Categorical Features,29
8,Ordinal Features,False
9,High Cardinality Features,False


In [23]:
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.5456,0.5235,0.0,0.0,0.0,0.0,0.0,0.02
dummy,Dummy Classifier,0.5456,0.5,0.0,0.0,0.0,0.0,0.0,0.0133
dt,Decision Tree Classifier,0.53,0.5258,0.4331,0.478,0.4487,0.0437,0.0442,0.0333
knn,K Neighbors Classifier,0.5229,0.5222,0.4484,0.4727,0.4592,0.0345,0.0343,0.0567
gbc,Gradient Boosting Classifier,0.5208,0.5118,0.4208,0.4782,0.4343,0.0267,0.0293,0.59
nb,Naive Bayes,0.5187,0.5318,0.4585,0.4734,0.4605,0.0237,0.0246,0.02
rf,Random Forest Classifier,0.5176,0.5041,0.3526,0.4662,0.3898,0.0161,0.0172,0.2133
et,Extra Trees Classifier,0.5102,0.4951,0.2759,0.4485,0.3335,-0.0131,-0.0119,0.1533
svm,SVM - Linear Kernel,0.5078,0.0,0.6667,0.3144,0.4271,0.0,0.0,0.0367
lightgbm,Light Gradient Boosting Machine,0.5063,0.5056,0.4438,0.4582,0.4399,0.0062,0.008,0.2967


When remove_multicollinearity is set to True, the variables with inter-correlations higher than the threshold defined under the multicollinearity_threshold param are dropped. When two features are highly correlated with each other, the feature that is less correlated with the target variable is dropped.

In [24]:
s = setup(data=train_data, target='label_sema_negative_feelings', silent=True, session_id=123, 
          fold_strategy='groupkfold', fold=3,  fold_groups=fold_groups, test_data=test_data, 
          normalize=True, fix_imbalance=True, transformation=True, remove_multicollinearity = True,
          multicollinearity_threshold = 0.6)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,label_sema_negative_feelings
2,Target Type,Binary
3,Label Encoded,"0.0: 0, 1.0: 1"
4,Original Data,"(5199, 74)"
5,Missing Values,True
6,Numeric Features,44
7,Categorical Features,29
8,Ordinal Features,False
9,High Cardinality Features,False


In [25]:
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dummy,Dummy Classifier,0.5456,0.5,0.0,0.0,0.0,0.0,0.0,0.0567
gbc,Gradient Boosting Classifier,0.5297,0.5338,0.49,0.4867,0.4767,0.0522,0.0543,0.4567
rf,Random Forest Classifier,0.5217,0.5156,0.4574,0.4767,0.4608,0.0383,0.0392,0.2233
et,Extra Trees Classifier,0.5131,0.5213,0.454,0.4635,0.4545,0.0203,0.0208,0.2167
lightgbm,Light Gradient Boosting Machine,0.5125,0.5057,0.4946,0.4669,0.4746,0.0219,0.0224,0.33
svm,SVM - Linear Kernel,0.5084,0.0,0.5496,0.4653,0.5024,0.0211,0.0211,0.0867
dt,Decision Tree Classifier,0.5052,0.4977,0.4406,0.4549,0.4466,0.001,0.001,0.09
ada,Ada Boost Classifier,0.5022,0.4989,0.5395,0.4603,0.4804,0.014,0.0135,0.1833
knn,K Neighbors Classifier,0.4996,0.508,0.5306,0.4568,0.4903,0.0039,0.0037,0.2533
lr,Logistic Regression,0.4919,0.5012,0.5306,0.4496,0.4678,0.0006,0.0017,0.0767


In [33]:
df=pd.read_pickle('../data/unified_dataframe/data_features.pkl')

In [33]:
df['bmi'].replace(to_replace=['Normal', 'Overweight', 'Underweight', 'Obese'], value=[0, 1, 2, 3], 
                  inplace=True)

sema = df.drop(columns=['date','startTime', 'endTime', 'label_ttm_stage', 
                        'label_breq_self_determination','label_ipip_extraversion_category',
                        'label_ipip_agreeableness_category',
                        'label_ipip_conscientiousness_category',
                        'label_ipip_stability_category', 'label_ipip_intellect_category',
                        'label_stai_stress_category', 'label_panas_negative_affect'])

train_data, test_data = train_test_split_per_user(sema)

fold_groups = train_data.id
train_data = train_data.drop(columns=['id'])
test_data = test_data.drop(columns=['id'])

In [36]:
from sklearn.ensemble import ExtraTreesClassifier

x = train_data.drop(columns='label_sema_negative_feelings')
y = train_data['label_sema_negative_feelings']
model = ExtraTreesClassifier()
model.fit(x,y)
print('Model feature importances')
print(model.feature_importances_) 
feat_importances = pd.Series(model.feature_importances_, index=x.columns)
feat_importances.nlargest(12).plot(kind='barh')
plt.show()

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [31]:
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt

# Load the digits dataset
X = train_data.drop(columns='label_sema_negative_feelings')
y = train_data['label_sema_negative_feelings']

In [32]:
# Create the RFE object and rank each pixel
svc = SVC(kernel="linear", C=1)
rfe = RFE(estimator=svc, n_features_to_select=1, step=1)
rfe.fit(X, y)
ranking = rfe.ranking_.reshape(digits.images[0].shape)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

**feature_selection:** bool, default = False
When set to True, a subset of features are selected using a combination of various permutation importance techniques including Random Forest, Adaboost and Linear correlation with target variable. The size of the subset is dependent on the feature_selection_param. Generally, this is used to constrain the feature space in order to improve efficiency in modeling. When polynomial_features and feature_interaction are used, it is highly recommended to define the feature_selection_threshold param with a lower value.

**feature_selection_threshold:** float, default = 0.8
Threshold used for feature selection (including newly created polynomial features). A higher value will result in a higher feature space. It is recommended to do multiple trials with different values of feature_selection_threshold specially in cases where polynomial_features and feature_interaction are used. Setting a very low value may be efficient but could result in under-fitting.

In [None]:
s = setup(data=train_data, target='label_sema_negative_feelings', silent=True, session_id=123, 
          fold_strategy='groupkfold', fold=3,  fold_groups=fold_groups, test_data=test_data, 
          normalize=True, fix_imbalance=True, transformation=True, remove_multicollinearity = True,
          multicollinearity_threshold = 0.6, feature_selection=True, 
          feature_selection_threshold=0.1)