In [None]:
import pickle
import pycaret
import pandas as pd
import seaborn as sns
from pycaret.classification import *
from dataprep.eda import create_report

-----------------------------
# functions

In [None]:
def train_test_split_per_user(data, train_size=0.7):
    """Split ``data`` into train and test frames so that no user appears in both.

    The distinct values of the ``id`` column are sorted in descending order,
    which makes the split deterministic. The first
    ``int(train_size * n_users)`` users form the training set and the
    remaining users form the test set.

    Args:
        data: DataFrame with an ``id`` column identifying the user of each row.
        train_size: Fraction of users (not rows) assigned to the training set;
            must lie in ``[0, 1]``.

    Returns:
        A ``(train, test)`` tuple of DataFrames holding the rows of the
        training users and of the test users, in their original row order.

    Raises:
        ValueError: If ``train_size`` is outside ``[0, 1]``.
    """
    if not 0 <= train_size <= 1:
        raise ValueError(f"train_size must be between 0 and 1, got {train_size!r}")

    user_ids = data["id"]
    # Sorting the distinct ids fixes their order, so the split is reproducible.
    users = sorted(user_ids.unique(), reverse=True)
    # Named n_train (not `slice`) so the builtin is not shadowed.
    n_train = int(train_size * len(users))
    users_train, users_test = users[:n_train], users[n_train:]

    return data[user_ids.isin(users_train)], data[user_ids.isin(users_test)]

# Apply Recursive Feature Elimination and return the new dataframes
def feature_elimination(x_train_df, x_test_df, y_train_df, n_features_to_select=10):
    """Select features with RFE fitted on the training data only.

    A gradient-boosting classifier drives scikit-learn's recursive feature
    elimination, removing one feature per step until ``n_features_to_select``
    remain. The selector is fitted on the training frame and then applied to
    both frames, so the test data never influences the selection.

    Args:
        x_train_df: Training features; every column must be castable to float.
        x_test_df: Test features with the same columns as ``x_train_df``.
        y_train_df: Training target aligned with ``x_train_df``.
        n_features_to_select: Number of features to keep (default 10).

    Returns:
        A ``(x_train, x_test)`` tuple of DataFrames restricted to the selected
        columns. Columns are named after the selected features and each frame
        keeps the row index of its input, so rows stay aligned with the target.
    """
    # Imported locally: the notebook's import cell does not provide these names.
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.feature_selection import RFE

    # Builtin float is float64. The former np.float_ alias was removed in
    # NumPy 2.0, and numpy is not imported by this notebook anyway.
    x_train_df = x_train_df.astype(float)
    x_test_df = x_test_df.astype(float)

    # RFE
    estimator = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1,
                                           random_state=0)
    selector = RFE(estimator, n_features_to_select=n_features_to_select, step=1)
    selector = selector.fit(x_train_df, y_train_df)
    selected_names = selector.get_feature_names_out()

    # Rebuild labelled frames. Passing the original index keeps each row
    # aligned with its target value instead of resetting to 0..n-1.
    x_train_selected = pd.DataFrame(selector.transform(x_train_df),
                                    columns=selected_names, index=x_train_df.index)
    x_test_selected = pd.DataFrame(selector.transform(x_test_df),
                                   columns=selected_names, index=x_test_df.index)

    return x_train_selected, x_test_selected

-----------------
# Predicting SEMA negative feelings

In [None]:
# Load the unprocessed unified dataframe and preview its first rows.
# NOTE: unpickling can execute arbitrary code; only load pickles from a trusted source.
unprocessed_pickle = '../data/unified_dataframe/data_unprocessed.pkl'
df = pd.read_pickle(unprocessed_pickle)
df.head()

In [None]:
# List the available columns (features, identifiers and label_* targets)
df.columns

In [None]:
# Remove the date/time columns and the label columns listed below, which are
# not the target of this section (label_sema_negative_feelings).
dropped_columns = [
    'date',
    'startTime',
    'endTime',
    'label_ttm_stage',
    'label_breq_self_determination',
    'label_ipip_extraversion_category',
    'label_ipip_agreeableness_category',
    'label_ipip_conscientiousness_category',
    'label_ipip_stability_category',
    'label_ipip_intellect_category',
    'label_stai_stress_category',
    'label_panas_negative_affect',
]
sema = df.drop(columns=dropped_columns)

In [None]:
# Split by user (default 70% of users for training) so no user is in both sets
train_data, test_data = train_test_split_per_user(sema)

In [None]:
# Keep the user id of every training row as the cross-validation grouping key,
# then remove the identifier from both frames so it is not used as a feature.
fold_groups = train_data['id']
train_data = train_data.drop('id', axis=1)
test_data = test_data.drop('id', axis=1)

In [None]:
# Baseline PyCaret experiment: no extra preprocessing options, with 3-fold
# group k-fold cross-validation grouped by the user ids in fold_groups.
s = setup(
    data=train_data,
    test_data=test_data,
    target='label_sema_negative_feelings',
    fold_strategy='groupkfold',
    fold_groups=fold_groups,
    fold=3,
    session_id=123,  # fixed seed for reproducibility
    silent=True,
)

In [None]:
# Compare PyCaret's models under the experiment configured by the preceding
# setup() call and keep the top-ranked model
best = compare_models()

In [None]:
# Repeat the data preparation on the preprocessed dataframe.
# NOTE: unpickling can execute arbitrary code; only load pickles from a trusted source.
preprocessed_pickle = '../data/unified_dataframe/data_preprocessed.pkl'
df = pd.read_pickle(preprocessed_pickle)

# Date/time columns and the label columns that are not this section's target
dropped_columns = [
    'date',
    'startTime',
    'endTime',
    'label_ttm_stage',
    'label_breq_self_determination',
    'label_ipip_extraversion_category',
    'label_ipip_agreeableness_category',
    'label_ipip_conscientiousness_category',
    'label_ipip_stability_category',
    'label_ipip_intellect_category',
    'label_stai_stress_category',
    'label_panas_negative_affect',
]
sema = df.drop(columns=dropped_columns)

# Per-user split, so no user contributes rows to both sets
train_data, test_data = train_test_split_per_user(sema)

# User ids become the CV grouping key; the id column itself is not a feature
fold_groups = train_data['id']
train_data = train_data.drop('id', axis=1)
test_data = test_data.drop('id', axis=1)

In [None]:
# Same baseline configuration as before, now applied to whatever train/test
# frames are currently in scope: 3-fold group k-fold keyed by user id.
s = setup(
    data=train_data,
    test_data=test_data,
    target='label_sema_negative_feelings',
    fold_strategy='groupkfold',
    fold_groups=fold_groups,
    fold=3,
    session_id=123,  # fixed seed for reproducibility
    silent=True,
)

In [None]:
# Compare PyCaret's models under the experiment configured by the preceding
# setup() call and keep the top-ranked model
best = compare_models()

When remove_multicollinearity is set to True, the variables with inter-correlations higher than the threshold defined under the multicollinearity_threshold param are dropped. When two features are highly correlated with each other, the feature that is less correlated with the target variable is dropped.

In [None]:
# Experiment with PyCaret preprocessing switched on: normalization, class
# imbalance fixing, feature transformation, and removal of multicollinear
# features above a 0.6 inter-correlation threshold.
s = setup(
    data=train_data,
    test_data=test_data,
    target='label_sema_negative_feelings',
    # cross-validation: 3 group folds keyed by user id
    fold_strategy='groupkfold',
    fold_groups=fold_groups,
    fold=3,
    # preprocessing
    normalize=True,
    transformation=True,
    fix_imbalance=True,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.6,
    # run control
    session_id=123,
    silent=True,
)

In [None]:
# Compare PyCaret's models under the experiment configured by the preceding
# setup() call and keep the top-ranked model
best = compare_models()

In [None]:
# Load the feature-engineered dataframe (replaces the previously loaded df).
# NOTE: unpickling can execute arbitrary code; only load pickles from a trusted source.
df=pd.read_pickle('../data/unified_dataframe/data_features.pkl')

In [None]:
# Encode the BMI category as an integer code.
# The result is assigned back explicitly. The previous chained
# `df['bmi'].replace(..., inplace=True)` acted on a column selection, which
# raises a FutureWarning on recent pandas and leaves df unchanged under
# Copy-on-Write.
# NOTE(review): the codes are not ordinal (Underweight=2 sits between
# Overweight and Obese); kept as-is to preserve the existing encoding.
bmi_codes = {'Normal': 0, 'Overweight': 1, 'Underweight': 2, 'Obese': 3}
df['bmi'] = df['bmi'].replace(bmi_codes)

# Remove the date/time columns and the label columns that are not the target
# of this section (label_sema_negative_feelings).
sema = df.drop(columns=['date','startTime', 'endTime', 'label_ttm_stage', 
                        'label_breq_self_determination','label_ipip_extraversion_category',
                        'label_ipip_agreeableness_category',
                        'label_ipip_conscientiousness_category',
                        'label_ipip_stability_category', 'label_ipip_intellect_category',
                        'label_stai_stress_category', 'label_panas_negative_affect'])

# Per-user split, so no user contributes rows to both sets
train_data, test_data = train_test_split_per_user(sema)

# User ids become the CV grouping key; the id column itself is not a feature
fold_groups = train_data.id
train_data = train_data.drop(columns=['id'])
test_data = test_data.drop(columns=['id'])

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
# Imported here because this cell calls plt.show(); the notebook's only other
# matplotlib import sits in a later cell, so a fresh Restart & Run All would
# otherwise fail with NameError.
import matplotlib.pyplot as plt

# Rank features by the impurity-based importances of an extra-trees ensemble
x = train_data.drop(columns='label_sema_negative_feelings')
y = train_data['label_sema_negative_feelings']
# Seeded (same value as the session_id used for setup) so the importances and
# the plot are reproducible across runs.
model = ExtraTreesClassifier(random_state=123)
model.fit(x, y)
print('Model feature importances')
print(model.feature_importances_)
feat_importances = pd.Series(model.feature_importances_, index=x.columns)
# Show the 12 most important features
feat_importances.nlargest(12).plot(kind='barh')
plt.show()

In [None]:
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt

# Build the feature matrix and target from the training data
# (the digits dataset of the scikit-learn RFE example is not loaded here)
X = train_data.drop(columns='label_sema_negative_feelings')
y = train_data['label_sema_negative_feelings']

In [None]:
# Create the RFE object and rank each feature.
# With n_features_to_select=1 the elimination continues until one feature is
# left, so ranking_ gives every feature a position (1 = selected/best).
svc = SVC(kernel="linear", C=1)
rfe = RFE(estimator=svc, n_features_to_select=1, step=1)
rfe.fit(X, y)
# The scikit-learn example this cell was adapted from reshaped the ranking to
# the digits image shape; `digits` is never defined here and the features are
# tabular, so the ranking is labelled with the feature names instead.
ranking = pd.Series(rfe.ranking_, index=X.columns, name='rfe_rank').sort_values()
ranking

**feature_selection:** bool, default = False
When set to True, a subset of features is selected using a combination of various permutation importance techniques including Random Forest, Adaboost and Linear correlation with target variable. The size of the subset is dependent on the feature_selection_param. Generally, this is used to constrain the feature space in order to improve efficiency in modeling. When polynomial_features and feature_interaction are used, it is highly recommended to define the feature_selection_threshold param with a lower value.

**feature_selection_threshold:** float, default = 0.8
Threshold used for feature selection (including newly created polynomial features). A higher value will result in a higher feature space. It is recommended to do multiple trials with different values of feature_selection_threshold, especially in cases where polynomial_features and feature_interaction are used. Setting a very low value may be efficient but could result in under-fitting.

In [None]:
# Experiment with PyCaret preprocessing plus built-in feature selection
# (threshold 0.1), on top of normalization, imbalance fixing, transformation
# and multicollinearity removal at a 0.6 threshold.
s = setup(
    data=train_data,
    test_data=test_data,
    target='label_sema_negative_feelings',
    # cross-validation: 3 group folds keyed by user id
    fold_strategy='groupkfold',
    fold_groups=fold_groups,
    fold=3,
    # preprocessing
    normalize=True,
    transformation=True,
    fix_imbalance=True,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.6,
    # feature selection
    feature_selection=True,
    feature_selection_threshold=0.1,
    # run control
    session_id=123,
    silent=True,
)