In [83]:
import numpy as np
import pandas as pd
from dataprep.eda import create_report
from sklearn.metrics import auc, roc_curve
from sklearn.ensemble import ExtraTreesClassifier

In [84]:
def train_test_split_per_user(data, train_size=0.7):
    users = list(set(data.id))
    users = sorted(users, reverse=True)  # fix randomness
    total_users = len(users)
    slice = int(train_size * total_users)
    users_train = users[:slice]
    users_test = users[slice:]
    return data[data.id.isin(users_train)], data[data.id.isin(users_test)]

def get_cardio_category(gender, age, vo2max):
    if pd.isna(gender):
        return np.nan
    if gender == "MALE":
        if age == "<30":
            if vo2max >= 51.1:
                return "Superior/Excellent"
            elif vo2max >= 41.7:
                return "Fair/Good"
            else:
                return "Poor"
        else:
            if vo2max >= 48.3:
                return "Superior/Excellent"
            elif vo2max >= 40.5:
                return "Fair/Good"
            else:
                return "Poor"
    else:
        if age == "<30":
            if vo2max >= 43.9:
                return "Superior/Excellent"
            elif vo2max >= 36.1:
                return "Fair/Good"
            else:
                return "Poor"
        else:
            if vo2max >= 42.4:
                return "Superior/Excellent"
            elif vo2max >= 34.4:
                return "Fair/Good"
            else:
                return "Poor"

In [85]:
df=pd.read_pickle('../data/unified_dataframe/dataframe_demographics_preprocessed.pkl')
df = df[df['filteredDemographicVO2Max'].notna()]
df.head()

Unnamed: 0,id,date,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,stress_score,sleep_points_percentage,exertion_points_percentage,...,Steps_hour14,Steps_hour15,Steps_hour16,Steps_hour17,Steps_hour18,Steps_hour19,Steps_hour20,Steps_hour21,Steps_hour22,Steps_hour23
0,621e2e8e67b776a24055b564,2021-05-24,34.137687,57.432,89.603,95.9,14.8,78.0,0.833333,0.675,...,191.0,33.0,342.0,1712.0,1838.0,160.0,155.0,37.0,31.0,53.0
1,621e2e8e67b776a24055b564,2021-05-25,33.794544,57.681,94.303,95.9,15.8,80.0,0.833333,0.725,...,120.0,24.0,446.0,377.0,3165.0,332.0,97.0,9.0,24.0,58.0
2,621e2e8e67b776a24055b564,2021-05-26,34.611011,57.481,119.212,95.9,14.6,84.0,0.966667,0.725,...,85.0,477.0,390.0,2821.0,293.0,158.0,66.0,0.0,70.0,74.0
3,621e2e8e67b776a24055b564,2021-05-27,34.408304,57.493,111.709,95.9,14.8,82.0,0.933333,0.725,...,58.0,296.0,401.0,21.0,3054.0,503.0,411.0,96.0,63.0,8.0
4,621e2e8e67b776a24055b564,2021-05-28,34.178922,56.75,103.034,95.9,15.2,81.0,0.866667,0.725,...,250.0,82.0,1363.0,3014.0,81.0,104.0,1984.0,18.0,141.0,60.0


In [86]:
df['fitness_level'] = df.apply(lambda row: get_cardio_category(row["gender"], row['age'], row["filteredDemographicVO2Max"]), axis=1)
df.fitness_level.astype(str)
df.head()

Unnamed: 0,id,date,nightly_temperature,nremhr,rmssd,spo2,full_sleep_breathing_rate,stress_score,sleep_points_percentage,exertion_points_percentage,...,Steps_hour15,Steps_hour16,Steps_hour17,Steps_hour18,Steps_hour19,Steps_hour20,Steps_hour21,Steps_hour22,Steps_hour23,fitness_level
0,621e2e8e67b776a24055b564,2021-05-24,34.137687,57.432,89.603,95.9,14.8,78.0,0.833333,0.675,...,33.0,342.0,1712.0,1838.0,160.0,155.0,37.0,31.0,53.0,Superior/Excellent
1,621e2e8e67b776a24055b564,2021-05-25,33.794544,57.681,94.303,95.9,15.8,80.0,0.833333,0.725,...,24.0,446.0,377.0,3165.0,332.0,97.0,9.0,24.0,58.0,Superior/Excellent
2,621e2e8e67b776a24055b564,2021-05-26,34.611011,57.481,119.212,95.9,14.6,84.0,0.966667,0.725,...,477.0,390.0,2821.0,293.0,158.0,66.0,0.0,70.0,74.0,Superior/Excellent
3,621e2e8e67b776a24055b564,2021-05-27,34.408304,57.493,111.709,95.9,14.8,82.0,0.933333,0.725,...,296.0,401.0,21.0,3054.0,503.0,411.0,96.0,63.0,8.0,Superior/Excellent
4,621e2e8e67b776a24055b564,2021-05-28,34.178922,56.75,103.034,95.9,15.2,81.0,0.866667,0.725,...,82.0,1363.0,3014.0,81.0,104.0,1984.0,18.0,141.0,60.0,Superior/Excellent


In [87]:
fitness_level = df.drop(columns=['date','age', 'gender', 'bmi','filteredDemographicVO2Max'])

In [88]:
fitness_level['fitness_level'].value_counts()

Superior/Excellent    2676
Fair/Good              912
Poor                    95
Name: fitness_level, dtype: int64

In [89]:
fitness_level['fitness_level'].replace(to_replace=['Superior/Excellent', 'Fair/Good', 'Poor'], value=[2, 1, 0], inplace=True)

In [90]:
print(fitness_level['fitness_level'].isna().sum())
fitness_level = fitness_level[fitness_level['fitness_level'].notna()]
print(fitness_level['fitness_level'].isna().sum())

193
0


In [91]:
drop_columns =  fitness_level.columns[fitness_level.isna().any()].tolist()
fitness_level.drop(columns=drop_columns, inplace=True)

In [92]:
train_data, test_data = train_test_split_per_user(fitness_level)
fold_groups = train_data.id
train_data = train_data.drop(columns=['id'])
test_data = test_data.drop(columns=['id'])

In [93]:
et = ExtraTreesClassifier(n_estimators=100, random_state=0)

In [94]:
x_train = train_data.drop(columns=['fitness_level'])
y_train = train_data['fitness_level']
x_test = test_data.drop(columns=['fitness_level'])
y_test = test_data['fitness_level']

In [95]:
et = et.fit(x_train, y_train)

In [96]:
y_pred = et.predict(x_test)

In [97]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=2)
auc(fpr, tpr)

0.4868212819202369