In [2]:
import pandas as pd

In [3]:
# Location of the concatenated label CSV, relative to this notebook's working directory.
LABEL_CSV_PATH = '../data_preprocessed_ts/train_label/y_concated_regression.csv'

# Load the label table that every later cell reads from and extends.
result_df = pd.read_csv(LABEL_CSV_PATH)

In [4]:
# Derive the sleep metrics used by the S1-S4 checks in one chained step.
# `assign` evaluates its keyword arguments in order, so `sleep_efficiency`
# can reuse the freshly created `total_sleep_time` column.
result_df = result_df.assign(
    # S1: TST (total sleep time) = deep + light + REM durations
    total_sleep_time=lambda frame: (
        frame["deepsleepduration"]
        + frame["lightsleepduration"]
        + frame["remsleepduration"]
    ),
    # S2: sleepEfficiency = time asleep / (time awake + time asleep), as a percentage
    sleep_efficiency=lambda frame: (
        frame["total_sleep_time"]
        / (
            frame["wakeupduration"]
            + frame["deepsleepduration"]
            + frame["lightsleepduration"]
            + frame["remsleepduration"]
        )
        * 100
    ),
    # S3: sleepLatency needs no new column; `durationtosleep` is used as-is.
    # S4: WASO = wake-up duration minus time-to-sleep and time-to-wake-up
    waso=lambda frame: (
        frame["wakeupduration"]
        - frame["durationtosleep"]
        - frame["durationtowakeup"]
    ),
)

In [5]:
def calculate_mean_and_compare(df, group_col, target_col):
    """
    Flag each row by whether its value exceeds the mean of its own group.

    The mean of ``target_col`` is computed separately for every distinct
    value of ``group_col`` and attached to each row; a row is then marked 1
    when its value is strictly greater than that mean and 0 otherwise.
    The input frame is left untouched.

    Parameters:
    df (pandas.DataFrame): frame holding the data
    group_col (str): name of the column to group by
    target_col (str): name of the column whose group mean is computed

    Returns:
    pandas.DataFrame: a copy of ``df`` with two extra columns appended,
        ``<target_col>_mean`` and ``<target_col>_higher_than_avg``
    """
    mean_col = target_col + '_mean'
    flag_col = target_col + '_higher_than_avg'

    # Work on a copy so the caller's frame is never modified.
    result = df.copy()

    # Broadcast each group's mean back onto that group's rows.
    result[mean_col] = result.groupby(group_col)[target_col].transform('mean')

    # Strictly-greater comparison; ties (and missing values) yield 0.
    is_above_mean = result[target_col] > result[mean_col]
    result[flag_col] = is_above_mean.astype(int)

    return result

In [6]:
def add_label_based_on_threshold(df, column_name, threshold, is_higher):
    """
    Append a 0/1 label column from a one-sided threshold test.

    Parameters:
    df (pandas.DataFrame): frame holding the data; it is not modified
    column_name (str): column whose values are compared with ``threshold``
    threshold: value to compare against
    is_higher (bool): if truthy, label 1 where value > threshold;
        otherwise label 1 where value < threshold

    Returns:
    pandas.DataFrame: a copy of ``df`` with ``<column_name>_label`` added
    """
    # Copy first so the caller's frame stays unchanged.
    labelled = df.copy()
    values = labelled[column_name]

    # Both comparisons are strict, so a value equal to the threshold gets 0.
    passes = values > threshold if is_higher else values < threshold

    labelled[column_name + '_label'] = passes.astype(int)
    return labelled


def add_label_based_on_threshold_between(df, column_name, threshold):
    """
    Append a 0/1 label column marking values strictly inside an interval.

    Parameters:
    df (pandas.DataFrame): frame holding the data; it is not modified
    column_name (str): column whose values are tested
    threshold (sequence): ``threshold[0]`` is the lower bound and
        ``threshold[1]`` the upper bound; both bounds are exclusive

    Returns:
    pandas.DataFrame: a copy of ``df`` with ``<column_name>_label`` added
    """
    # Copy first so the caller's frame stays unchanged.
    labelled = df.copy()
    values = labelled[column_name]

    lower_bound = threshold[0]
    upper_bound = threshold[1]

    # 1 only when lower_bound < value < upper_bound.
    above_lower = values > lower_bound
    below_upper = values < upper_bound
    labelled[column_name + '_label'] = (above_lower & below_upper).astype(int)

    return labelled

In [7]:
# For each survey column, flag the rows that sit above that user's own mean.
q1_result_df, q2_result_df, q3_result_df = (
    calculate_mean_and_compare(result_df, 'userId', answer_col)
    for answer_col in ('sleep', 'pmEmotion', 'pmStress')
)

# Share of rows where the derived flag matches the existing Q1-Q3 column.
q1_accuracy = q1_result_df['sleep_higher_than_avg'].eq(q1_result_df['Q1']).mean()
q2_accuracy = q2_result_df['pmEmotion_higher_than_avg'].eq(q2_result_df['Q2']).mean()
q3_accuracy = q3_result_df['pmStress_higher_than_avg'].eq(q3_result_df['Q3']).mean()

print(f'Q1 - sleep 정확도: {q1_accuracy:.2f}')
print(f'Q2 - pmEmotion 정확도: {q2_accuracy:.2f}')
print(f'Q3 - pmStress 정확도: {q3_accuracy:.2f}')

Q1 - sleep 정확도: 0.96
Q2 - pmEmotion 정확도: 0.98
Q3 - pmStress 정확도: 0.95


In [10]:
# Fixed-threshold rules for the four sleep metrics.
# NOTE(review): thresholds are written as seconds (hours*60*60, minutes*60),
# which presumably matches the units of the duration columns - confirm.
s1_result_df = add_label_based_on_threshold_between(
    result_df, 'total_sleep_time', threshold=(7*60*60, 9*60*60)
)
s2_result_df = add_label_based_on_threshold(
    result_df, 'sleep_efficiency', threshold=0.85 * 100, is_higher=True
)
s3_result_df = add_label_based_on_threshold(
    result_df, 'durationtosleep', threshold=30 * 60, is_higher=False
)
s4_result_df = add_label_based_on_threshold(
    result_df, 'waso', threshold=20 * 60, is_higher=False
)

# Share of rows where the rule-based label matches the existing S1-S4 column.
s1_accuracy = s1_result_df['total_sleep_time_label'].eq(s1_result_df['S1']).mean()
s2_accuracy = s2_result_df['sleep_efficiency_label'].eq(s2_result_df['S2']).mean()
s3_accuracy = s3_result_df['durationtosleep_label'].eq(s3_result_df['S3']).mean()
s4_accuracy = s4_result_df['waso_label'].eq(s4_result_df['S4']).mean()

print(f'S1 - totalsleeptime 정확도: {s1_accuracy:.2f}')
print(f'S2 - sleepEfficiency 정확도: {s2_accuracy:.2f}')
print(f'S3 - durationtosleep 정확도: {s3_accuracy:.2f}')
print(f'S4 - waso 정확도: {s4_accuracy:.2f}')

S1 - totalsleeptime 정확도: 0.98
S2 - sleepEfficiency 정확도: 0.98
S3 - durationtosleep 정확도: 0.97
S4 - waso 정확도: 0.99


In [None]:
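# NOTE(review): the y1..y7 alias assignments below are disabled and nothing
# visible in this notebook reads them - confirm before removing this cell.
# result_df["y1"] = result_df["sleep"]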
# result_df["y2"] = result_df["pmEmotion"]
# result_df["y3"] = result_df["pmStress"]

# result_df["y4"] = result_df["deepsleepduration"] + result_df["lightsleepduration"] + result_df["remsleepduration"]
# result_df["y5"] = result_df["wakeupduration"]
# result_df["y6"] = result_df["durationtosleep"]
# result_df["y7"] = result_df["durationtowakeup"]