In [20]:
#데이터조회
import pandas as pd
act = pd.read_csv('../data/dailyActivity_merged_fin_sum.csv')
act_grade = act.copy()
act_grade.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1373 entries, 0 to 1372
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Id                        1373 non-null   int64  
 1   ActivityDate              1373 non-null   object 
 2   TotalSteps                1373 non-null   int64  
 3   TotalDistance             1373 non-null   float64
 4   TrackerDistance           1373 non-null   float64
 5   LoggedActivitiesDistance  1373 non-null   float64
 6   VeryActiveDistance        1373 non-null   float64
 7   ModeratelyActiveDistance  1373 non-null   float64
 8   LightActiveDistance       1373 non-null   float64
 9   SedentaryActiveDistance   1373 non-null   float64
 10  VeryActiveMinutes         1373 non-null   int64  
 11  FairlyActiveMinutes       1373 non-null   int64  
 12  LightlyActiveMinutes      1373 non-null   int64  
 13  SedentaryMinutes          1373 non-null   int64  
 14  Calories

Id 사용자id 	
ActivityDate 작동일시(일) 	
TotalSteps 총걸음수	
TotalDistance 총거리	
TrackerDistance	기기측정거리
LoggedActivitiesDistance 이용자입력거리 	
VeryActiveDistance 고강도구간(빡센운동)
ModeratelyActiveDistance 중강도구간(가벼운운동)
LightActiveDistance	저강도구간(일상적움직임)
SedentaryActiveDistance 미세강도구간(버그구간)	
VeryActiveMinutes 고강도시간	
FairlyActiveMinutes	중강도시간
LightlyActiveMinutes 저강도시간
SedentaryMinutes 착석시간(움직임없음)
Calories 소비칼로리
           

In [None]:

# 날짜 변환 (String -> Datetime)
act_grade['ActivityDate'] = pd.to_datetime(act_grade['ActivityDate'])
# 요일 변수 추가 (0=월요일, 6=일요일)
act_grade['DayOfWeek'] = act_grade['ActivityDate'].dt.day_name()

# 명목형 변수 변환 (Int -> String)
# 이유: 'Id'는 숫자지만 더하거나 평균을 내는 게 의미 없음. 실수로 계산되지 않도록 문자열(str)로 바꿔줌
act_grade['Id'] = act_grade['Id'].astype(str)

# 숫자 단위 통일 및 정리 (Rounding)
# 이유: Distance 관련 컬럼들이 소수점 13자리까지 있어서 보기 힘듦. 소수점 2자리로 축약
dist_cols = ['TotalDistance', 'TrackerDistance', 'VeryActiveDistance', 
             'ModeratelyActiveDistance', 'LightActiveDistance', 'SedentaryActiveDistance']
act_grade[dist_cols] = act_grade[dist_cols].round(2)

# 논리적 이상치 처리 (Data Consistency)
# 이유: 하루는 24시간(1440분)인데, 다 더해서 1440분이 넘는 데이터는 '오류'. 1440분 초과 데이터를 제거
act_grade['Total_Minutes_Check'] = (act_grade['VeryActiveMinutes'] + 
                              act_grade['FairlyActiveMinutes'] + 
                              act_grade['LightlyActiveMinutes'] + 
                              act_grade['SedentaryMinutes'])

# 1440분 이하인 정상 데이터만 남기기
act_clean = act_grade[act_grade['Total_Minutes_Check'] <= 1440].copy()

# 결과 확인
print("--- 전처리 후 데이터 정보 ---")
act_clean.info()
print("\n--- 상위 5개 데이터 확인 (요일, ID, 소수점 확인) ---")
act_clean[['Id', 'ActivityDate', 'DayOfWeek', 'TotalDistance', 'Total_Minutes_Check']].head()

--- 전처리 후 데이터 정보 ---
<class 'pandas.core.frame.DataFrame'>
Index: 1359 entries, 0 to 1372
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Id                        1359 non-null   object        
 1   ActivityDate              1359 non-null   datetime64[ns]
 2   TotalSteps                1359 non-null   int64         
 3   TotalDistance             1359 non-null   float64       
 4   TrackerDistance           1359 non-null   float64       
 5   LoggedActivitiesDistance  1359 non-null   float64       
 6   VeryActiveDistance        1359 non-null   float64       
 7   ModeratelyActiveDistance  1359 non-null   float64       
 8   LightActiveDistance       1359 non-null   float64       
 9   SedentaryActiveDistance   1359 non-null   float64       
 10  VeryActiveMinutes         1359 non-null   int64         
 11  FairlyActiveMinutes       1359 non-null   int64         
 12  Ligh

Unnamed: 0,Id,ActivityDate,DayOfWeek,TotalDistance,Total_Minutes_Check
0,1503960366,2016-03-25,Friday,7.11,1054
1,1503960366,2016-03-26,Saturday,11.55,968
2,1503960366,2016-03-27,Sunday,8.53,934
3,1503960366,2016-03-28,Monday,8.93,1363
4,1503960366,2016-03-29,Tuesday,7.85,1062


In [None]:

# (1) 총 활동 시간 (Total_Active_Minutes)
act_clean['Total_Active_Minutes'] = (
    act_clean['VeryActiveMinutes'] + 
    act_clean['FairlyActiveMinutes'] + 
    act_clean['LightlyActiveMinutes']
)
# (2) 운동 강도 점수 (Intensity_Score)
act_clean['Intensity_Score'] = (
    (act_clean['VeryActiveMinutes'] * 2) + 
    (act_clean['FairlyActiveMinutes'] * 1.5) + 
    (act_clean['LightlyActiveMinutes'] * 1)
)
# (3) 활동 등급 (Activity_Grade)
act_clean['Activity_Grade'] = pd.qcut(
    act_clean['Intensity_Score'], 
    q=3, 
    labels=['Low', 'Medium', 'High']
)
# 잘 만들어졌는지 확인
act_clean[['Id', 'Total_Active_Minutes', 'Intensity_Score', 'Activity_Grade']].head()
# 등급별로 몇 명이나 있는지 확인
act_clean['Activity_Grade'].value_counts()

Activity_Grade
Low       453
Medium    453
High      453
Name: count, dtype: int64

In [28]:
# 등급별 소비 칼로리 확인
# 'Activity_Grade'로 그룹을 묶고, 'Calories' 컬럼의 통계(평균, 최소, 최대 등)를 확인
calorie_stats = act_clean.groupby('Activity_Grade')['Calories'].describe()
calorie_stats_rounded = calorie_stats.round(2) #소수점 2자리까지 
calorie_stats_rounded

  calorie_stats = act_clean.groupby('Activity_Grade')['Calories'].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Activity_Grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Low,453.0,1857.9,578.95,0.0,1469.0,1841.0,2130.0,3728.0
Medium,453.0,2309.98,552.53,1248.0,1851.0,2203.0,2806.0,4010.0
High,453.0,2728.86,768.15,1144.0,2094.0,2576.0,3324.0,5517.0


In [None]:
# '상' 등급이고 활동시간도 높은 사람들 : 헬창
# 조건: 활동 시간이 중앙값보다 길고(>=) AND 등급이 High인 사람
# 정렬: 강도 점수가 높은 순서대로 (내림차순) -> "누가 제일 빡세게 했나?"

time_median = act_clean['Total_Active_Minutes'].median()# 기준점(중앙값) 계산
elite_examples = act_clean[
    (act_clean['Total_Active_Minutes'] >= time_median) & 
    (act_clean['Activity_Grade'] == 'High')
].sort_values(by='Intensity_Score', ascending=False).head()
print("--- '헬창' 케이스 ---")
elite_examples

Unnamed: 0,Id,ActivityDate,TotalSteps,TotalDistance,TrackerDistance,LoggedActivitiesDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,SedentaryActiveDistance,...,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,Calories,DayOfWeek,Total_Minutes_Check,Total_Active_Minutes,Intensity_Score,Activity_Grade,Efficiency
425,2891001357,2016-04-01,0,0.0,0.0,4.828032,0.0,0.0,0.0,0.0,...,660,0,780,4562,Friday,1440,660,990.0,High,1.5
426,2891001357,2016-04-02,4126,3.22,0.0,3.218688,0.0,0.0,3.22,0.0,...,0,720,720,2881,Saturday,1440,720,720.0,High,1.0
884,5577150313,2016-05-01,13368,9.99,9.99,0.0,5.31,1.44,3.24,0.0,...,72,178,499,4546,Sunday,943,444,674.0,High,1.518018
1338,8877689391,2016-04-16,29326,25.29,25.29,0.0,13.24,1.21,10.71,0.0,...,29,429,888,4547,Saturday,1440,552,660.5,High,1.196558
870,5577150313,2016-04-24,15764,11.78,11.78,0.0,7.65,2.15,1.98,0.0,...,65,141,425,4392,Sunday,841,416,658.5,High,1.582933


In [42]:
# '하' 등급이고 활동시간도 낮은 사람들 : 깔짝충
# 조건: 활동 시간이 중앙값보다 짧고(<) AND 등급이 Low인 사람
# 정렬: 활동 시간이 적은 순서대로 (오름차순) -> "누가 제일 안 움직였나?"
ggal_examples = act_clean[
    (act_clean['Total_Active_Minutes'] < time_median) & 
    (act_clean['Activity_Grade'] == 'Low')
].sort_values(by='Total_Active_Minutes', ascending=True).head()
print("--- '깔짝충' 케이스 ---")
ggal_examples

--- '깔짝충' 케이스 ---


Unnamed: 0,Id,ActivityDate,TotalSteps,TotalDistance,TrackerDistance,LoggedActivitiesDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,SedentaryActiveDistance,...,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,Calories,DayOfWeek,Total_Minutes_Check,Total_Active_Minutes,Intensity_Score,Activity_Grade,Efficiency
40,1503960366,2016-05-12,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1440,0,Thursday,1440,0,0.0,Low,0.0
1330,8792009665,2016-05-09,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1440,1688,Monday,1440,0,0.0,Low,0.0
1285,8583815059,2016-05-04,3588,2.8,2.8,0.0,0.0,0.0,0.0,0.0,...,0,0,1440,2516,Wednesday,1440,0,0.0,Low,0.0
1299,8792009665,2016-04-17,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1440,1688,Sunday,1440,0,0.0,Low,0.0
1301,8792009665,2016-04-19,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1440,1688,Tuesday,1440,0,0.0,Low,0.0


In [None]:
# '상' 등급이지만 활동시간은 낮은 사람들 : 고효율
short_but_high = act_clean[act_clean['Activity_Grade'] == 'High'].sort_values(by='Total_Active_Minutes').head()

print("--- '고효율' 케이스 ---")
short_but_high

--- '고효율' 케이스 ---


Unnamed: 0,Id,Total_Active_Minutes,VeryActiveMinutes,Intensity_Score
1175,8053475328,232,93,329.5
857,5577150313,242,86,336.0
864,5577150313,242,108,361.5
1109,7086361926,245,87,338.5
1150,8053475328,252,89,343.5


In [None]:
# '하' 등급이지만 활동시간은 높은 사람들 : 저효율
long_but_low = act_clean[act_clean['Activity_Grade'] == 'Low'].sort_values(by='Total_Active_Minutes', ascending=False).head()

print("\n--- '저효율' 케이스 ---")
long_but_low


--- '저효율' 케이스 ---


Unnamed: 0,Id,Total_Active_Minutes,VeryActiveMinutes,Intensity_Score
270,2026352035,188,0,188.0
518,4020332650,187,0,187.0
69,1624580081,186,0,186.0
290,2026352035,185,0,185.0
293,2026352035,185,0,185.0


In [None]:
import numpy as np
# 효율성 지표(Efficiency) 만들기
# 공식: 운동 강도 점수(Intensity_Score) / 총 활동 시간(Total_Active_Minutes)
# 의미: 1분 움직일 때마다 몇 점이나 따냈나 (높을수록 고강도 운동!)
# 주의: 활동 시간이 0분인 경우 에러가 나지 않도록 np.where로 예외 처리
act_clean['Efficiency'] = np.where(
    act_clean['Total_Active_Minutes'] > 0, 
    act_clean['Intensity_Score'] / act_clean['Total_Active_Minutes'], 
    0
)