In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

In [2]:
def load_and_preprocess_data(filepath):
    """Load the daily-activity CSV and normalise its types and distance units.

    Args:
        filepath: Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns:
        A DataFrame in which ``ActivityDate`` is parsed to datetime, every
        column whose name contains ``'Distance'`` is multiplied by 1.60934
        and rounded to two decimals, and ``Id`` is cast to ``str``.
    """
    # Read from the path the caller passed in (the argument used to be ignored
    # in favour of a hard-coded location).
    act = pd.read_csv(filepath)

    act['ActivityDate'] = pd.to_datetime(act['ActivityDate'])

    # Convert miles to kilometres
    distance_cols = [col for col in act.columns if 'Distance' in col]
    if distance_cols:
        act[distance_cols] = (act[distance_cols] * 1.60934).round(2)

    # Ids are identifiers, not quantities, so keep them as strings.
    act['Id'] = act['Id'].astype(str)

    return act

# Load the daily-activity CSV into `act`; the following cells keep rebinding this name.
act = load_and_preprocess_data("../data/dailyActivity_merged_fin_sum.csv")


In [3]:
def remove_non_wear_days(df):
    """Drop rows that look like days on which the tracker was not worn.

    A row is dropped when ``TotalSteps`` is 0, ``TotalDistance`` is 0 and
    ``SedentaryMinutes`` is at least 1380. The number of dropped rows is
    printed, and a copy of the remaining rows is returned.
    """
    zero_steps = df['TotalSteps'].eq(0)
    zero_distance = df['TotalDistance'].eq(0)
    mostly_sedentary = df['SedentaryMinutes'].ge(1380)
    non_wear = zero_steps & zero_distance & mostly_sedentary

    print(f"미착용일 제거: {non_wear.sum()}개")

    worn = df.loc[~non_wear]
    return worn.copy()

# Rebind `act` to the frame without the rows flagged as non-wear days.
act = remove_non_wear_days(act)

미착용일 제거: 124개


In [4]:
def create_derived_features(df):
    """Return a copy of ``df`` with calendar and activity-derived columns added.

    Added columns:
        weekday            -- day name of ``ActivityDate``
        is_weekend         -- True for Saturday/Sunday
        TotalActiveMinutes -- very + fairly + lightly active minutes
        SedentaryRatio     -- ``SedentaryMinutes`` divided by 1440
        Intensity_Score    -- minutes weighted 2 / 1.5 / 1 (very / fairly / lightly)
        Efficiency         -- Intensity_Score per active minute, 0 when there are none
        CaloriesPerKm      -- Calories per unit of TotalDistance, NaN when distance is not positive
    """
    out = df.copy()

    day_names = out['ActivityDate'].dt.day_name()
    out['weekday'] = day_names
    out['is_weekend'] = day_names.isin(['Saturday', 'Sunday'])

    very, fairly, lightly = (
        out[name]
        for name in ('VeryActiveMinutes', 'FairlyActiveMinutes', 'LightlyActiveMinutes')
    )

    active_total = very + fairly + lightly
    out['TotalActiveMinutes'] = active_total

    out['SedentaryRatio'] = out['SedentaryMinutes'] / 1440

    weighted = (very * 2) + (fairly * 1.5) + (lightly * 1)
    out['Intensity_Score'] = weighted

    # Rows without any active minutes get an efficiency of 0 rather than NaN.
    out['Efficiency'] = np.where(active_total > 0, weighted / active_total, 0)

    # Rows without a positive distance have no meaningful per-km value.
    distance = out['TotalDistance']
    out['CaloriesPerKm'] = np.where(distance > 0, out['Calories'] / distance, np.nan)

    return out

# Rebind `act` to the copy that carries the derived feature columns.
act = create_derived_features(act)

In [5]:
def classify_day_type(df):
    """Return a copy of ``df`` with a ``DayType`` label for every row.

    Rules are evaluated in order and the first match wins:
        'Active Day'         -- TotalSteps >= 7000 or TotalActiveMinutes >= 60
        'Over-Sedentary Day' -- SedentaryRatio >= 0.75 with at least one step
        'Low Engagement Day' -- fewer than 3000 steps, SedentaryRatio >= 0.50,
                                with at least one step
    Rows matching none of the rules are labelled 'Normal Day'.
    """
    out = df.copy()

    steps = out['TotalSteps']
    sedentary_ratio = out['SedentaryRatio']
    has_steps = steps > 0

    # Insertion order of this mapping is the priority order of the rules.
    rules = {
        'Active Day': (steps >= 7000) | (out['TotalActiveMinutes'] >= 60),
        'Over-Sedentary Day': (sedentary_ratio >= 0.75) & has_steps,
        'Low Engagement Day': (steps < 3000) & (sedentary_ratio >= 0.50) & has_steps,
    }

    out['DayType'] = np.select(
        list(rules.values()),
        list(rules.keys()),
        default='Normal Day',
    )

    return out

# Rebind `act` to the copy that carries the DayType label.
act = classify_day_type(act)


In [6]:
def create_calorie_groups(df):
    """Return a copy of ``df`` with ``Calories`` bucketed into ``CalorieGroup``.

    Buckets are left-closed: [1000, 1500), [1500, 2000), [2000, 2500) and
    [2500, inf). Values below 1000 fall outside every bucket and therefore
    get a missing ``CalorieGroup``.
    """
    out = df.copy()

    edges = [1000, 1500, 2000, 2500]
    # Closed ranges are named "lo-hi"; the open-ended last bucket is named "lo+".
    labels = [f"{lo}-{hi}" for lo, hi in zip(edges, edges[1:])]
    labels.append(f"{edges[-1]}+")

    out['CalorieGroup'] = pd.cut(
        out['Calories'],
        bins=edges + [float('inf')],
        labels=labels,
        right=False,
    )

    return out

# Rebind `act` to the copy that carries the CalorieGroup column.
act = create_calorie_groups(act)

In [7]:
from typing import Optional

def filter_date_range(df: pd.DataFrame, 
                      start_date: str = "2016-04-01",
                      end_date: Optional[str] = None) -> pd.DataFrame:
    """Keep only the rows whose ``ActivityDate`` lies in [start_date, end_date].

    Args:
        df: Frame with an ``ActivityDate`` column.
        start_date: Inclusive lower bound, parsed with ``pd.to_datetime``.
        end_date: Inclusive upper bound; when None, the latest
            ``ActivityDate`` found in ``df`` is used.

    Returns:
        A copy of the matching rows. The applied range and the resulting row
        count are printed.
    """
    range_start = pd.to_datetime(start_date)
    range_end = df['ActivityDate'].max() if end_date is None else pd.to_datetime(end_date)

    # `between` is inclusive on both ends by default.
    in_range = df['ActivityDate'].between(range_start, range_end)
    filtered = df.loc[in_range].copy()

    print(f"날짜 필터링: {range_start.date()} ~ {range_end.date()}")
    print(f"  데이터 행 수: {len(filtered)}")

    return filtered

# Uses the defaults: start_date "2016-04-01" and end_date = the latest ActivityDate in `act`.
act = filter_date_range(act)

날짜 필터링: 2016-04-01 ~ 2016-05-12
  데이터 행 수: 1189


In [8]:
from typing import List

def remove_outliers_iqr(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    """Remove outliers column by column using the 1.5 * IQR rule.

    The columns are processed in the given order, and each column's quartiles
    are computed on the rows that survived the previous columns, so the result
    depends on the order of ``columns``. Rows whose value in a processed
    column is NaN fail both bound checks and are removed as well.

    Args:
        df: Input frame; it is not modified.
        columns: Names of the columns to filter on.

    Returns:
        The filtered frame. For every column that removed at least one row,
        the number of removed rows is printed.
    """
    kept = df.copy()

    print("\n=== IQR 이상치 제거 ===")
    for name in columns:
        q1, q3 = kept[name].quantile([0.25, 0.75])
        fence = 1.5 * (q3 - q1)

        inside = kept[name].between(q1 - fence, q3 + fence)
        survivors = kept[inside]

        removed = len(kept) - len(survivors)
        if removed:
            print(f"{name}: {removed}개 제거")

        kept = survivors

    return kept
# Columns filtered by remove_outliers_iqr, in this order (the filter is applied sequentially).
# NOTE: 'CaloriesPerKm' is NaN wherever create_derived_features found TotalDistance <= 0;
# NaN fails both bound comparisons in remove_outliers_iqr, so those rows are dropped too.
outlier_cols = ['TotalSteps', 'TotalDistance', 'TrackerDistance',
                'LoggedActivitiesDistance', 'VeryActiveDistance',
                'ModeratelyActiveDistance', 'LightActiveDistance',
                'VeryActiveMinutes','FairlyActiveMinutes', 
                'LightlyActiveMinutes', 
                'SedentaryMinutes', 'Calories','CaloriesPerKm']

# Rebind `act` to the outlier-filtered frame.
act = remove_outliers_iqr(act, outlier_cols)


=== IQR 이상치 제거 ===
TotalSteps: 19개 제거
TotalDistance: 16개 제거
TrackerDistance: 1개 제거
LoggedActivitiesDistance: 52개 제거
VeryActiveDistance: 74개 제거
ModeratelyActiveDistance: 64개 제거
LightActiveDistance: 6개 제거
VeryActiveMinutes: 62개 제거
FairlyActiveMinutes: 30개 제거
LightlyActiveMinutes: 3개 제거
SedentaryMinutes: 4개 제거
Calories: 8개 제거
CaloriesPerKm: 100개 제거


In [9]:
def calc_max_streak(dates):
    """Return the length of the longest run of consecutive calendar days.

    ``dates`` may be anything ``pd.Series`` accepts. Entries that cannot be
    parsed as datetimes are ignored, times of day are discarded and duplicate
    days count once. Returns 0 when no valid date remains.
    """
    # Coerce the input to a Series of timestamps; unparseable entries become NaT and are dropped.
    stamps = pd.to_datetime(pd.Series(dates), errors="coerce").dropna()

    # Keep one entry per calendar day, in chronological order.
    days = stamps.dt.normalize().drop_duplicates().sort_values()

    if days.empty:
        return 0

    gap_days = days.diff().dt.days

    # A new run starts wherever the gap to the previous day is not exactly 1.
    # The first row has no predecessor (gap is NaN), so it always starts a run.
    run_id = gap_days.ne(1).cumsum()

    # The longest streak is the size of the largest run.
    return int(run_id.value_counts().max())

# Drop any existing streak_days column first (errors="ignore" makes this a no-op when it is
# absent) — presumably so that re-running this cell does not merge a second copy of it.
act = act.drop(columns=["streak_days"], errors="ignore")

# One row per Id with the longest run of consecutive ActivityDate days for that Id.
streak_df = (
    act.groupby("Id")["ActivityDate"]
       .apply(calc_max_streak)
       .reset_index(name="streak_days")
)

# Attach the per-Id streak to every row of that Id.
act = act.merge(streak_df, on="Id", how="left")

In [10]:
# Preview the first rows of the processed frame.
act.head()

Unnamed: 0,Id,ActivityDate,TotalSteps,TotalDistance,TrackerDistance,LoggedActivitiesDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,SedentaryActiveDistance,...,weekday,is_weekend,TotalActiveMinutes,SedentaryRatio,Intensity_Score,Efficiency,CaloriesPerKm,DayType,CalorieGroup,streak_days
0,1503960366,2016-04-01,12262,12.67,12.67,0.0,5.34,1.34,5.86,0.0,...,Friday,False,268,0.601389,325.5,1.214552,147.434886,Active Day,1500-2000,21
1,1503960366,2016-04-10,10057,11.23,11.23,0.0,6.44,0.79,3.99,0.0,...,Sunday,True,225,0.511806,275.5,1.224444,156.277827,Active Day,1500-2000,21
2,1503960366,2016-04-11,10990,11.68,11.68,0.0,3.28,0.92,7.48,0.0,...,Monday,False,256,0.59375,289.0,1.128906,155.05137,Active Day,1500-2000,21
3,1503960366,2016-04-12,13386,13.9,13.9,0.0,3.03,0.89,9.96,0.0,...,Tuesday,False,375,0.527778,406.5,1.084,146.402878,Active Day,2000-2500,21
4,1503960366,2016-04-13,10735,11.22,11.22,0.0,2.53,1.11,7.58,0.0,...,Wednesday,False,257,0.538889,287.5,1.118677,160.160428,Active Day,1500-2000,21


In [11]:
# Inspect the column dtypes of the processed frame.
act.dtypes

Id                                     str
ActivityDate                datetime64[us]
TotalSteps                           int64
TotalDistance                      float64
TrackerDistance                    float64
LoggedActivitiesDistance           float64
VeryActiveDistance                 float64
ModeratelyActiveDistance           float64
LightActiveDistance                float64
SedentaryActiveDistance            float64
VeryActiveMinutes                    int64
FairlyActiveMinutes                  int64
LightlyActiveMinutes                 int64
SedentaryMinutes                     int64
Calories                             int64
weekday                                str
is_weekend                            bool
TotalActiveMinutes                   int64
SedentaryRatio                     float64
Intensity_Score                    float64
Efficiency                         float64
CaloriesPerKm                      float64
DayType                                str
CalorieGrou

In [12]:
from dataclasses import dataclass, field
from enum import Enum
from typing import Dict, Any, List



# Activity level name -> (lower, upper) pair of numbers.
# The boundaries 1500 / 2000 / 2500 are the same cut points that
# PersonaClassifier.get_activity_level applies to average calories.
ACTIVITY_LEVELS = {
    "low": (0, 1500),
    "moderate": (1500, 2000),
    "active": (2000, 2500),
    "high": (2500, 999999),
}

# Persona key -> display name, activity level, stability flag and description.
# "newbie" carries no "stable" entry; every other persona does.
PERSONA_TYPES = {
    "newbie": {"name": "입문자형", "level": "low", "desc": "막 시작한 사람"},
    "beginner": {"name": "초보자형", "level": "moderate", "stable": False, "desc": "운동 수준 낮고 습관화 안됨"},
    "turtle": {"name": "거북이형", "level": "moderate", "stable": True, "desc": "습관은 있는데 강도 낮음"},
    "burst": {"name": "벼락치기형", "level": "active", "stable": False, "desc": "운동은 잘하는데 자주 안함"},
    "ideal": {"name": "모범생형", "level": "active", "stable": True, "desc": "이상적 타입"},
    "lazy_genius": {"name": "게으른 천재형", "level": "high", "stable": False, "desc": "수준은 높은데 횟수 적음"},
    "veteran": {"name": "고인물형", "level": "high", "stable": True, "desc": "고수. 부상주의"},
}

In [13]:
@dataclass
class PersonaInfo:
    """Persona assignment for one user together with metrics, goals and programs."""

    user_id: str
    persona_key: str
    persona_name: str
    level: str
    desc: str
    # Metric name -> value; get_summary reads 'total_steps', 'total_distance',
    # 'total_calories', 'active_days' and 'max_streak' (missing keys show as 0).
    metrics: Dict[str, Any]
    # Goal texts printed under the 1-week / 1-month / 3-month labels.
    short: str = ""
    medium: str = ""
    long: str = ""
    # Recommended programs, printed one per line.
    programs: List[str] = field(default_factory=list)

    def get_summary(self) -> str:
        """Return a multi-line, human-readable summary of this persona.

        Returns:
            The formatted text, starting and ending with a newline.
        """
        m = self.metrics
        divider = '=' * 50
        # Built outside the f-string: a backslash inside an f-string expression
        # is a SyntaxError on Python versions before 3.12.
        program_lines = "\n".join(f"• {p}" for p in self.programs)
        # ",.0f" renders totals without a fractional part whether the metric
        # is stored as an int or as a float (e.g. 169879.0 -> "169,879").
        return f"""
{divider}
사용자 ID: {self.user_id}
페르소나: {self.persona_name} ({self.persona_key})
레벨: {self.level}
설명: {self.desc}

[이번 달 활동]
- 총 걸음수: {m.get('total_steps', 0):,.0f} 보
- 총 거리: {m.get('total_distance', 0):.1f} km
- 총 칼로리: {m.get('total_calories', 0):,.0f} kcal
- 활동일수: {m.get('active_days', 0)}일
- 최대 연속: {m.get('max_streak', 0)}일

[목표]
- 1주: {self.short}
- 1개월: {self.medium}
- 3개월: {self.long}

[추천 프로그램]
{program_lines}
{divider}
"""




In [14]:
class PersonaClassifier:
    """Assign a persona to a user from daily activity rows and attach goals.

    The persona is derived from two signals: the user's mean daily calories
    (mapped to an activity level) and whether the number of distinct
    "Active Day" dates reaches ``avg_active_days``.
    """

    def __init__(self, avg_active_days: int = 32):
        """Store the number of active days from which a user counts as stable."""
        self.avg_active_days = avg_active_days

    # -------------------------
    # Mean / sum helpers that do not fail on missing or empty data
    # -------------------------
    def _safe_mean(self, s):
        """Return the mean of ``s`` as a float, or 0.0 when there is nothing to average.

        "Nothing to average" covers ``None``, an empty series and a series
        holding only missing values.
        """
        if s is None or len(s) == 0:
            return 0.0
        mean = float(s.mean())
        # An all-NaN series has a NaN mean; NaN fails every `<` comparison in
        # get_activity_level and would be classified as "high".
        return 0.0 if pd.isna(mean) else mean

    def _safe_sum(self, s):
        """Return the sum of ``s`` as a float, or 0.0 when ``s`` is None or empty."""
        return float(s.sum()) if s is not None and len(s) > 0 else 0.0

    # -------------------------
    # Per-user metrics
    # -------------------------
    def get_user_metrics(self, df, user_id):
        """Aggregate the rows of one user into a dictionary of metrics.

        Args:
            df: Activity frame with an ``Id`` column. ``Calories``,
                ``TotalSteps``, ``TotalDistance``, ``streak_days``,
                ``ActivityDate`` and ``DayType`` are used when present.
            user_id: Id to select; compared as a string.

        Returns:
            Dict with ``avg_calories``, ``avg_steps``, ``total_calories``,
            ``total_steps``, ``total_distance``, ``active_days`` and
            ``max_streak``. Every value is 0 when the user has no rows.
        """
        user_data = df[df["Id"].astype(str) == str(user_id)].copy()

        if len(user_data) == 0:
            return {
                "avg_calories": 0,
                "avg_steps": 0,
                "total_calories": 0,
                "total_steps": 0,
                "total_distance": 0,
                "active_days": 0,
                "max_streak": 0,
            }

        calories = user_data["Calories"] if "Calories" in user_data.columns else None
        steps = user_data["TotalSteps"] if "TotalSteps" in user_data.columns else None
        dist = user_data["TotalDistance"] if "TotalDistance" in user_data.columns else None

        max_streak = 0
        if "streak_days" in user_data.columns:
            streak_max = pd.to_numeric(user_data["streak_days"], errors="coerce").max()
            # NaN is truthy, so `nan or 0` stays NaN and int(NaN) raises;
            # test for a missing value explicitly instead.
            max_streak = 0 if pd.isna(streak_max) else int(streak_max)

        # Active days = number of distinct dates labelled "Active Day".
        active_days = 0
        if "ActivityDate" in user_data.columns and "DayType" in user_data.columns:
            is_active = user_data["DayType"] == "Active Day"
            active_days = user_data.loc[is_active, "ActivityDate"].nunique()

        return {
            "avg_calories": self._safe_mean(calories),
            "avg_steps": self._safe_mean(steps),
            "total_calories": self._safe_sum(calories),
            "total_steps": self._safe_sum(steps),
            "total_distance": self._safe_sum(dist),
            "active_days": active_days,
            "max_streak": max_streak
        }

    # -------------------------
    # Activity level
    # -------------------------
    def get_activity_level(self, avg_calories):
        """Map average calories to 'low' / 'moderate' / 'active' / 'high'.

        The cut points are 1500, 2000 and 2500; each level includes its lower
        bound and excludes its upper bound.
        """
        if avg_calories < 1500:
            return "low"
        elif avg_calories < 2000:
            return "moderate"
        elif avg_calories < 2500:
            return "active"
        else:
            return "high"

    def is_stable(self, active_days):
        """Return True when ``active_days`` reaches ``self.avg_active_days``."""
        return active_days >= self.avg_active_days

    # -------------------------
    # Persona selection
    # -------------------------
    def get_persona(self, metrics):
        """Return the persona key for ``metrics``.

        Uses ``metrics["avg_calories"]`` for the level and
        ``metrics["active_days"]`` for stability. The "low" level always maps
        to "newbie", regardless of stability.
        """
        level = self.get_activity_level(metrics["avg_calories"])
        stable = self.is_stable(metrics["active_days"])

        if level == "low":
            return "newbie"
        elif level == "moderate":
            return "turtle" if stable else "beginner"
        elif level == "active":
            return "ideal" if stable else "burst"
        else:
            return "veteran" if stable else "lazy_genius"

    # -------------------------
    # Goals
    # -------------------------
    def get_goals(self, persona_key):
        """Return the goal dict (short / medium / long / programs) for a persona.

        Unknown keys fall back to the "newbie" goals. The dict is rebuilt on
        every call.
        """
        goals = {
            "newbie": {
                "short": "하루 3,000보 달성 주 3회",
                "medium": "1500 kcal 그룹 진입",
                "long": "Beginner로 성장",
                "programs": ["3분 스트레칭", "출퇴근길 걷기"]
            },
            "beginner": {
                "short": "주 3회 운동",
                "medium": "일 평균 5,000보",
                "long": "Turtle 또는 Burst",
                "programs": ["걷기 챌린지", "홈트"]
            },
            "turtle": {
                "short": "고강도 1회 추가",
                "medium": "2000 kcal 진입",
                "long": "Ideal",
                "programs": ["인터벌", "계단"]
            },
            "burst": {
                "short": "주 5회 운동",
                "medium": "일정 고정",
                "long": "Ideal",
                "programs": ["습관 챌린지"]
            },
            "ideal": {
                "short": "현 수준 유지",
                "medium": "운동 다양화",
                "long": "Veteran",
                "programs": ["크로스핏", "요가"]
            },
            "lazy_genius": {
                "short": "빈도 증가",
                "medium": "연속 7일",
                "long": "Veteran",
                "programs": ["알림"]
            },
            "veteran": {
                "short": "부상 예방",
                "medium": "회복 관리",
                "long": "전문화",
                "programs": ["스트레칭", "코칭"]
            }
        }
        return goals.get(persona_key, goals["newbie"])

    def tweak_goals_by_streak(self, goals, metrics):
        """Refine the short-term goal and programs based on the longest streak.

        ``goals`` is modified in place and also returned. Nothing changes when
        the user has no active days. A streak of 10 or more days switches the
        short goal to streak maintenance; a streak of at most 2 days combined
        with 10 or more active days switches it to building a 3-day streak.
        Added programs are placed first and duplicates are removed while
        keeping order.
        """
        a = metrics.get("active_days", 0)
        s = metrics.get("max_streak", 0)

        if a == 0:
            return goals

        if s >= 10:
            goals["short"] = "연속 기록 유지 + 하루 10분 스트레칭"
            goals["programs"] = list(dict.fromkeys(["회복 스트레칭"] + goals["programs"]))

        elif s <= 2 and a >= 10:
            goals["short"] = "연속 3일 만들기 (가벼운 루틴부터)"
            goals["programs"] = list(dict.fromkeys(["알림 설정", "산책 루틴"] + goals["programs"]))

        return goals

    # -------------------------
    # Main analysis entry point
    # -------------------------
    def analyze_user(self, df, user_id):
        """Compute metrics, persona and goals for one user.

        Returns:
            Dict with ``user_id``, ``persona_key``, ``metrics`` and the goal
            entries ``short``, ``medium``, ``long`` and ``programs``.
        """
        metrics = self.get_user_metrics(df, user_id)
        persona_key = self.get_persona(metrics)
        # NOTE(review): tweak_goals_by_streak is not applied here — confirm
        # whether the streak-based refinement is meant to be part of the result.
        goals = self.get_goals(persona_key)

        return {
            "user_id": user_id,
            "persona_key": persona_key,
            "metrics": metrics,
            **goals
        }

    # -------------------------
    # Report
    # -------------------------
    def generate_report(self, result):
        """Render the dict returned by ``analyze_user`` as a text report."""
        m = result["metrics"]

        report = f"""
===============================
 BellaBuddy 월간 리포트
===============================

ID : {result["user_id"]}
페르소나: {result["persona_key"]}

총 걸음수: {int(m["total_steps"]):,} 보
총 거리: {m["total_distance"]:.1f} km
총 칼로리: {int(m["total_calories"]):,} kcal
활동일수: {m["active_days"]}일
최대 연속: {m["max_streak"]}일

[목표]
1주: {result["short"]}
1개월: {result["medium"]}
3개월: {result["long"]}

[추천 프로그램]
"""
        for p in result["programs"]:
            report += f"- {p}\n"

        return report


In [15]:
def analyze_single_user(df, user_id):
    """Analyse one user with a default PersonaClassifier and print the report.

    Returns:
        The result dict produced by ``PersonaClassifier.analyze_user``.
    """
    persona_classifier = PersonaClassifier()
    analysis = persona_classifier.analyze_user(df, user_id)
    print(persona_classifier.generate_report(analysis))
    return analysis


In [16]:
# Print the report for one sample Id; the returned result dict is shown as the cell output.
analyze_single_user(act, "1624580081")



 BellaBuddy 월간 리포트

ID : 1624580081
페르소나: newbie

총 걸음수: 169,879 보
총 거리: 179.3 km
총 칼로리: 50,641 kcal
활동일수: 35일
최대 연속: 15일

[목표]
1주: 하루 3,000보 달성 주 3회
1개월: 1500 kcal 그룹 진입
3개월: Beginner로 성장

[추천 프로그램]
- 3분 스트레칭
- 출퇴근길 걷기



{'user_id': '1624580081',
 'persona_key': 'newbie',
 'metrics': {'avg_calories': 1446.8857142857144,
  'avg_steps': 4853.685714285714,
  'total_calories': 50641.0,
  'total_steps': 169879.0,
  'total_distance': 179.29,
  'active_days': 35,
  'max_streak': 15},
 'short': '하루 3,000보 달성 주 3회',
 'medium': '1500 kcal 그룹 진입',
 'long': 'Beginner로 성장',
 'programs': ['3분 스트레칭', '출퇴근길 걷기']}

In [17]:
# Sorted list of the distinct Ids (as strings) in `act`, wrapped in a one-column frame for display.
ids = sorted(act["Id"].astype(str).unique().tolist())
ids_df = pd.DataFrame(ids, columns=["Id"])
ids_df.head()

Unnamed: 0,Id
0,1503960366
1,1624580081
2,1644430081
3,1844505072
4,1927972279


In [19]:
# Per Id, compare the number of distinct dates overall with the number of
# distinct dates whose DayType is "Active Day".
check = (
    act.groupby("Id")
       .apply(lambda x: pd.Series({
           "days_all": x["ActivityDate"].nunique(),
           "days_active": x[x["DayType"]=="Active Day"]["ActivityDate"].nunique()
       }))
       .reset_index()
)

# Number of recorded days per Id that were not labelled "Active Day".
check["diff"] = check["days_all"] - check["days_active"]

# Summary statistics of the three counts across Ids.
check[["days_all","days_active","diff"]].describe()


Unnamed: 0,days_all,days_active,diff
count,33.0,33.0,33.0
mean,22.727273,22.424242,0.30303
std,10.54832,10.650793,0.809508
min,3.0,3.0,0.0
25%,14.0,14.0,0.0
50%,24.0,22.0,0.0
75%,30.0,30.0,0.0
max,40.0,40.0,3.0
