In [64]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

In [65]:
def load_and_preprocess_data(filepath: str) -> pd.DataFrame:
    """Read the daily-activity CSV and normalise its column types.

    Processing order: parse ``ActivityDate`` as datetimes, multiply every
    column whose name contains ``"Distance"`` by 1.60934 (rounded to two
    decimals), and cast ``Id`` to ``str``.

    Args:
        filepath: Location passed straight to ``pd.read_csv`` (the argument
            is used as given, not a hard-coded path).

    Returns:
        The loaded frame with the conversions above applied.
    """
    frame = pd.read_csv(filepath)

    frame['ActivityDate'] = pd.to_datetime(frame['ActivityDate'])

    # 1.60934 is the km-per-mile factor; assumes the raw distances are in
    # miles — TODO confirm against the data dictionary.
    distance_like = [name for name in frame.columns if 'Distance' in name]
    scaled = frame[distance_like] * 1.60934
    frame[distance_like] = scaled.round(2)

    frame['Id'] = frame['Id'].astype(str)

    return frame

# Usage: load the merged daily-activity CSV (the relative path is resolved
# against the current working directory).
act = load_and_preprocess_data("../data/dailyActivity_merged_fin_sum.csv")

In [66]:
def remove_non_wear_days(df):
    """Drop rows that look like the tracker was not worn.

    A row is flagged when it has zero ``TotalSteps``, zero ``TotalDistance``
    and at least 1380 ``SedentaryMinutes``. The number of flagged rows is
    printed.

    Args:
        df: Frame with ``TotalSteps``, ``TotalDistance`` and
            ``SedentaryMinutes`` columns.

    Returns:
        A copy of ``df`` without the flagged rows; ``df`` itself is untouched.
    """
    no_steps = df['TotalSteps'] == 0
    no_distance = df['TotalDistance'] == 0
    mostly_sedentary = df['SedentaryMinutes'] >= 1380
    flagged_rows = no_steps & no_distance & mostly_sedentary

    flagged_total = flagged_rows.sum()
    print(f"ÎØ∏Ï∞©Ïö©Ïùº Ï†úÍ±∞: {flagged_total}Í∞ú")

    worn = df.loc[~flagged_rows]
    return worn.copy()

# Rebind `act` to the filtered copy; the call prints how many rows were flagged.
act = remove_non_wear_days(act)

ÎØ∏Ï∞©Ïö©Ïùº Ï†úÍ±∞: 124Í∞ú


In [67]:
def create_derived_features(df):
    """Return a copy of ``df`` with calendar and activity-derived columns.

    Columns added, in this order:
        weekday            -- day name of ``ActivityDate``
        is_weekend         -- True for Saturday / Sunday
        TotalActiveMinutes -- very + fairly + lightly active minutes
        SedentaryRatio     -- ``SedentaryMinutes`` divided by 1440
        Intensity_Score    -- minutes weighted 2 / 1.5 / 1 (very / fairly / lightly)
        Efficiency         -- Intensity_Score per active minute, 0 when there
                              are no active minutes
        CaloriesPerKm      -- Calories / TotalDistance, NaN when the distance
                              is not positive

    Args:
        df: Frame containing ``ActivityDate`` (datetime), the three
            ``*ActiveMinutes`` columns, ``SedentaryMinutes``,
            ``TotalDistance`` and ``Calories``.

    Returns:
        A new frame; the input is not modified.
    """
    def _efficiency(d):
        # Guarded ratio: rows without active minutes get 0 instead of NaN/inf.
        return np.where(
            d['TotalActiveMinutes'] > 0,
            d['Intensity_Score'] / d['TotalActiveMinutes'],
            0,
        )

    def _calories_per_km(d):
        # Rows with no recorded distance have no meaningful per-km value.
        return np.where(
            d['TotalDistance'] > 0,
            d['Calories'] / d['TotalDistance'],
            np.nan,
        )

    # Keyword order matters: later callables read columns created by earlier ones.
    return df.assign(
        weekday=lambda d: d['ActivityDate'].dt.day_name(),
        is_weekend=lambda d: d['weekday'].isin(['Saturday', 'Sunday']),
        TotalActiveMinutes=lambda d: (
            d['VeryActiveMinutes']
            + d['FairlyActiveMinutes']
            + d['LightlyActiveMinutes']
        ),
        SedentaryRatio=lambda d: d['SedentaryMinutes'] / 1440,
        Intensity_Score=lambda d: (
            (d['VeryActiveMinutes'] * 2)
            + (d['FairlyActiveMinutes'] * 1.5)
            + (d['LightlyActiveMinutes'] * 1)
        ),
        Efficiency=_efficiency,
        CaloriesPerKm=_calories_per_km,
    )

# Rebind `act` to the copy that carries the derived columns.
act = create_derived_features(act)

In [68]:
def classify_day_type(df):
    """Return a copy of ``df`` with a ``DayType`` label per row.

    Rules are evaluated in order and the first match wins:
        1. 'Active Day'         -- TotalSteps >= 7000 or TotalActiveMinutes >= 60
        2. 'Over-Sedentary Day' -- SedentaryRatio >= 0.75 and TotalSteps > 0
        3. 'Low Engagement Day' -- TotalSteps < 3000, SedentaryRatio >= 0.50
                                   and TotalSteps > 0
    Rows matching none of them are labelled 'Normal Day'.

    Args:
        df: Frame with ``TotalSteps``, ``TotalActiveMinutes`` and
            ``SedentaryRatio`` columns.

    Returns:
        A new frame; the input is not modified.
    """
    steps = df['TotalSteps']
    sedentary_share = df['SedentaryRatio']
    has_steps = steps > 0

    # (label, mask) pairs; np.select honours this order.
    rules = [
        ('Active Day', (steps >= 7000) | (df['TotalActiveMinutes'] >= 60)),
        ('Over-Sedentary Day', (sedentary_share >= 0.75) & has_steps),
        ('Low Engagement Day', (steps < 3000) & (sedentary_share >= 0.50) & has_steps),
    ]

    labelled = df.copy()
    labelled['DayType'] = np.select(
        [mask for _, mask in rules],
        [label for label, _ in rules],
        default='Normal Day',
    )
    return labelled

# Rebind `act` to the copy that carries the DayType label.
act = classify_day_type(act)


In [69]:
def create_calorie_groups(df):
    """Return a copy of ``df`` with a categorical ``CalorieGroup`` column.

    ``Calories`` is bucketed into left-closed bands [1000, 1500),
    [1500, 2000), [2000, 2500) and [2500, inf). Values below 1000 fall
    outside every band and therefore get NaN.

    Args:
        df: Frame with a ``Calories`` column.

    Returns:
        A new frame; the input is not modified.
    """
    band_edges = [1000, 1500, 2000, 2500, float('inf')]
    band_names = ['1000-1500', '1500-2000', '2000-2500', '2500+']

    # right=False makes each band include its lower edge and exclude its upper.
    banded = pd.cut(df['Calories'], bins=band_edges, labels=band_names, right=False)
    return df.assign(CalorieGroup=banded)

# Rebind `act` to the copy that carries the CalorieGroup band.
act = create_calorie_groups(act)

In [70]:
from typing import Optional

def filter_date_range(df: pd.DataFrame, 
                      start_date: str = "2016-04-01",
                      end_date: Optional[str] = None) -> pd.DataFrame:
    """Keep the rows whose ``ActivityDate`` lies in an inclusive date window.

    Args:
        df: Frame with a datetime ``ActivityDate`` column.
        start_date: Lower bound (inclusive), parsed with ``pd.to_datetime``.
        end_date: Upper bound (inclusive). When ``None`` the latest
            ``ActivityDate`` in ``df`` is used.

    Returns:
        A copy of the matching rows. The window and the resulting row count
        are printed as a side effect.
    """
    start_date = pd.to_datetime(start_date)
    end_date = df['ActivityDate'].max() if end_date is None else pd.to_datetime(end_date)

    activity_dates = df['ActivityDate']
    not_too_early = activity_dates >= start_date
    not_too_late = activity_dates <= end_date
    filtered = df.loc[not_too_early & not_too_late].copy()

    print(f"ÎÇ†Ïßú ÌïÑÌÑ∞ÎßÅ: {start_date.date()} ~ {end_date.date()}")
    print(f"  Îç∞Ïù¥ÌÑ∞ Ìñâ Ïàò: {len(filtered)}")

    return filtered

# Defaults apply: keep rows from 2016-04-01 through the latest ActivityDate in `act`.
act = filter_date_range(act)

ÎÇ†Ïßú ÌïÑÌÑ∞ÎßÅ: 2016-04-01 ~ 2016-05-12
  Îç∞Ïù¥ÌÑ∞ Ìñâ Ïàò: 1189


In [71]:
from typing import List

def remove_outliers_iqr(df: pd.DataFrame, columns: List[str],
                        factor: float = 1.5,
                        keep_na: bool = False) -> pd.DataFrame:
    """Remove rows outside the IQR fences, one column at a time.

    For each column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR``. Columns are processed sequentially, so the
    quartiles of a later column are computed on the rows that survived the
    earlier columns; the order of ``columns`` therefore affects the result.

    Args:
        df: Input frame (not modified).
        columns: Numeric columns to filter on, in processing order.
        factor: Fence multiplier. Defaults to 1.5, the value that was
            previously hard-coded.
        keep_na: NaN fails both fence comparisons, so by default rows with a
            missing value in a filtered column are removed and included in
            the printed count (e.g. ``CaloriesPerKm`` is NaN when
            ``TotalDistance`` is not positive). Pass ``True`` to retain such
            rows instead of treating them as outliers.

    Returns:
        A filtered copy of ``df``. For every column that removed at least one
        row, the number of removed rows is printed.
    """
    df_clean = df.copy()

    print("\n=== IQR Ïù¥ÏÉÅÏπò Ï†úÍ±∞ ===")
    for col in columns:
        values = df_clean[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr

        within_fences = (values >= lower_bound) & (values <= upper_bound)
        if keep_na:
            # Missing values are "unknown", not extreme: keep them on request.
            within_fences = within_fences | values.isna()

        before_count = len(df_clean)
        df_clean = df_clean[within_fences]
        after_count = len(df_clean)

        if before_count != after_count:
            print(f"{col}: {before_count - after_count}Í∞ú Ï†úÍ±∞")

    return df_clean
# Columns are filtered one after another in this order, so each column's
# quartiles are computed on the rows that survived the previous columns.
# NOTE(review): CaloriesPerKm is NaN wherever TotalDistance is not > 0 (see
# create_derived_features); NaN fails both bound comparisons, so those rows
# are removed in the last pass as well — confirm this is intended.
outlier_cols = ['TotalSteps', 'TotalDistance', 'TrackerDistance',
                'LoggedActivitiesDistance', 'VeryActiveDistance',
                'ModeratelyActiveDistance', 'LightActiveDistance',
                'VeryActiveMinutes','FairlyActiveMinutes', 
                'LightlyActiveMinutes', 
                'SedentaryMinutes', 'Calories','CaloriesPerKm']

act = remove_outliers_iqr(act, outlier_cols)


=== IQR Ïù¥ÏÉÅÏπò Ï†úÍ±∞ ===
TotalSteps: 19Í∞ú Ï†úÍ±∞
TotalDistance: 16Í∞ú Ï†úÍ±∞
TrackerDistance: 1Í∞ú Ï†úÍ±∞
LoggedActivitiesDistance: 52Í∞ú Ï†úÍ±∞
VeryActiveDistance: 74Í∞ú Ï†úÍ±∞
ModeratelyActiveDistance: 64Í∞ú Ï†úÍ±∞
LightActiveDistance: 6Í∞ú Ï†úÍ±∞
VeryActiveMinutes: 62Í∞ú Ï†úÍ±∞
FairlyActiveMinutes: 30Í∞ú Ï†úÍ±∞
LightlyActiveMinutes: 3Í∞ú Ï†úÍ±∞
SedentaryMinutes: 4Í∞ú Ï†úÍ±∞
Calories: 8Í∞ú Ï†úÍ±∞
CaloriesPerKm: 100Í∞ú Ï†úÍ±∞


In [72]:
def calc_max_streak(dates):
    """Return the longest run of consecutive calendar days in ``dates``.

    Args:
        dates: Any iterable of date-like values. Entries that cannot be
            parsed are discarded, times of day are ignored and duplicate
            days count once.

    Returns:
        Length of the longest consecutive-day run as an ``int``; 0 when no
        valid date remains.
    """
    # Force a Series regardless of the input type, then drop unparseable entries.
    parsed = pd.to_datetime(pd.Series(dates), errors="coerce").dropna()

    # Keep the date part only, de-duplicate and sort chronologically.
    days = parsed.dt.normalize().drop_duplicates().sort_values()

    if days.empty:
        return 0

    ordered = list(days)
    best = 1
    run = 1
    for previous, current in zip(ordered, ordered[1:]):
        # A gap of exactly one day extends the run; anything else restarts it.
        run = run + 1 if (current - previous).days == 1 else 1
        best = max(best, run)

    return best

# Drop any existing streak_days first so that re-running this cell does not
# produce suffixed duplicate columns in the merge below.
act = act.drop(columns=["streak_days"], errors="ignore")

# Longest run of consecutive calendar days per user, computed from the rows
# still present in `act` at this point.
streak_df = (
    act.groupby("Id")["ActivityDate"]
       .apply(calc_max_streak)
       .reset_index(name="streak_days")
)

# Broadcast each user's streak onto every one of that user's rows.
act = act.merge(streak_df, on="Id", how="left")

In [73]:
# Preview the first rows of the processed frame (display only).
act.head()

Unnamed: 0,Id,ActivityDate,TotalSteps,TotalDistance,TrackerDistance,LoggedActivitiesDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,SedentaryActiveDistance,...,weekday,is_weekend,TotalActiveMinutes,SedentaryRatio,Intensity_Score,Efficiency,CaloriesPerKm,DayType,CalorieGroup,streak_days
0,1503960366,2016-04-01,12262,12.67,12.67,0.0,5.34,1.34,5.86,0.0,...,Friday,False,268,0.601389,325.5,1.214552,147.434886,Active Day,1500-2000,21
1,1503960366,2016-04-10,10057,11.23,11.23,0.0,6.44,0.79,3.99,0.0,...,Sunday,True,225,0.511806,275.5,1.224444,156.277827,Active Day,1500-2000,21
2,1503960366,2016-04-11,10990,11.68,11.68,0.0,3.28,0.92,7.48,0.0,...,Monday,False,256,0.59375,289.0,1.128906,155.05137,Active Day,1500-2000,21
3,1503960366,2016-04-12,13386,13.9,13.9,0.0,3.03,0.89,9.96,0.0,...,Tuesday,False,375,0.527778,406.5,1.084,146.402878,Active Day,2000-2500,21
4,1503960366,2016-04-13,10735,11.22,11.22,0.0,2.53,1.11,7.58,0.0,...,Wednesday,False,257,0.538889,287.5,1.118677,160.160428,Active Day,1500-2000,21


In [74]:
# Inspect the column dtypes after preprocessing and feature engineering.
act.dtypes

Id                                     str
ActivityDate                datetime64[us]
TotalSteps                           int64
TotalDistance                      float64
TrackerDistance                    float64
LoggedActivitiesDistance           float64
VeryActiveDistance                 float64
ModeratelyActiveDistance           float64
LightActiveDistance                float64
SedentaryActiveDistance            float64
VeryActiveMinutes                    int64
FairlyActiveMinutes                  int64
LightlyActiveMinutes                 int64
SedentaryMinutes                     int64
Calories                             int64
weekday                                str
is_weekend                            bool
TotalActiveMinutes                   int64
SedentaryRatio                     float64
Intensity_Score                    float64
Efficiency                         float64
CaloriesPerKm                      float64
DayType                                str
CalorieGrou

In [75]:
# Row count per calorie band, listed in band order.
act['CalorieGroup'].value_counts().sort_index()

CalorieGroup
1000-1500     66
1500-2000    250
2000-2500    229
2500+        203
Name: count, dtype: int64

In [76]:
from dataclasses import dataclass, field
from enum import Enum
from typing import Dict, Any, List



# Activity level -> (lower, upper) bounds. The same cut-offs (1500 / 2000 /
# 2500) are hard-coded in PersonaClassifier.get_activity_level, which compares
# them against a user's average Calories.
# NOTE(review): this mapping is not read by any code in the visible cells.
ACTIVITY_LEVELS = {
    "low": (0, 1500),
    "moderate": (1500, 2000),
    "active": (2000, 2500),
    "high": (2500, 999999),
}

# Persona catalogue keyed by the persona keys PersonaClassifier.get_persona
# returns. "level" refers to a key of ACTIVITY_LEVELS; "stable" is present for
# every persona except "newbie".
# NOTE(review): this mapping is not read by any code in the visible cells.
PERSONA_TYPES = {
    "newbie": {"name": "ÏûÖÎ¨∏ÏûêÌòï", "level": "low", "desc": "Îßâ ÏãúÏûëÌïú ÏÇ¨Îûå"},
    "beginner": {"name": "Ï¥àÎ≥¥ÏûêÌòï", "level": "moderate", "stable": False, "desc": "Ïö¥Îèô ÏàòÏ§Ä ÎÇÆÍ≥† ÏäµÍ¥ÄÌôî ÏïàÎê®"},
    "turtle": {"name": "Í±∞Î∂ÅÏù¥Ìòï", "level": "moderate", "stable": True, "desc": "ÏäµÍ¥ÄÏùÄ ÏûàÎäîÎç∞ Í∞ïÎèÑ ÎÇÆÏùå"},
    "burst": {"name": "Î≤ºÎùΩÏπòÍ∏∞Ìòï", "level": "active", "stable": False, "desc": "Ïö¥ÎèôÏùÄ ÏûòÌïòÎäîÎç∞ ÏûêÏ£º ÏïàÌï®"},
    "ideal": {"name": "Î™®Î≤îÏÉùÌòï", "level": "active", "stable": True, "desc": "Ïù¥ÏÉÅÏ†Å ÌÉÄÏûÖ"},
    "lazy_genius": {"name": "Í≤åÏúºÎ•∏ Ï≤úÏû¨Ìòï", "level": "high", "stable": False, "desc": "ÏàòÏ§ÄÏùÄ ÎÜíÏùÄÎç∞ ÌöüÏàò Ï†ÅÏùå"},
    "veteran": {"name": "Í≥†Ïù∏Î¨ºÌòï", "level": "high", "stable": True, "desc": "Í≥†Ïàò. Î∂ÄÏÉÅÏ£ºÏùò"},
}

In [77]:
@dataclass
class PersonaInfo:
    """Persona assignment for one user plus the text needed to present it."""

    user_id: str
    persona_key: str
    persona_name: str
    level: str
    desc: str
    # Keys read by get_summary: total_steps, total_distance, total_calories,
    # active_days, max_streak. Missing keys are rendered as 0.
    metrics: Dict[str, Any]
    # Goal texts printed next to the 1-week / 1-month / 3-month labels.
    short: str = ""
    medium: str = ""
    long: str = ""
    # One bullet line is rendered per entry.
    programs: List[str] = field(default_factory=list)

    def get_summary(self) -> str:
        """Return a multi-line, human-readable summary of this persona.

        Returns:
            The summary text, framed by 50-character ``=`` rules.
        """
        m = self.metrics

        # Joined outside the f-string: a backslash inside an f-string
        # replacement field is a SyntaxError before Python 3.12.
        program_lines = "\n".join(f"‚Ä¢ {p}" for p in self.programs)

        # Totals may arrive as floats; ",.0f" prints them without a trailing
        # ".0" and still accepts ints.
        return f"""
{'='*50}
ÏÇ¨Ïö©Ïûê ID: {self.user_id}
ÌéòÎ•¥ÏÜåÎÇò: {self.persona_name} ({self.persona_key})
Î†àÎ≤®: {self.level}
ÏÑ§Î™Ö: {self.desc}

[Ïù¥Î≤à Îã¨ ÌôúÎèô]
- Ï¥ù Í±∏ÏùåÏàò: {m.get('total_steps', 0):,.0f} Î≥¥
- Ï¥ù Í±∞Î¶¨: {m.get('total_distance', 0):.1f} km
- Ï¥ù ÏπºÎ°úÎ¶¨: {m.get('total_calories', 0):,.0f} kcal
- ÌôúÎèôÏùºÏàò: {m.get('active_days', 0)}Ïùº
- ÏµúÎåÄ Ïó∞ÏÜç: {m.get('max_streak', 0)}Ïùº

[Î™©Ìëú]
- 1Ï£º: {self.short}
- 1Í∞úÏõî: {self.medium}
- 3Í∞úÏõî: {self.long}

[Ï∂îÏ≤ú ÌîÑÎ°úÍ∑∏Îû®]
{program_lines}
{'='*50}
"""




In [78]:
class PersonaClassifier:
    """Rule-based mapping from a user's activity rows to a persona and goals.

    The activity level comes from the user's average ``Calories``; the
    "stable" flag comes from the number of distinct "Active Day" dates
    compared with ``avg_active_days``.
    """

    def __init__(self, avg_active_days: int = 32):
        """Store the stability threshold.

        Args:
            avg_active_days: Minimum number of "Active Day" dates for a user
                to be considered stable.
        """
        self.avg_active_days = avg_active_days

    # -------------------------
    # Mean / sum helpers that do not fail on empty data
    # -------------------------
    def _safe_mean(self, s):
        """Return the mean of ``s`` as a float, or 0.0 when there is nothing to average."""
        if s is None or len(s) == 0:
            return 0.0
        mean = s.mean()
        # An all-NaN column has a NaN mean; NaN fails every "<" comparison in
        # get_activity_level and would be classified as "high".
        return 0.0 if pd.isna(mean) else float(mean)

    def _safe_sum(self, s):
        """Return the sum of ``s`` as a float, or 0.0 for ``None`` / empty input."""
        return float(s.sum()) if s is not None and len(s) > 0 else 0.0

    # -------------------------
    # User metrics
    # -------------------------
    def get_user_metrics(self, df, user_id):
        """Aggregate one user's rows into the metrics dict used for classification.

        Args:
            df: Activity frame with an ``Id`` column. ``Calories``,
                ``TotalSteps``, ``TotalDistance``, ``streak_days``,
                ``ActivityDate`` and ``DayType`` are used when present.
            user_id: Id to select; compared as a string.

        Returns:
            Dict with ``avg_calories``, ``avg_steps``, ``total_calories``,
            ``total_steps``, ``total_distance``, ``active_days`` and
            ``max_streak``. Every value is 0 when the user has no rows.
        """
        user_data = df[df["Id"].astype(str) == str(user_id)].copy()

        if len(user_data) == 0:
            return {
                "avg_calories": 0,
                "avg_steps": 0,
                "total_calories": 0,
                "total_steps": 0,
                "total_distance": 0,
                "active_days": 0,
                "max_streak": 0,
            }

        calories = user_data["Calories"] if "Calories" in user_data.columns else None
        steps = user_data["TotalSteps"] if "TotalSteps" in user_data.columns else None
        dist = user_data["TotalDistance"] if "TotalDistance" in user_data.columns else None

        max_streak = 0
        if "streak_days" in user_data.columns:
            streak = pd.to_numeric(user_data["streak_days"], errors="coerce").max()
            # NaN is truthy, so `streak or 0` would stay NaN and int(NaN)
            # raises ValueError; test for missing explicitly instead.
            max_streak = 0 if pd.isna(streak) else int(streak)

        # Active days = number of distinct dates labelled "Active Day".
        active_days = 0
        if "ActivityDate" in user_data.columns and "DayType" in user_data.columns:
            active_rows = user_data[user_data["DayType"] == "Active Day"]
            active_days = active_rows["ActivityDate"].nunique()

        return {
            "avg_calories": self._safe_mean(calories),
            "avg_steps": self._safe_mean(steps),
            "total_calories": self._safe_sum(calories),
            "total_steps": self._safe_sum(steps),
            "total_distance": self._safe_sum(dist),
            "active_days": active_days,
            "max_streak": max_streak
        }

    # -------------------------
    # Activity level
    # -------------------------
    def get_activity_level(self, avg_calories):
        """Map average calories to "low" (<1500), "moderate" (<2000), "active" (<2500) or "high"."""
        if avg_calories < 1500:
            return "low"
        elif avg_calories < 2000:
            return "moderate"
        elif avg_calories < 2500:
            return "active"
        else:
            return "high"
    
    def is_stable(self, active_days):  
        """Return True when ``active_days`` reaches the configured threshold."""
        return active_days >= self.avg_active_days

    # -------------------------
    # Persona decision
    # -------------------------
    def get_persona(self, metrics):
        """Return the persona key for a metrics dict.

        "low" is always "newbie"; each other level splits into a stable and
        an unstable persona.
        """
        level = self.get_activity_level(metrics["avg_calories"])
        stable = self.is_stable(metrics["active_days"])
    
        if level == "low":
            return "newbie"
        elif level == "moderate":
            return "turtle" if stable else "beginner"
        elif level == "active":
            return "ideal" if stable else "burst"
        else:
            return "veteran" if stable else "lazy_genius"

    # -------------------------
    # Goals
    # -------------------------
    def get_goals(self, persona_key):
        """Return the goal texts and programme list for ``persona_key``.

        The table is rebuilt on every call, so the returned dict is not
        shared between callers. Unknown keys fall back to the "newbie" entry.
        """
        goals = {
            "newbie": {
                "short": "ÌïòÎ£® 3,000Î≥¥ Îã¨ÏÑ± Ï£º 3Ìöå",
                "medium": "1500 kcal Í∑∏Î£π ÏßÑÏûÖ",
                "long": "BeginnerÎ°ú ÏÑ±Ïû•",
                "programs": ["3Î∂Ñ Ïä§Ìä∏Î†àÏπ≠", "Ï∂úÌá¥Í∑ºÍ∏∏ Í±∑Í∏∞"]
            },
            "beginner": {
                "short": "Ï£º 3Ìöå Ïö¥Îèô",
                "medium": "Ïùº ÌèâÍ∑† 5,000Î≥¥",
                "long": "Turtle ÎòêÎäî Burst",
                "programs": ["Í±∑Í∏∞ Ï±åÎ¶∞ÏßÄ", "ÌôàÌä∏"]
            },
            "turtle": {
                "short": "Í≥†Í∞ïÎèÑ 1Ìöå Ï∂îÍ∞Ä",
                "medium": "2000 kcal ÏßÑÏûÖ",
                "long": "Ideal",
                "programs": ["Ïù∏ÌÑ∞Î≤å", "Í≥ÑÎã®"]
            },
            "burst": {
                "short": "Ï£º 5Ìöå Ïö¥Îèô",
                "medium": "ÏùºÏ†ï Í≥†Ï†ï",
                "long": "Ideal",
                "programs": ["ÏäµÍ¥Ä Ï±åÎ¶∞ÏßÄ"]
            },
            "ideal": {
                "short": "ÌòÑ ÏàòÏ§Ä Ïú†ÏßÄ",
                "medium": "Ïö¥Îèô Îã§ÏñëÌôî",
                "long": "Veteran",
                "programs": ["ÌÅ¨Î°úÏä§Ìïè", "ÏöîÍ∞Ä"]
            },
            "lazy_genius": {
                "short": "ÎπàÎèÑ Ï¶ùÍ∞Ä",
                "medium": "Ïó∞ÏÜç 7Ïùº",
                "long": "Veteran",
                "programs": ["ÏïåÎ¶º"]
            },
            "veteran": {
                "short": "Î∂ÄÏÉÅ ÏòàÎ∞©",
                "medium": "ÌöåÎ≥µ Í¥ÄÎ¶¨",
                "long": "Ï†ÑÎ¨∏Ìôî",
                "programs": ["Ïä§Ìä∏Î†àÏπ≠", "ÏΩîÏπ≠"]
            }
        }
        return goals.get(persona_key, goals["newbie"])
    
    def tweak_goals_by_streak(self, goals, metrics):
        """Refine the short-term goal and programmes using the streak length.

        ``goals`` is modified in place and also returned. Nothing changes
        when the user has no active days. A streak of 10+ days switches to a
        maintenance goal; a streak of at most 2 days combined with 10+ active
        days switches to a streak-building goal.

        NOTE(review): ``analyze_user`` does not call this method — confirm
        whether it is meant to be part of the pipeline.
        """
        a = metrics.get("active_days", 0)
        s = metrics.get("max_streak", 0)

        if a == 0:
            return goals

        if s >= 10:
            goals["short"] = "Ïó∞ÏÜç Í∏∞Î°ù Ïú†ÏßÄ + ÌïòÎ£® 10Î∂Ñ Ïä§Ìä∏Î†àÏπ≠"
            # dict.fromkeys de-duplicates while preserving order.
            goals["programs"] = list(dict.fromkeys(["ÌöåÎ≥µ Ïä§Ìä∏Î†àÏπ≠"] + goals["programs"]))

        elif s <= 2 and a >= 10:
            goals["short"] = "Ïó∞ÏÜç 3Ïùº ÎßåÎì§Í∏∞ (Í∞ÄÎ≤ºÏö¥ Î£®Ìã¥Î∂ÄÌÑ∞)"
            goals["programs"] = list(dict.fromkeys(["ÏïåÎ¶º ÏÑ§Ï†ï", "ÏÇ∞Ï±Ö Î£®Ìã¥"] + goals["programs"]))

        return goals

    # -------------------------
    # Core analysis entry point
    # -------------------------
    def analyze_user(self, df, user_id):
        """Compute metrics, persona and goals for one user.

        Returns:
            Dict with ``user_id``, ``persona_key``, ``metrics`` and the goal
            fields ``short``, ``medium``, ``long`` and ``programs``.
        """
        metrics = self.get_user_metrics(df, user_id)
        persona_key = self.get_persona(metrics)
        goals = self.get_goals(persona_key)

        return {
            "user_id": user_id,
            "persona_key": persona_key,
            "metrics": metrics,
            **goals
        }

    # -------------------------
    # Report
    # -------------------------
    def generate_report(self, result):
        """Render the dict returned by ``analyze_user`` as a plain-text report."""
        m = result["metrics"]

        report = f"""
===============================
 BellaBuddy ÏõîÍ∞Ñ Î¶¨Ìè¨Ìä∏
===============================

ID : {result["user_id"]}
ÌéòÎ•¥ÏÜåÎÇò: {result["persona_key"]}

Ï¥ù Í±∏ÏùåÏàò: {int(m["total_steps"]):,} Î≥¥
Ï¥ù Í±∞Î¶¨: {m["total_distance"]:.1f} km
Ï¥ù ÏπºÎ°úÎ¶¨: {int(m["total_calories"]):,} kcal
ÌôúÎèôÏùºÏàò: {m["active_days"]}Ïùº
ÏµúÎåÄ Ïó∞ÏÜç: {m["max_streak"]}Ïùº

[Î™©Ìëú]
1Ï£º: {result["short"]}
1Í∞úÏõî: {result["medium"]}
3Í∞úÏõî: {result["long"]}

[Ï∂îÏ≤ú ÌîÑÎ°úÍ∑∏Îû®]
"""
        for p in result["programs"]:
            report += f"- {p}\n"

        return report


In [79]:
def analyze_single_user(df, user_id):
    """Classify one user with a default ``PersonaClassifier``, print the report and return the analysis.

    Args:
        df: Activity frame passed on to ``PersonaClassifier.analyze_user``.
        user_id: Id of the user to analyse.

    Returns:
        The dict produced by ``PersonaClassifier.analyze_user``.
    """
    persona_classifier = PersonaClassifier()
    analysis = persona_classifier.analyze_user(df, user_id)
    print(persona_classifier.generate_report(analysis))
    return analysis


In [80]:
# Example: print the report for one user; the returned analysis dict is shown as the cell output.
analyze_single_user(act, "2320127002")



 BellaBuddy ÏõîÍ∞Ñ Î¶¨Ìè¨Ìä∏

ID : 2320127002
ÌéòÎ•¥ÏÜåÎÇò: turtle

Ï¥ù Í±∏ÏùåÏàò: 179,455 Î≥¥
Ï¥ù Í±∞Î¶¨: 195.2 km
Ï¥ù ÏπºÎ°úÎ¶¨: 60,826 kcal
ÌôúÎèôÏùºÏàò: 34Ïùº
ÏµúÎåÄ Ïó∞ÏÜç: 23Ïùº

[Î™©Ìëú]
1Ï£º: Í≥†Í∞ïÎèÑ 1Ìöå Ï∂îÍ∞Ä
1Í∞úÏõî: 2000 kcal ÏßÑÏûÖ
3Í∞úÏõî: Ideal

[Ï∂îÏ≤ú ÌîÑÎ°úÍ∑∏Îû®]
- Ïù∏ÌÑ∞Î≤å
- Í≥ÑÎã®



{'user_id': '2320127002',
 'persona_key': 'turtle',
 'metrics': {'avg_calories': 1789.0,
  'avg_steps': 5278.088235294118,
  'total_calories': 60826.0,
  'total_steps': 179455.0,
  'total_distance': 195.23,
  'active_days': 34,
  'max_streak': 23},
 'short': 'Í≥†Í∞ïÎèÑ 1Ìöå Ï∂îÍ∞Ä',
 'medium': '2000 kcal ÏßÑÏûÖ',
 'long': 'Ideal',
 'programs': ['Ïù∏ÌÑ∞Î≤å', 'Í≥ÑÎã®']}

In [81]:
# Sorted list of the distinct user ids (as strings), shown as a one-column frame.
ids = sorted(act["Id"].astype(str).unique().tolist())
ids_df = pd.DataFrame(ids, columns=["Id"])
ids_df

Unnamed: 0,Id
0,1503960366
1,1624580081
2,1644430081
3,1844505072
4,1927972279
5,2022484408
6,2026352035
7,2320127002
8,2347167796
9,2873212765


In [82]:
# Per user, compare the number of distinct dates overall with the number of
# distinct dates whose DayType is "Active Day".
check = (
    act.groupby("Id")
       .apply(lambda x: pd.Series({
           "days_all": x["ActivityDate"].nunique(),
           "days_active": x[x["DayType"]=="Active Day"]["ActivityDate"].nunique()
       }))
       .reset_index()
)

# diff = days that exist for the user but are not labelled "Active Day".
check["diff"] = check["days_all"] - check["days_active"]

check[["days_all","days_active","diff"]].describe()


Unnamed: 0,days_all,days_active,diff
count,33.0,33.0,33.0
mean,22.727273,22.424242,0.30303
std,10.54832,10.650793,0.809508
min,3.0,3.0,0.0
25%,14.0,14.0,0.0
50%,24.0,22.0,0.0
75%,30.0,30.0,0.0
max,40.0,40.0,3.0


In [83]:
def analyze_all_personas(df, avg_active_days: int = 32):
    """Classify every user in ``df`` and print the persona distribution.

    Args:
        df: Activity frame with an ``Id`` column plus the columns
            ``PersonaClassifier.get_user_metrics`` reads.
        avg_active_days: Stability threshold forwarded to
            ``PersonaClassifier``. Defaults to 32, the value that was
            previously hard-coded.

    Returns:
        One row per user with ``user_id``, ``persona``, ``avg_calories`` and
        ``active_days``. The persona counts are printed as a side effect.
    """
    classifier = PersonaClassifier(avg_active_days=avg_active_days)
    summary_columns = ["user_id", "persona", "avg_calories", "active_days"]

    results = []
    for user_id in df["Id"].astype(str).unique():
        result = classifier.analyze_user(df, user_id)
        metrics = result["metrics"]
        results.append({
            "user_id": user_id,
            "persona": result["persona_key"],
            "avg_calories": metrics["avg_calories"],
            "active_days": metrics["active_days"]
        })

    # Naming the columns keeps the schema when `df` contains no users;
    # without it results_df["persona"] raises KeyError on an empty frame.
    results_df = pd.DataFrame(results, columns=summary_columns)

    print("üìä ÌéòÎ•¥ÏÜåÎÇò Î∂ÑÌè¨:")
    print(results_df["persona"].value_counts())
    print("\n" + "="*50)

    return results_df

# Run: classify every user in `act` and display the per-user summary.
all_personas = analyze_all_personas(act)
all_personas

üìä ÌéòÎ•¥ÏÜåÎÇò Î∂ÑÌè¨:
persona
lazy_genius    12
burst           8
beginner        5
turtle          4
newbie          2
ideal           2
Name: count, dtype: int64



Unnamed: 0,user_id,persona,avg_calories,active_days
0,1503960366,turtle,1867.388889,36
1,1624580081,newbie,1446.885714,35
2,1644430081,lazy_genius,2667.818182,20
3,1844505072,beginner,1836.92,25
4,1927972279,lazy_genius,2553.571429,7
5,2022484408,burst,2392.625,24
6,2026352035,turtle,1549.589744,39
7,2320127002,turtle,1789.0,34
8,2347167796,burst,2143.590909,22
9,2873212765,turtle,1911.820513,39
