In [31]:
## 1. 환경 설정

import pandas as pd
from datetime import timedelta

## 2. 데이터 로드

# Read the four preprocessed CSV exports; the names on the left are bound in the
# same order as the paths listed below.
subscriptions, project_progress_details, track_user_step_check_logs, track_users = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/subscriptions.csv',
        '../data/processed/project_progress_details.csv',
        '../data/processed/track_user_step_check_logs.csv',
        '../data/processed/track_users.csv',
    )
)

## 3. 로그 데이터 병합 및 전처리

# Keep monthly-only subscribers.
# Goal: identify users who only ever had one subscription type and keep those whose
# type is 'M'; users with mixed subscriptions (both M and Y) are excluded.
valid_users = subscriptions.groupby('user_id')['구독_타입'].nunique().eq(1)
single_type_user_ids = valid_users.index[valid_users]
subscriptions = subscriptions.loc[
    subscriptions['user_id'].isin(single_type_user_ids)
    & (subscriptions['구독_타입'] == 'M')
].drop(columns=['현재_구독_상태', '구독_타입'])

# Parse the three date columns into datetime values.
for date_col in ['구독_시작일', '구독_종료일', '해지_신청일']:
    subscriptions[date_col] = pd.to_datetime(subscriptions[date_col])

# Build the combined activity log.
# Cutoff: only log rows created strictly before this timestamp are kept.
start_date = pd.to_datetime('2025-01-29')  # previously tried: today's date (normalized), 2025-06-04
project_progress_details['created_at'] = pd.to_datetime(project_progress_details['created_at'])
track_user_step_check_logs['created_at'] = pd.to_datetime(track_user_step_check_logs['created_at'])
track_user_step_check_logs = track_user_step_check_logs.rename(columns={'stage_number': 'stage'})

# Project logs: keep rows before the cutoff whose trackId is missing.
# - Goal: analyse only project activity unrelated to tracks and avoid double counting
#   against the track logs.
# - Reason (per the original author's note): rows carrying a trackId were generated by
#   track learning and are therefore excluded here.
# .copy() makes each filtered frame independent, so adding the `source` column below
# does not raise SettingWithCopyWarning on a slice of the raw data.
project_progress_details = project_progress_details[
    (project_progress_details['created_at'] < start_date)
    & project_progress_details['trackId'].isna()
].copy()
track_user_step_check_logs = track_user_step_check_logs[
    track_user_step_check_logs['created_at'] < start_date
].copy()

# Tag each row with its origin so the two sources stay distinguishable after the concat.
project_progress_details['source'] = 'project_progress'
track_user_step_check_logs['source'] = 'track_logs'

project_cols = ['user_id', 'project_id', 'stage', 'created_at', 'source']
track_cols = ['user_id', 'project_id', 'stage', 'created_at', 'track_id', 'source']

# combined_logs stacks project and track activity; project rows get NaN track_id.
combined_logs = pd.concat(
    [project_progress_details[project_cols], track_user_step_check_logs[track_cols]],
    ignore_index=True,
)

# Attach track difficulty and cohort number via track_id.
# pandas merges match NaN keys with each other, so lookup rows with a missing track_id
# are dropped first; otherwise every project row (track_id is NaN) would pick them up.
# NOTE(review): assumes each track_id maps to a single difficulty / cohort value; if it
# does not, these left merges multiply log rows — confirm against track_users.
track_difficulty_df = (
    track_users[['track_id', '트랙_난이도']].dropna(subset=['track_id']).drop_duplicates()
)
combined_logs = combined_logs.merge(track_difficulty_df, on='track_id', how='left')
track_date_df = (
    track_users[['track_id', '기수']].dropna(subset=['track_id']).drop_duplicates()
)
combined_logs = combined_logs.merge(track_date_df, on='track_id', how='left')

In [32]:
# Display the merged log table (the notebook shows a truncated head/tail preview).
combined_logs

Unnamed: 0,user_id,project_id,stage,created_at,source,track_id,트랙_난이도,기수
0,448522,1001,1,2022-12-01 12:20:08,project_progress,,,
1,448522,1001,1,2022-12-01 12:20:11,project_progress,,,
2,448522,1001,1,2022-12-01 14:02:57,project_progress,,,
3,448522,1001,1,2022-12-01 14:03:00,project_progress,,,
4,448522,1001,1,2022-12-01 14:03:48,project_progress,,,
...,...,...,...,...,...,...,...,...
153670,517824,305,16,2025-01-28 23:39:12,track_logs,236426.0,,
153671,517824,305,16,2025-01-28 23:39:46,track_logs,236426.0,,
153672,517824,305,16,2025-01-28 23:40:25,track_logs,236426.0,,
153673,517824,305,16,2025-01-28 23:41:03,track_logs,236426.0,,


### combined_logs

Columns:
- user_id: 사용자 ID (예: 448522, 513121)
- project_id: 프로젝트 ID (예: 1001, 1010)
- stage: 학습 단계 (예: 1, 4)
- created_at: 활동 기록 시간 (예: 2022-12-01 12:20:08, 2024-10-08 23:59:53)
- source: 데이터 출처 ('project_progress' 또는 'track_logs')
- track_id: 트랙 ID (프로젝트 로그는 NaN, 트랙 로그는 값 존재, 예: 236384.0)
- 트랙_난이도: 트랙 난이도 (예: NaN, 데이터에 값 없음)
- 기수: 트랙 기수 (예: NaN, 데이터에 값 없음)

Shape: 153,674 rows × 8 columns (위 출력 기준; 이전 기록은 122,679 rows — start_date 기준일에 따라 행 수가 달라짐)

In [33]:
## 4. 통계 계산

def calculate_stats(logs, group_cols, agg_col, prefix):
    """Return the per-group mean and standard deviation of one column.

    Args:
        logs: DataFrame holding the grouping columns and ``agg_col``.
        group_cols: list of column names to group by.
        agg_col: name of the column to summarise.
        prefix: suffix used in the output column names.

    Returns:
        DataFrame with the grouping columns followed by ``mean_<prefix>`` and
        ``std_<prefix>``, one row per group.
    """
    named_aggregations = {f'mean_{prefix}': 'mean', f'std_{prefix}': 'std'}
    per_group = logs.groupby(group_cols)[agg_col].agg(**named_aggregations)
    return per_group.reset_index()

def _count_steps(logs, keys):
    """Count log rows per key combination and expose the count as `step_count`."""
    return logs.groupby(keys).size().reset_index(name='step_count')


def _max_stage_per_user_project(logs):
    """Return the highest `stage` value per (user_id, project_id) as `max_stage`."""
    return logs.groupby(['user_id', 'project_id'])['stage'].max().reset_index(name='max_stage')


# Step-check counts per project (first approach).
# For every project: mean and standard deviation of the number of log rows per user,
# e.g. for project 1001 the mean/std of its users' step-check counts.
project_stats = _count_steps(combined_logs, ['user_id', 'project_id'])
# Output columns: project_id, mean_step_count, std_step_count.
project_summary = calculate_stats(project_stats, ['project_id'], 'step_count', 'step_count')

# Same statistics computed separately per source and then stacked (second approach).
project_stats_by_source = _count_steps(project_progress_details, ['user_id', 'project_id'])
project_summary_by_source = calculate_stats(
    project_stats_by_source, ['project_id'], 'step_count', 'step_count'
).assign(source='project_progress')

track_stats = _count_steps(track_user_step_check_logs, ['user_id', 'project_id'])
track_summary = calculate_stats(
    track_stats, ['project_id'], 'step_count', 'step_count'
).assign(source='track_logs')

project_stats_combined = pd.concat([project_summary_by_source, track_summary], ignore_index=True)

# Per-difficulty statistics (used for the difficulty z-score).
difficulty_stats = _count_steps(combined_logs, ['user_id', '트랙_난이도'])
difficulty_summary = calculate_stats(difficulty_stats, ['트랙_난이도'], 'step_count', 'step_count')

# Per-cohort statistics (used for the cohort-number z-score).
number_stats = _count_steps(combined_logs, ['user_id', '기수'])
number_summary = calculate_stats(number_stats, ['기수'], 'step_count', 'step_count')

# Progress statistics per project (used for progress_zscore_by_project).
progress_stats = _max_stage_per_user_project(combined_logs)
progress_summary = calculate_stats(progress_stats, ['project_id'], 'max_stage', 'max_stage')

# Progress statistics per source, stacked (used for progress_zscore_by_source).
# Project-progress source.
progress_stats_by_source = _max_stage_per_user_project(project_progress_details)
progress_summary_by_source = calculate_stats(
    progress_stats_by_source, ['project_id'], 'max_stage', 'max_stage'
).assign(source='project_progress')

# Track-log source.
track_progress_stats = _max_stage_per_user_project(track_user_step_check_logs)
track_progress_summary = calculate_stats(
    track_progress_stats, ['project_id'], 'max_stage', 'max_stage'
).assign(source='track_logs')

# Stack both progress summaries.
progress_stats_combined = pd.concat(
    [progress_summary_by_source, track_progress_summary], ignore_index=True
)

In [34]:
# ## 5. Z-스코어 기반 피처 생성
# 사용자별 활동 및 진행률 Z-스코어 피처 생성 함수
# - 입력:
#   - combined_logs: 프로젝트/트랙 로그 통합 (122,679 행, source로 구분)
#   - subscriptions: 구독 데이터 (user_id, 구독_시작일, 구독_종료일)
#   - project_summary: 프로젝트별 step_count의 mean, std
#   - project_stats_combined: 프로젝트/트랙별 step_count의 mean, std (source 구분)
#   - difficulty_summary: 트랙 난이도별 step_count의 mean, std
#   - number_summary: 기수별 step_count의 mean, std
#   - progress_summary: 프로젝트별 max_stage의 mean, std
#   - progress_stats_combined: 프로젝트/트랙별 max_stage의 mean, std (source 구분)

def _mean_zscore(user_values, reference_stats, keys, value_col, stat_name, how='left'):
    """Average z-score of one user's per-group values against reference statistics.

    Args:
        user_values: DataFrame with the ``keys`` column(s) and ``value_col``.
        reference_stats: DataFrame with the ``keys`` column(s) plus
            ``mean_<stat_name>`` and ``std_<stat_name>``.
        keys: column name or list of column names to join on.
        value_col: column of ``user_values`` holding the user's value per group.
        stat_name: suffix of the mean/std columns in ``reference_stats``.
        how: merge type; ``'inner'`` drops groups that have no reference row.

    Returns:
        The mean z-score as a float (NaN when no group has a usable std), or
        ``None`` when there is nothing to score.
    """
    if user_values.empty:
        return None
    merged = user_values.merge(reference_stats, on=keys, how=how)
    if merged.empty:
        return None
    spread = merged[f'std_{stat_name}']
    # A zero standard deviation would turn the z-score into +/-inf and poison the mean;
    # treat it as undefined (NaN) so that group is skipped by the NaN-skipping mean below.
    spread = spread.where(spread != 0)
    zscores = (merged[value_col] - merged[f'mean_{stat_name}']) / spread
    return zscores.mean()


def aggregate_learning_logs(combined_logs, subscriptions, project_summary, project_stats_combined, 
                           difficulty_summary, number_summary, progress_summary, progress_stats_combined):
    """Build per-subscription activity and progress z-score features.

    For every row of ``subscriptions`` the logs of that user that fall inside
    [구독_시작일, 구독_종료일] (both ends inclusive) are selected and summarised.

    Args:
        combined_logs: log rows with user_id, project_id, stage, created_at, source,
            트랙_난이도 and 기수 columns.
        subscriptions: rows with user_id, 구독_시작일 and 구독_종료일.
        project_summary: per-project mean/std of step_count.
        project_stats_combined: per-(project, source) mean/std of step_count.
        difficulty_summary: per-difficulty mean/std of step_count.
        number_summary: per-cohort mean/std of step_count.
        progress_summary: per-project mean/std of max_stage.
        progress_stats_combined: per-(project, source) mean/std of max_stage.

    Returns:
        DataFrame with one row per subscription row: the join keys, activity_count,
        six z-score features (missing when not computable), unique_projects,
        last_activity_gap (days) and track_participation.
    """
    # Split the logs by user once instead of scanning the whole table per subscription.
    logs_by_user = dict(tuple(combined_logs.groupby('user_id')))
    no_logs = combined_logs.iloc[0:0]

    aggregated_data = []
    for _, sub_row in subscriptions.iterrows():
        user_id = sub_row['user_id']
        start_date = sub_row['구독_시작일']
        end_date = sub_row['구독_종료일']

        # Logs of this user inside the subscription period.
        candidate_logs = logs_by_user.get(user_id, no_logs)
        user_logs = candidate_logs[
            (candidate_logs['created_at'] >= start_date)
            & (candidate_logs['created_at'] <= end_date)
        ]

        if user_logs.empty:
            # No activity in the period: every z-score feature is left missing.
            activity_zscore_by_project = None
            activity_zscore_by_source = None
            activity_zscore_by_difficulty = None
            activity_zscore_by_number = None
            progress_zscore_by_project = None
            progress_zscore_by_source = None
        else:
            # Activity: number of log rows per project, compared with the project norm.
            project_counts = user_logs.groupby('project_id').size().reset_index(name='activity_count')
            activity_zscore_by_project = _mean_zscore(
                project_counts, project_summary, 'project_id', 'activity_count', 'step_count'
            )

            # Activity per (project, source); groups without reference stats are dropped.
            source_counts = (
                user_logs.groupby(['project_id', 'source']).size().reset_index(name='activity_count')
            )
            activity_zscore_by_source = _mean_zscore(
                source_counts, project_stats_combined, ['project_id', 'source'],
                'activity_count', 'step_count', how='inner'
            )

            # Activity per track difficulty (rows without a difficulty are ignored).
            difficulty_counts = (
                user_logs.groupby('트랙_난이도').size().reset_index(name='activity_count')
                .dropna(subset=['트랙_난이도'])
            )
            activity_zscore_by_difficulty = _mean_zscore(
                difficulty_counts, difficulty_summary, '트랙_난이도', 'activity_count', 'step_count'
            )

            # Activity per cohort number (rows without a cohort are ignored).
            number_counts = (
                user_logs.groupby('기수').size().reset_index(name='activity_count')
                .dropna(subset=['기수'])
            )
            activity_zscore_by_number = _mean_zscore(
                number_counts, number_summary, '기수', 'activity_count', 'step_count'
            )

            # Progress: highest stage reached per project, compared with the project norm.
            project_progress = (
                user_logs.groupby('project_id')['stage'].max().reset_index(name='max_stage')
            )
            progress_zscore_by_project = _mean_zscore(
                project_progress, progress_summary, 'project_id', 'max_stage', 'max_stage'
            )

            # Progress per (project, source); groups without reference stats are dropped.
            source_progress = (
                user_logs.groupby(['project_id', 'source'])['stage'].max().reset_index(name='max_stage')
            )
            progress_zscore_by_source = _mean_zscore(
                source_progress, progress_stats_combined, ['project_id', 'source'],
                'max_stage', 'max_stage', how='inner'
            )

        aggregated_data.append({
            'user_id': user_id, 
            '구독_시작일': start_date, 
            '구독_종료일': end_date,
            'activity_count': len(user_logs), 
            'activity_zscore_by_project': activity_zscore_by_project,
            'activity_zscore_by_source': activity_zscore_by_source, 
            'activity_zscore_by_difficulty': activity_zscore_by_difficulty,
            'activity_zscore_by_number': activity_zscore_by_number, 
            'progress_zscore_by_project': progress_zscore_by_project,
            'progress_zscore_by_source': progress_zscore_by_source, 
            'unique_projects': user_logs['project_id'].nunique(),
            # Days between the last logged activity and the subscription end; with no
            # activity at all the whole subscription length is used instead.
            'last_activity_gap': (
                (end_date - user_logs['created_at'].max()).days 
                if not user_logs.empty else (end_date - start_date).days
            ),
            'track_participation': (user_logs['source'] == 'track_logs').sum() > 0
        })
    
    return pd.DataFrame(aggregated_data)

In [35]:
import numpy as np

# Train/test split.
# reference_date is the same timestamp as the log cutoff `start_date` defined earlier.
reference_date = start_date
target_end_date = reference_date + timedelta(days=7)  # end of the 7-day test window

# Train: subscriptions that ended strictly before the reference date.
# Test: subscriptions ending inside [reference_date, reference_date + 7 days].
# The train bound is exclusive so a subscription ending exactly on reference_date
# cannot land in both splits.
train_subscriptions = subscriptions[subscriptions['구독_종료일'] < reference_date].copy()
test_subscriptions = subscriptions[
    (subscriptions['구독_종료일'] >= reference_date)
    & (subscriptions['구독_종료일'] <= target_end_date)
].copy()

# Aggregate log features per subscription row.
train_aggregated_logs = aggregate_learning_logs(
    combined_logs, train_subscriptions, project_summary, project_stats_combined,
    difficulty_summary, number_summary, progress_summary, progress_stats_combined
)
test_aggregated_logs = aggregate_learning_logs(
    combined_logs, test_subscriptions, project_summary, project_stats_combined,
    difficulty_summary, number_summary, progress_summary, progress_stats_combined
)

# Attach the aggregated features back onto the subscription rows.
train_subscriptions = train_subscriptions.merge(
    train_aggregated_logs, on=['user_id', '구독_시작일', '구독_종료일'], how='left'
)
test_subscriptions = test_subscriptions.merge(
    test_aggregated_logs, on=['user_id', '구독_시작일', '구독_종료일'], how='left'
)

# Infinite z-scores arise when a reference standard deviation is exactly 0 (division by
# zero). Convert them to NaN in every z-score column so the later imputation handles them.
cols = [
    'activity_zscore_by_project', 'activity_zscore_by_source',
    'activity_zscore_by_difficulty', 'activity_zscore_by_number',
    'progress_zscore_by_project', 'progress_zscore_by_source',
]
train_subscriptions[cols] = train_subscriptions[cols].replace([np.inf, -np.inf], np.nan)
test_subscriptions[cols] = test_subscriptions[cols].replace([np.inf, -np.inf], np.nan)

In [36]:
## 6. 추가 피처 생성

def extract_date_features(df, col):
    """Add month, day-of-month and weekday columns derived from a date column.

    The column ``col`` is converted to datetime in place (unparseable values become
    NaT) and three columns ``<col>_month``, ``<col>_day`` and ``<col>_weekday`` are
    added to ``df``. The same (mutated) DataFrame is returned.
    """
    df[col] = pd.to_datetime(df[col], errors='coerce')
    parts = df[col].dt
    derived = (
        ('month', parts.month),
        ('day', parts.day),
        ('weekday', parts.dayofweek),
    )
    for suffix, values in derived:
        df[f'{col}_{suffix}'] = values
    return df

# Apply the same feature steps to both splits; every step mutates the frame in place.
for frame in (train_subscriptions, test_subscriptions):
    # Month / day / weekday features for the subscription start and end dates.
    for col in ['구독_시작일', '구독_종료일']:
        extract_date_features(frame, col)

    # Missing cancellation-request dates are replaced by a sentinel date ...
    frame['해지_신청일'] = frame['해지_신청일'].fillna('2000-12-31')
    # ... the day difference to the subscription start is computed for every row ...
    days_to_cancel = frame['해지_신청일'] - frame['구독_시작일']
    frame['해지일_diff'] = days_to_cancel.dt.days
    # ... and rows carrying the sentinel get a difference of 0.
    is_sentinel = frame['해지_신청일'] == pd.Timestamp('2000-12-31')
    frame.loc[is_sentinel, '해지일_diff'] = 0


In [37]:
## 7. 결측치 처리
# Training-split means of the six z-score features, all taken before anything is filled.
(
    zscore_by_project_mean,
    zscore_by_source_mean,
    zscore_by_difficulty_mean,
    zscore_by_number_mean,
    progress_zscore_by_project_mean,
    progress_zscore_by_source_mean,
) = (
    train_subscriptions[feature].mean()
    for feature in (
        'activity_zscore_by_project',
        'activity_zscore_by_source',
        'activity_zscore_by_difficulty',
        'activity_zscore_by_number',
        'progress_zscore_by_project',
        'progress_zscore_by_source',
    )
)

# Fill the project/source features in both splits with the training means.
# The difficulty and cohort-number features are not filled here.
mean_fill_values = {
    'activity_zscore_by_project': zscore_by_project_mean,
    'activity_zscore_by_source': zscore_by_source_mean,
    'progress_zscore_by_project': progress_zscore_by_project_mean,
    'progress_zscore_by_source': progress_zscore_by_source_mean,
}
for frame in (train_subscriptions, test_subscriptions):
    for feature, fill_value in mean_fill_values.items():
        frame[feature] = frame[feature].fillna(fill_value)

In [38]:
# Missing-value handling that depends on track participation:
# rows without track participation get 0; participants with a missing value get the
# training mean; participants with a value keep it.
def _track_aware_zscore(row, feature, fallback):
    """Return the filled value of `feature` for one row, following the rule above."""
    # `== False` is kept deliberately: only an explicit False selects the 0 branch.
    if row['track_participation'] == False:
        return 0
    current = row[feature]
    return current if pd.notna(current) else fallback


for frame in (train_subscriptions, test_subscriptions):
    for feature, fallback in (
        ('activity_zscore_by_difficulty', zscore_by_difficulty_mean),
        ('activity_zscore_by_number', zscore_by_number_mean),
    ):
        frame[feature] = frame.apply(_track_aware_zscore, axis=1, args=(feature, fallback))

In [39]:
import numpy as np
import pandas as pd

def check_feature_issues(df):
    """Summarise missing and infinite values of the numeric columns of ``df``.

    Returns:
        DataFrame indexed by column name with the columns ``NaN`` (missing count),
        ``inf`` (count of +inf), ``-inf`` (count of -inf), ``max`` and ``min``.
        Only columns with at least one NaN or infinite value are included.
    """
    # Restrict the check to numeric columns.
    num_df = df.select_dtypes(include=[np.number])

    # Count +inf and -inf separately. np.isinf is true for both signs, so using it for
    # the 'inf' column would count every -inf a second time.
    inf_count = num_df.eq(np.inf).sum()
    ne_inf_count = num_df.eq(-np.inf).sum()
    nan_count = num_df.isna().sum()
    max_val = num_df.max()
    min_val = num_df.min()

    result = pd.DataFrame({
        'NaN': nan_count,
        'inf': inf_count,
        '-inf': ne_inf_count,
        'max': max_val,
        'min': min_val
    })

    # Keep only the columns that actually have a problem.
    result = result[(result['NaN'] > 0) | (result['inf'] > 0) | (result['-inf'] > 0)]
    return result

# Usage: print a heading and show the issue summary for each split.
for heading, frame in (
    ("train_subscriptions 결측/이상치 요약", train_subscriptions),
    ("\ntest_subscriptions 결측/이상치 요약", test_subscriptions),
):
    print(heading)
    display(check_feature_issues(frame))

train_subscriptions 결측/이상치 요약


Unnamed: 0,NaN,inf,-inf,max,min
해지까지_일수,1900,0,0,67.0,0.0



test_subscriptions 결측/이상치 요약


Unnamed: 0,NaN,inf,-inf,max,min
해지까지_일수,12,0,0,30.0,1.0


In [40]:
## 8. 결과 저장

# Write both feature tables to CSV without the index column.
for frame, out_path in (
    (train_subscriptions, '../data/processed/train_subscriptions.csv'),
    (test_subscriptions, '../data/processed/test_subscriptions.csv'),
):
    frame.to_csv(out_path, index=False)
