In [173]:
# 라이브러리 임포트
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, f1_score

train_subscriptions = pd.read_csv('../data/processed/train_subscriptions.csv')
test_subscriptions = pd.read_csv('../data/processed/test_subscriptions.csv')

# 날짜 피처 추출 함수 정의
def extract_date_features(df, col):
    df[col] = pd.to_datetime(df[col], errors='coerce')  # 문자열 → datetime 변환
    df[f'{col}_month'] = df[col].dt.month
    df[f'{col}_day'] = df[col].dt.day
    df[f'{col}_hour'] = df[col].dt.hour
    df[f'{col}_weekday'] = df[col].dt.dayofweek  # 0 = Monday, 6 = Sunday
    return df

# 피처 엔지니어링 및 모델 학습·평가
date_columns = ['구독_시작일', '구독_종료일']

# 날짜 피처 생성 (Train/Test)
for col in date_columns:
    train_subscriptions = extract_date_features(train_subscriptions, col)
    test_subscriptions  = extract_date_features(test_subscriptions, col)

# 해지 신청일 NaN 처리
def fill_cancel_date(df):
    df['해지_신청일'] = df['해지_신청일'].fillna('2000-12-31 00:00:00')
    return df

train_subscriptions = fill_cancel_date(train_subscriptions)
test_subscriptions  = fill_cancel_date(test_subscriptions)

# 구독 시작일부터 해지일까지 기간 계산 (일)
for df in [train_subscriptions, test_subscriptions]:
    df['해지일_diff'] = (pd.to_datetime(df['해지_신청일']) - pd.to_datetime(df['구독_시작일'])).dt.days
    df.loc[df['해지_신청일'] == pd.Timestamp('2000-12-31'), '해지일_diff'] = 0

# 종속변수(y) 생성
y_train = train_subscriptions['구독_상태'].map({'연장': 0, '해지': 1})
y_test  = test_subscriptions['구독_상태'].map({'연장': 0, '해지': 1})

# 독립변수(X) 설정 (불필요 칼럼 제거)
drop_cols = [
    '구독_상태', '구독_종료일', '해지까지_일수', '총_구독_일수',
    '구독_시작일', '해지_신청일', '구독_종료일_hour',
    '구독_종료일_day', '구독_종료일_month',
    '총_구독_횟수', 'track_participation', 'progress_zscore_by_project'
]
X_train = train_subscriptions.drop(columns=drop_cols)
X_test  = test_subscriptions.drop(columns=drop_cols)

In [174]:
# 모델 정의
models = {
    'RandomForest': RandomForestClassifier(random_state=11),
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'GradientBoosting': GradientBoostingClassifier(random_state=11),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=11),
    'LightGBM': LGBMClassifier(random_state=11)
}

# 실험 결과 저장용 리스트
experiment_results = []

# 모델 학습 및 평가 루프
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    experiment_results.append({
        'model': name,
        'accuracy': accuracy_score(y_test, y_pred),
        'f1_score': f1_score(y_test, y_pred)
    })

# 결과 DataFrame 변환 및 분석
experiment_df = pd.DataFrame(experiment_results)
display(experiment_df.sort_values('f1_score', ascending=False))

# 최고 f1_score 모델 출력
best_model_name = experiment_df.loc[experiment_df['f1_score'].idxmax(), 'model']
print(f"▶️ 최고 f1_score 모델: {best_model_name}")
best_model = models[best_model_name]



[LightGBM] [Info] Number of positive: 1614, number of negative: 1163
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001066 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1797
[LightGBM] [Info] Number of data points in the train set: 2777, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.581203 -> initscore=0.327713
[LightGBM] [Info] Start training from score 0.327713


Unnamed: 0,model,accuracy,f1_score
1,LogisticRegression,0.956522,0.952381
0,RandomForest,0.913043,0.909091
2,GradientBoosting,0.869565,0.869565
3,XGBoost,0.826087,0.833333
4,LightGBM,0.826087,0.833333


▶️ 최고 f1_score 모델: LogisticRegression


In [176]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score  # precision_score 추가

# 모델 정의
models = {
    'RandomForest': RandomForestClassifier(random_state=11),
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'GradientBoosting': GradientBoostingClassifier(random_state=11),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=11),
    'LightGBM': LGBMClassifier(random_state=11)
}

# 실험 결과 저장용 리스트
experiment_results = []

# 모델 학습 및 평가 루프
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    experiment_results.append({
        'model': name,
        'accuracy': accuracy_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),  # 이전에 추가된 Recall
        'precision': precision_score(y_test, y_pred),  # Precision 추가
        'f1_score': f1_score(y_test, y_pred)
    }) 

# 결과 DataFrame 변환 및 분석
experiment_df = pd.DataFrame(experiment_results)
display(experiment_df.sort_values('f1_score', ascending=False))

# 최고 f1_score 모델 출력
best_model_name = experiment_df.loc[experiment_df['f1_score'].idxmax(), 'model']
print(f"▶️ 최고 f1_score 모델: {best_model_name}")
best_model = models[best_model_name]



[LightGBM] [Info] Number of positive: 1614, number of negative: 1163
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000861 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1797
[LightGBM] [Info] Number of data points in the train set: 2777, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.581203 -> initscore=0.327713
[LightGBM] [Info] Start training from score 0.327713


Unnamed: 0,model,accuracy,recall,precision,f1_score
1,LogisticRegression,0.956522,1.0,0.909091,0.952381
0,RandomForest,0.913043,1.0,0.833333,0.909091
2,GradientBoosting,0.869565,1.0,0.769231,0.869565
3,XGBoost,0.826087,1.0,0.714286,0.833333
4,LightGBM,0.826087,1.0,0.714286,0.833333


▶️ 최고 f1_score 모델: LogisticRegression


# 구독 해지 예측 사용자 메일링 리스트 생성

In [177]:
# import datetime
# from pathlib import Path

# # 1) 모델 예측 (확률, 예측값)
# y_pred_proba = best_model.predict_proba(X_test)[:, 1]
# y_pred = best_model.predict(X_test)

# # 2) 운영용 DataFrame 생성: user_id ➡ test_meta에서 가져온 구독_시작일 ➡ 이메일
# oper_df = X_test[['user_id']].copy()

# # 2-1) test_meta와 병합해서 원본 구독_시작일(datetime) 붙이기
# test_subscriptions = pd.read_csv('../data/processed/test_subscriptions.csv', parse_dates=['구독_시작일'])
# test_meta = test_subscriptions[['user_id', '구독_시작일']].copy()
# oper_df = oper_df.merge(test_meta[['user_id', '구독_시작일']], on  ='user_id', how ='left')

# # 2-2) 이메일 정보 병합
# user_mail = pd.read_csv('../data/processed/user.csv')
# oper_df = oper_df.merge(user_mail[['user_id', 'mail']], on  ='user_id', how ='left')

# # 3) 모델 예측 결과를 컬럼으로 추가
# oper_df['해지할 확률'] = y_pred_proba
# oper_df['구독_상태 예측'] = np.where(y_pred == 1, '해지', '연장')

# # 4) 운영용 필터링: “해지”로 예측된 유저만 추출
# mail_list = oper_df[oper_df['구독_상태 예측'] == '해지'][['user_id', 'mail', '구독_시작일', '해지할 확률']].copy()

# # 5) 컬럼명 한글화 (원하시는 대로 바꿔주세요)
# mail_list = mail_list.rename(columns={
#     'user_id'    : '사용자_ID',
#     'email'      : '이메일',
#     '구독_시작일' : '구독_시작일',
#     '해지할 확률' : '해지할 확률'
# })

# mail_list = mail_list.sort_values(by = '구독_시작일')
# display(mail_list)

# # 6) CSV 저장 (운영용)
# today_str = pd.to_datetime('2025-02-26').strftime('%Y-%m-%d')  # pd.to_datetime('today').normalize()

# base = Path("../data/results")
# mail_dir = base / "mail"
# mail_list.to_csv(mail_dir / f"{today_str}_mail_list.csv", index=False)

# 구독 해지 예측 디버깅 테이블 생성

In [178]:
# debug_df = X_test[['user_id']].copy()
# debug_df = debug_df.merge(test_meta[['user_id', '구독_시작일']], on  ='user_id', how ='left')
# debug_df = debug_df.merge(user_mail[['user_id', 'mail']], on  ='user_id', how ='left')

# debug_df['해지할 확률'] = y_pred_proba
# debug_df['구독 상태 예측'] = np.where(y_pred == 1, '해지', '연장')
# debug_df['구독 상태 실제 결과'] = np.where(y_test.values == 1, '해지', '연장')

# debug_df = debug_df.rename(columns={
#     'user_id'             : '사용자_ID',
#     'mail'               : '이메일',
#     '구독_시작일'          : '구독_시작일',
#     '해지할 확률'          : '해지할 확률',
#     '구독 상태 예측'       : '구독 상태 예측',
#     '구독 상태 실제 결과'  : '구독 상태 실제 결과'
# })
# debug_df = debug_df.sort_values(by = '구독_시작일')
# display(debug_df)

# debug_dir = base / "debug"
# # f1_score 기준 내림차순 정렬
# score = experiment_df.sort_values('f1_score', ascending=False)

# # 최고 점수, 모델명 추출
# best_model_name = score.iloc[0]['model']
# best_f1_score = score.iloc[0]['f1_score']

# # 파일명 만들기
# filename = f"{today_str}_prediction_debug_{best_model_name}_{best_f1_score:.4f}.csv"

# # 저장
# debug_df.to_csv(debug_dir / filename, index=False)