In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw subscription table and order it by subscription start.
# The sort runs on whatever dtype read_csv inferred for sub_start
# (it is only converted to datetime in a later cell).
subscription = pd.read_csv("./data/subscription.csv").sort_values(by=['sub_start'])

# Load the raw play history table.
history = pd.read_csv("./data/history.csv")

#### 從流失/未流失日期(sub_end)往回每七天切分一個欄位，直到訂閱起始日(sub_start)或滿三個月(week0–week12，共 12 週)為止

In [None]:
# Order the play history chronologically so the most recent play_date is last.
history = history.sort_values(by=['play_date'])

# Build the feature table on an independent copy: the original aliased
# `subscription`, so every column assignment below silently rewrote the raw
# frame as well.
feature = subscription.copy()
feature['sub_start'] = pd.to_datetime(feature['sub_start'])

# Rows with a missing sub_end are given the end date 2024-01-22.
# sub_end is parsed to datetime right away so the duration filter below
# compares datetime with datetime instead of raw strings with datetimes.
feature['sub_end'] = pd.to_datetime(feature['sub_end'].fillna('2024-01-22'))

# Drop subscriptions whose length (sub_end - sub_start) is 7 days or less.
# .copy() detaches the filtered rows so later cells can add columns without
# writing into a slice of another frame.
min_duration = pd.Timedelta(days=7)
feature = feature[feature['sub_end'] > feature['sub_start'] + min_duration].copy()
feature

In [None]:
# Build the weekly boundary columns week0 .. week12.
# week{i} is sub_end minus 7*i days; it is left empty (NaT) when that date
# would fall before sub_start, i.e. the subscription is shorter than i weeks.
# With num_weeks = 13 the boundaries span 12 seven-day windows (about 3 months).
feature = feature.copy()  # never write new columns into a filtered slice
feature['sub_start'] = pd.to_datetime(feature['sub_start'])
feature['sub_end'] = pd.to_datetime(feature['sub_end'])
num_weeks = 13

# Vectorised per column instead of a row-by-row iterrows()/.at loop: same
# values, but every week column is consistently datetime64 and the work is
# done once per week rather than once per (row, week) pair.
for i in range(num_weeks):
    boundary = feature['sub_end'] - pd.Timedelta(days=7 * i)
    # Blank out boundaries that precede the subscription start.
    feature[f'week{i}'] = boundary.mask(boundary < feature['sub_start'])

feature


In [None]:
# Keep only play history for ids that are still present in `feature`.
# .copy() so the play_date conversion below writes to an independent frame
# rather than to a slice of the previous `history`.
history = history[history['id'].isin(feature['id'])].copy()

# NOTE(review): the original cell called feature.sort_values(by='week12') here
# and discarded the result, so it had no effect. Presumably it was used
# interactively to pick the cutoff date below — confirm the cutoff still
# matches the data.

# Keep only plays strictly after 2020-11-15.
history['play_date'] = pd.to_datetime(history['play_date'])
history = history[history['play_date'] > '2020-11-15']
history

#### 計算每周遊玩狀況

In [None]:
# Compute per-week play statistics for week1 .. week12.
# For week i the window is (week{i}, week{i-1}]: plays strictly after the
# older boundary and up to and including the newer one. Two columns are
# written per week:
#   week{i}_avg_play_minute  - mean of play_minute over the plays in the window
#   week{i}_total_row_count  - number of history rows in the window
# Rows without both boundaries, or without any play in the window, stay NaN.

# Split the history by id once, so each feature row only scans its own plays
# instead of filtering the whole history table again.
plays_by_id = {play_id: plays for play_id, plays in history.groupby('id')}

for i in range(1, 13):
    print("week", i, "is processing")
    older_col = f'week{i}'
    newer_col = f'week{i-1}'
    avg_col = f'week{i}_avg_play_minute'
    count_col = f'week{i}_total_row_count'

    # Create the output columns, initialised to NaN.
    feature[avg_col] = np.nan
    feature[count_col] = np.nan

    # Only rows where both window boundaries exist can have statistics.
    has_window = feature[older_col].notna() & feature[newer_col].notna()
    candidates = feature[has_window]

    for index, row in tqdm(candidates.iterrows(), total=len(candidates)):
        plays = plays_by_id.get(row['id'])
        if plays is None:
            # This id has no play history at all.
            continue

        in_window = plays[(plays['play_date'] > row[older_col]) & (plays['play_date'] <= row[newer_col])]
        if in_window.empty:
            continue

        # The original also merged the whole feature table with the per-row
        # aggregate on every iteration and never used the result; that merge
        # is removed, and the single-id groupby is replaced by a direct
        # mean/len.
        feature.at[index, avg_col] = in_window['play_minute'].mean()
        feature.at[index, count_col] = len(in_window)
    print("week", i, "is done")

feature


#### 空值處理

In [None]:
# Missing-value handling for the weekly statistic columns.
feature['sub_start'] = pd.to_datetime(feature['sub_start'])
feature['sub_end'] = pd.to_datetime(feature['sub_end'])

# Number of whole weeks between sub_start and sub_end, capped at 12.
# Kept as a standalone Series instead of a temporary column that has to be
# dropped again afterwards.
weeks_passed = np.minimum(12, ((feature['sub_end'] - feature['sub_start']).dt.days / 7).astype(int))

# For every week the subscription actually covered (i <= weeks_passed), a
# missing statistic becomes 0. Column matching is done once per week and the
# fill is vectorised, replacing the rows x weeks x columns Python loop.
for i in range(13):
    week_columns = [column for column in feature.columns if f'week{i}_' in column]
    covered = weeks_passed >= i
    for column in week_columns:
        feature[column] = feature[column].mask(covered & feature[column].isna(), 0)

# Everything still missing (weeks the subscription never reached, and any
# other null in the frame) is replaced with the string 'none'.
feature = feature.fillna('none')

#### 計算日平均

In [None]:
# Daily average number of history rows for each week.
# week{i}_total_row_count counts the rows of ONE seven-day window
# (week{i}, week{i-1}], so the daily average is count / 7.
# NOTE(review): the original divided by 7 * i, which shrinks older weeks by
# their distance from sub_end (week 12 was divided by 84). That would only be
# right for cumulative counts, which these are not — confirm no downstream
# consumer relies on the old scaling.
days_per_window = 7

for i in range(1, 13):
    col_row_count = f"week{i}_total_row_count"

    # Non-numeric placeholders such as 'none' become NaN here and are turned
    # back into 'none' by the fillna below. The output column is always
    # created, even when no row has a numeric count.
    counts = pd.to_numeric(feature[col_row_count], errors='coerce')
    feature[f"week{i}_avg_row_count_by_day"] = counts / days_per_window

feature = feature.fillna('none')

In [None]:
# Persist the finished feature table as CSV; the DataFrame index is not written.
feature.to_csv(path_or_buf='./data/feature_week.csv', index=False)