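This script generates weekly user-play features: it splits each subscription period into 7-day windows (counted backward from the subscription end, up to 13 boundaries), computes per-week average play minutes and play counts, fills in missing values, derives per-day play-count averages, and exports the result to CSV.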

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

#### Import data

In [None]:
# Load the raw subscription table and order it by subscription start.
subscription = pd.read_csv("./data/subscription.csv")
subscription = subscription.sort_values(by=['sub_start'])

# Load the raw play history (one row per play event).
history = pd.read_csv("./data/history.csv")

#### Segment the period

In [None]:
# Order the play history chronologically; the last row then carries the
# latest 'play_date'.
history.sort_values(by=['play_date'], inplace=True)

# Build the feature table on an explicit copy so the column assignments below
# never write through to the raw `subscription` frame.
feature = subscription.copy()

# Parse both period bounds BEFORE comparing them: comparing a string
# 'sub_end' against a datetime 'sub_start' raises a TypeError in pandas.
# Subscriptions with a missing 'sub_end' are treated as ending on 2024-01-22.
feature['sub_start'] = pd.to_datetime(feature['sub_start'])
feature['sub_end'] = pd.to_datetime(feature['sub_end'].fillna('2024-01-22'))

# Remove data where the duration (sub_end - sub_start) is less than or equal
# to 7 days. `.copy()` makes the filtered frame independent, so adding
# columns to it is not a chained assignment on a slice.
long_enough = feature['sub_end'] > feature['sub_start'] + pd.Timedelta(days=7)
feature = feature[long_enough].copy()

# Divide the period into weekly segments (7 days), starting from 'sub_end'
# and extending backward until 'sub_start' or a maximum of 3 months
# (13 boundaries: week0 .. week12) is reached. 'week{i}' holds the boundary
# sub_end - 7*i days, or NaT when that boundary falls before 'sub_start'.
num_weeks = 13

for i in range(num_weeks):
    boundary = feature['sub_end'] - pd.Timedelta(days=7 * i)
    feature[f'week{i}'] = boundary.where(boundary >= feature['sub_start'])

# Retain only the history records that correspond to available feature ids
history = history[history['id'].isin(feature['id'])].copy()

# Retain only the history records with a 'play_date' later than '2020-11-15'.
# NOTE(review): the cutoff is hard-coded; presumably it was read off the
# earliest 'week12' boundary during exploration (the original
# `feature.sort_values(by='week12')` call discarded its result and was a
# no-op, so it has been dropped) - confirm before changing the data range.
history['play_date'] = pd.to_datetime(history['play_date'])
history = history[history['play_date'] > '2020-11-15']

#### Calculate weekly usage metrics (e.g., play time, frequency).

In [None]:
# For every weekly window i (1..12) compute, per subscription row:
#   week{i}_avg_play_minute  - mean 'play_minute' of the plays in the window
#   week{i}_total_row_count  - number of history rows (plays) in the window
# The window is the half-open interval (week{i}, week{i-1}].
# Rows with no plays in the window, or without a complete window, stay NaN.
for i in range(1, 13):
    print("week", i, "is processing")
    avg_col = f'week{i}_avg_play_minute'
    count_col = f'week{i}_total_row_count'

    # Add new columns and initialize them with NaN
    feature[avg_col] = np.nan
    feature[count_col] = np.nan

    # Only rows where both window boundaries exist can be evaluated.
    eligible = feature[feature[f'week{i}'].notna() & feature[f'week{i-1}'].notna()]
    # Pre-filter the history to the ids that are actually needed this round.
    history1 = history[history['id'].isin(eligible['id'])]

    # `total` lets tqdm show real progress (iterrows() has no length).
    for index, row in tqdm(eligible.iterrows(), total=len(eligible)):
        in_window = (
            (history1['id'] == row['id'])
            & (history1['play_date'] > row[f'week{i}'])
            & (history1['play_date'] <= row[f'week{i-1}'])
        )
        plays = history1.loc[in_window, 'play_minute']

        # The selection already holds a single id, so mean/len give the same
        # numbers as a groupby('id') aggregation. The per-row merge of the
        # whole feature table that used to happen here was never read and
        # has been removed.
        if not plays.empty:
            feature.at[index, avg_col] = plays.mean()
            feature.at[index, count_col] = len(plays)
    print("week", i, "is done")

#### Handle missing values

In [None]:
feature['sub_start'] = pd.to_datetime(feature['sub_start'])
feature['sub_end'] = pd.to_datetime(feature['sub_end'])

# Number of complete weeks covered by each subscription, capped at 12.
# Kept as a local Series instead of a temporary 'weeks_passed' column.
elapsed_weeks = ((feature['sub_end'] - feature['sub_start']).dt.days // 7).clip(upper=12)

# A missing metric inside a week the subscription actually covered means
# "no activity", so it becomes 0. Weeks beyond the subscription length keep
# their NaN and are turned into 'none' below.
# i starts at 0 to mirror the original loop bounds; this script creates no
# 'week0_' metric columns, so that round normally matches nothing.
for i in range(13):
    metric_cols = [name for name in feature.columns if f'week{i}_' in name]
    if not metric_cols:
        continue
    covered = elapsed_weeks >= i
    feature.loc[covered, metric_cols] = feature.loc[covered, metric_cols].fillna(0)

# Every remaining missing value (in any column) is exported as the string 'none'.
feature.fillna('none', inplace=True)

#### Calculate daily average metrics

In [None]:
# Derive 'week{i}_avg_row_count_by_day' from 'week{i}_total_row_count'.
# The count columns may contain the placeholder string 'none'; those entries
# are coerced to NaN, so the derived value is missing as well and is turned
# back into 'none' by the final fillna. Assigning the whole column at once
# guarantees that all 12 output columns exist even when a week has no
# numeric count at all (the row-by-row version only created the column if at
# least one row was numeric).
for i in range(1, 13):
    counts = pd.to_numeric(feature[f"week{i}_total_row_count"], errors="coerce")
    # NOTE(review): each weekly count covers a single 7-day window, yet the
    # divisor grows with i (7 * i). Kept unchanged to preserve the existing
    # feature values - confirm whether a plain `/ 7` was intended.
    feature[f"week{i}_avg_row_count_by_day"] = counts / (7 * i)

feature.fillna('none', inplace=True)

#### Export

In [None]:
# Write the finished feature table to disk without the DataFrame index.
output_path = './data/feature_week.csv'
feature.to_csv(output_path, index=False)