This script preprocesses subscription data and uses a pre-trained LSTM model to predict user churn.

In [None]:
import pandas as pd
import numpy as np
import torch
import imblearn
from imblearn.over_sampling import RandomOverSampler
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from keras.models import load_model
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Import data. `data` itself is kept as loaded (plus sub_days_group) so the
# predictions can be attached to it at the end of the script.
data = pd.read_csv('./data/feature_week.csv')

# Create sub_days_group column: 1 when sub_days >= 90, otherwise 0
data['sub_days_group'] = np.where(data['sub_days'] >= 90, 1, 0)
data_copy = data.copy()

# One-Hot encoding; multiplying by 1 converts any boolean dummy columns to 0/1
df_onehot = pd.get_dummies(data_copy, columns=['connect', 'platform', 'plan_type'])
df_onehot = df_onehot * 1

# Delete redundant columns, including the bare week0..week12 columns
redundant_columns = [
    'id',
    'sub_start',
    'sub_end',
    'sub_start_new',
    'unique_id',
    'game',
    'unsub_after_holiday',
    'unsub_during_holiday',
    'cover_vacation',
] + [f'week{week_no}' for week_no in range(13)]
df_onehot = df_onehot.drop(columns=redundant_columns)

# Process NA
df_onehot.fillna(0, inplace=True)
df_onehot.replace(to_replace='none', value=0, inplace=True)

# Convert object to number.
# errors='coerce' turns every unparseable value into NaN, which would
# reintroduce missing values after the fillna above, so fill them with 0 again.
object_columns = df_onehot.select_dtypes(include=['object']).columns
for column in object_columns:
    df_onehot[column] = pd.to_numeric(df_onehot[column], errors='coerce').fillna(0)

In [None]:
# Define columns.
# Each of these columns holds a single value per row; the loop below copies it
# unchanged into a 'week{N}_<name>' column for every week N in 1..12.
weekly_feature_names = [
    'sub_before_vacation',
    'sub_during_vacation',
    'plan_type_年訂',
    'plan_type_季訂',
    'plan_type_月訂',
    'sub_days',
    'sub_days_group',
    'casual',
    'free',
    'adventure',
    'action',
    'multiplayer_tactical_competition',
    'massive_multiplayer_online',
    'platformer',
    'fighting',
    'simulation',
    'indie',
    'racing',
    'first_person_shooter',
    'strategy',
    'arcade',
    'role_playing',
    'puzzle',
    'sports',
    'family',
    'connect_ethernet',
    'connect_mobile',
    'connect_none',
    'connect_wifi',
    'platform_android',
    'platform_ios',
    'platform_ipados',
    'platform_linux',
    'platform_macos',
    'platform_none',
    'platform_other',
    'platform_web',
    'platform_windows',
]

# Map each source column to its per-week name template. Building the mapping
# from the names guarantees every template has the 'week{}_<name>' form (the
# hand-written entry for 'connect_ethernet' used to be the truncated 'wee',
# which produced a single column called 'wee' instead of the weekly columns).
columns_to_create = {name: 'week{}_' + name for name in weekly_feature_names}

for week in range(1, 13):
    for key, value in columns_to_create.items():
        df_onehot[value.format(week)] = df_onehot[key]

In [None]:
# Per-week columns that are rescaled before being fed to the model
weekly_metrics = ('avg_play_minute', 'avg_row_count_by_day', 'sub_days')

# Collect the week1..week12 column names for each of those metrics
columns_to_scale = []
for i in range(1, 13):
    columns_to_scale += [f'week{i}_{metric}' for metric in weekly_metrics]

# Min-max scale the collected columns in place.
# NOTE(review): the scaler is fit on this data set rather than loaded from
# training — confirm this matches how the model's inputs were scaled.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df_onehot[columns_to_scale])
df_onehot[columns_to_scale] = scaled_values

In [None]:
def generate_columns(variables, start_week, end_week):
    """
    Build the list of per-week column names.

    For every week from start_week to end_week (both inclusive) each name in
    variables is prefixed with 'week{N}_'. The result is ordered week by week,
    with the variables in their given order inside each week.
    """
    return [
        f'week{week}_' + var
        for week in range(start_week, end_week + 1)
        for var in variables
    ]

# Features fed to the model for each week. The order matters: it fixes the
# layout of the last axis when the selected columns are reshaped later on.
variables = [
    'plan_type_年訂',
    'plan_type_季訂',
    'plan_type_月訂',
    'sub_days',
    'sub_days_group',
    'sub_before_vacation',
    'sub_during_vacation',
    'free',
    'adventure',
    'massive_multiplayer_online',
    'first_person_shooter',
    'avg_play_minute',
    'avg_row_count_by_day',
]

# Column names for weeks 1 through 12, grouped week by week
selected_columns = generate_columns(variables, start_week=1, end_week=12)

In [None]:
X = df_onehot
df_selected_x_test = X.loc[:, selected_columns].copy()

# selected_columns is ordered week by week (12 weeks x 13 variables), so a
# row-major reshape yields an array of shape (rows, week, variable).
n_rows = df_onehot.shape[0]
X_test = np.array(df_selected_x_test).reshape((n_rows, 12, 13))

# Float32 PyTorch tensor holding the same values
X_tensor_test = torch.tensor(X_test, dtype=torch.float32)

In [None]:
# Import model
model = load_model('./data/model/lstm_model_G3.h5')

# Keras documents NumPy arrays (or backend-native tensors) as predict() input;
# a PyTorch tensor is neither for a TensorFlow-backed model, so hand over the
# equivalent float32 NumPy array instead of the tensor itself.
y_pred_prob = model.predict(X_tensor_test.numpy())

# Threshold the predicted probabilities at 0.5 to get 0/1 labels
y_pred = (y_pred_prob > 0.5).astype(int)

print(y_pred)
print(y_pred.shape)

print(data.shape)

# Flatten the predictions to 1-D so that exactly one value per row is assigned
# to the new column; a model emitting several values per row then fails the
# length check below instead of raising inside the assignment.
y_pred_flat = y_pred.ravel()

# Ensure y_pred shape aligns with the DataFrame row count.
if len(y_pred_flat) == len(data):
    data['y_pred'] = y_pred_flat
else:
    print(f"Error: Length of predictions ({len(y_pred_flat)}) does not match length of data ({len(data)})")

print(data)