In [1]:
import polars as pl
import numpy as np

Dataset link: [GitHub - koaning/wow-avatar-datasets](https://github.com/koaning/wow-avatar-datasets/blob/main/wow-full.parquet)

In [2]:
# Read the full WoW avatar log; the parquet file is looked up relative to the working directory.
df = pl.read_parquet("wow-full.parquet")

In [3]:
# Preview the first rows to check the column names and dtypes.
df.head()

player_id,guild,level,race,class,where,datetime
i32,f64,i8,cat,cat,cat,datetime[ms]
46340,53.0,26,"""Orc""","""Hunter""","""Razorfen Kraul…",2007-06-12 03:17:48
31887,,10,"""Orc""","""Hunter""","""Durotar""",2007-06-12 03:17:48
47258,,15,"""Orc""","""Warrior""","""The Barrens""",2007-06-12 03:17:48
17448,,43,"""Orc""","""Hunter""","""Silverpine For…",2007-06-12 03:17:48
45159,104.0,57,"""Orc""","""Warlock""","""Winterspring""",2007-06-12 03:17:53


In [4]:
# Daily activity: number of logged rows per calendar date.
(
    df.with_columns(pl.col("datetime").dt.date().alias("date"))
    .group_by("date")
    .len()
    .plot("date", "len")
)

Let us now move towards Machine Learning.

In [5]:
# One row per player; `target` is True when the player's most recent
# log entry falls in 2007 or later.
df_target = (
    df.group_by("player_id")
    .agg(pl.col("datetime").max().alias("last_seen"))
    .select(
        "player_id",
        (pl.col("last_seen").dt.year() >= 2007).alias("target"),
    )
)

In [6]:
# Per-player feature table joined with the label. Players with 10 or
# fewer log rows are dropped, and the id column is removed so it cannot
# be used as a feature.
ml_df = (
    df.group_by("player_id")
    .agg(
        pl.col("level").max(),
        pl.col("class").first(),
        pl.col("race").first(),
        n_row=pl.len(),
    )
    .join(df_target, on="player_id")
    .drop("player_id")
    .filter(pl.col("n_row").gt(10))
)

In [7]:
# Separate the label vector from the feature frame.
y = np.array(ml_df.get_column("target"))
X = ml_df.select(pl.exclude("target"))
X.head()

level,class,race,n_row
i8,cat,cat,u32
18,"""Hunter""","""Troll""",106
57,"""Death Knight""","""Blood Elf""",16
8,"""Hunter""","""Troll""",30
70,"""Priest""","""Undead""",4289
12,"""Paladin""","""Blood Elf""",31


In [8]:
from skrub import SelectCols
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, recall_score, precision_score, accuracy_score

In [9]:
# Categorical branch: one-hot encode the class and race columns.
categorical_branch = make_pipeline(
    SelectCols(["class", "race"]),
    OneHotEncoder(handle_unknown="infrequent_if_exist"),
)

# Numeric branch: select the level column and pass it through as-is.
numeric_branch = make_pipeline(SelectCols(["level"]))

# Concatenate both branches and feed them to a logistic regression.
pipe = make_pipeline(
    make_union(categorical_branch, numeric_branch),
    LogisticRegression(max_iter=2000),
)

In [10]:
import numpy as np

# Accuracy on the very rows the model was fitted on (no hold-out).
(pipe.fit(X, y).predict(X) == y).mean()

0.8308071286730513

In [11]:
from sklearn.metrics import precision_score, accuracy_score, recall_score, make_scorer
from sklearn.model_selection import cross_validate

# Rebuild the label vector and feature frame from the per-player table.
y = np.array(ml_df.get_column("target"))
X = ml_df.select(pl.exclude("target"))

# Metric name -> scorer, in the order they should appear in the results.
scorers = {
    name: make_scorer(metric)
    for name, metric in [
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
    ]
}
cross_validate(pipe, X, y, cv=5, scoring=scorers)

{'fit_time': array([0.1542871 , 0.08812308, 0.21422696, 0.1478188 , 0.13812685]),
 'score_time': array([0.00765991, 0.00752473, 0.00674915, 0.00745916, 0.00724912]),
 'test_accuracy': array([0.83084077, 0.83084077, 0.83071509, 0.83081951, 0.83081951]),
 'test_precision': array([0.83084077, 0.83084077, 0.83071509, 0.83081951, 0.83081951]),
 'test_recall': array([1., 1., 1., 1., 1.])}

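Ah yes. These numbers all give us confidence ... but alas ... this is a data leak that can become a black hole. Note that recall is exactly 1 and that accuracy and precision both equal the share of positive labels (checked in the next cell): the model simply predicts `True` for every player. On top of that, the features are aggregated over the same full log that the target's last-seen date is taken from.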

In [12]:
# Share of positive labels and number of players in the table.
y.mean(), y.shape

(0.8308071286730513, (39783,))

Let's write a safety mechanism now.

In [13]:
from datetime import datetime, timedelta

def churn_dataset_generator(df, user_id, feature_pipeline, 
                            info_period=180, 
                            checking_period=180, 
                            start_date=datetime(2007, 1, 1), 
                            end_date=datetime(2007, 12, 31), 
                            step="1mo", 
                            time_col="datetime"):
    """
    Generates X,y pairs for churn related machine learning, with way less temporal data leaks to worry about. 

    For every cutoff date between `start_date` and `end_date`, features are
    computed only from the `info_period` days *before* the cutoff, and the
    target only from the `checking_period` days *from the cutoff onwards*.
    A user gets target 1 if they have at least one row in the checking
    period and 0 otherwise. Cutoffs whose info or checking window would
    reach outside the time range covered by `df` are skipped.

    Arguments:

    - df: a Polars dataframe that contains logs over time for users
    - user_id: the column name that depicts the user id
    - feature_pipeline: a Polars compatible function that generates ML features to go in `X`;
      its output must keep the `user_id` column (one row per user) so the target can be joined on it
    - info_period: the number of days that the info (feature) period lasts
    - checking_period: the number of days that the checking period lasts
    - start_date: the start date for X,y-pair generation
    - end_date: the end date for X,y-pair generation
    - step: stepsize over time for new X,y-pairs. defaults to a month. 
    - time_col: column name that depicts the datetime stamp

    Yields:

    - X: the output of `feature_pipeline` without the `user_id` and `target` columns
    - y: a numpy integer array (1 = seen in the checking period, 0 = not seen), aligned with the rows of `X`
    """
    cutoff_start = pl.datetime_range(start_date, end_date, step, eager=True).alias(time_col)
    min_date = df[time_col].min()
    max_date = df[time_col].max()

    # An empty frame has no time range, so no window can ever fit inside it.
    if min_date is None or max_date is None:
        return

    for start in cutoff_start.to_list():
        info_period_start = start - timedelta(days=info_period)
        checking_period_end = start + timedelta(days=checking_period)
        # Skip cutoffs whose windows are not fully covered by the data.
        if info_period_start < min_date:
            continue
        if checking_period_end > max_date:
            continue
        print(info_period_start, start, checking_period_end, min_date, max_date)

        # Features may only look at [info_period_start, start) ...
        train_info = df.filter(pl.col(time_col) < start, pl.col(time_col) >= info_period_start)
        # ... and the label only at [start, checking_period_end).
        valid_info = df.filter(pl.col(time_col) >= start, pl.col(time_col) < checking_period_end)

        # Every user seen in the checking period counts as a positive.
        target = valid_info.select(user_id).unique().with_columns(target=True)

        # Left join keeps all users from the info period; users missing from
        # the checking period come out as null and are turned into False.
        ml_df = (train_info
                 .pipe(feature_pipeline)
                 .join(target, on=user_id, how="left")
                 .with_columns(pl.col("target").fill_null(False)))

        # The id column is dropped so it cannot be used as a feature.
        X = ml_df.drop("target", user_id)
        y = np.array(ml_df["target"]).astype(int)

        yield X, y

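Now, when we run the aggregation, the features only come from the info period before each cutoff date, so we no longer steal data from the future that we could not have had at prediction time.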

In [14]:
def feature_pipeline(dataf):
    """Collapse a log frame to one row per player.

    Keeps the first `race` and `class` seen for the player, the maximum
    `level`, and the number of log rows as `n_row`.
    """
    per_player = dataf.group_by("player_id")
    return per_player.agg(
        pl.col("race").first(),
        pl.col("class").first(),
        pl.col("level").max(),
        n_row=pl.len(),
    )

gen = churn_dataset_generator(
    df,
    user_id="player_id",
    feature_pipeline=feature_pipeline,
    info_period=120,
    checking_period=120,
)

# The scorer mapping is identical for every window, so it is built once.
scorers = {
    name: make_scorer(metric)
    for name, metric in [
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
    ]
}

# Cross-validate the pipeline on each time window and print the fold scores.
for X, y in gen:
    print(pl.DataFrame(cross_validate(pipe, X, y, cv=5, scoring=scorers)))

2006-09-03 00:00:00 2007-01-01 00:00:00 2007-05-01 00:00:00 2005-12-31 23:59:46 2009-01-10 05:08:59
shape: (5, 5)
┌──────────┬────────────┬───────────────┬────────────────┬─────────────┐
│ fit_time ┆ score_time ┆ test_accuracy ┆ test_precision ┆ test_recall │
│ ---      ┆ ---        ┆ ---           ┆ ---            ┆ ---         │
│ f64      ┆ f64        ┆ f64           ┆ f64            ┆ f64         │
╞══════════╪════════════╪═══════════════╪════════════════╪═════════════╡
│ 0.027518 ┆ 0.003589   ┆ 0.700935      ┆ 0.762815       ┆ 0.639476    │
│ 0.026381 ┆ 0.003645   ┆ 0.708333      ┆ 0.767123       ┆ 0.652586    │
│ 0.023318 ┆ 0.003554   ┆ 0.702882      ┆ 0.767075       ┆ 0.638019    │
│ 0.021533 ┆ 0.004087   ┆ 0.716511      ┆ 0.768719       ┆ 0.672489    │
│ 0.025888 ┆ 0.003689   ┆ 0.718458      ┆ 0.77892        ┆ 0.661572    │
└──────────┴────────────┴───────────────┴────────────────┴─────────────┘
2006-10-04 00:00:00 2007-02-01 00:00:00 2007-06-01 00:00:00 2005-12-31 23:59:46 200