In [1]:
import polars as pl
import numpy as np

In [5]:
df = pl.read_parquet("wow-full.parquet")

In [6]:
df.head()

player_id,guild,level,race,class,where,datetime
i32,f64,i8,cat,cat,cat,datetime[ms]
46340,53.0,26,"""Orc""","""Hunter""","""Razorfen Kraul…",2007-06-12 03:17:48
31887,,10,"""Orc""","""Hunter""","""Durotar""",2007-06-12 03:17:48
47258,,15,"""Orc""","""Warrior""","""The Barrens""",2007-06-12 03:17:48
17448,,43,"""Orc""","""Hunter""","""Silverpine For…",2007-06-12 03:17:48
45159,104.0,57,"""Orc""","""Warlock""","""Winterspring""",2007-06-12 03:17:53


In [7]:
(df
 .with_columns(date=pl.col("datetime").dt.date())
 .group_by("date")
 .len()
 .plot("date", "len"))

In [8]:
df_target = (
    df
     .group_by("player_id")
     .agg(pl.col("datetime").max())
     .with_columns(target=pl.col("datetime").dt.year() > 2008)
     .drop("datetime")
)

Let's cause some disaster, shall we?

In [9]:
ml_df = (
    df
    .group_by("player_id")
    .agg(
        pl.col("level").max(), 
        pl.col("class").first(), 
        pl.col("race").first(),
        pl.col("datetime").len().alias("n_row"),
        pl.col("datetime").min().alias("min_dt")
    )
    .join(df_target, on="player_id")
    .filter(
        pl.col("n_row") > 10, 
        pl.col("min_dt").dt.year() < 2008)
    .drop("min_dt")
)

In [10]:
y = np.array(ml_df['target']).astype(int)
X = ml_df.drop("target", "player_id")
X.head()

level,class,race,n_row
i8,cat,cat,u32
70,"""Paladin""","""Blood Elf""",13766
17,"""Warlock""","""Undead""",38
70,"""Paladin""","""Blood Elf""",8357
41,"""Hunter""","""Orc""",661
1,"""Warrior""","""Orc""",24


In [14]:
from skrub import SelectCols
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

In [15]:
pipe = make_pipeline(
    make_union(
        make_pipeline(
            SelectCols(["class", "race"]),
            OneHotEncoder(handle_unknown="infrequent_if_exist")
        ),
        make_pipeline(
            SelectCols(["level"]),
        )
    ),
    LogisticRegression(max_iter=1_000)
)

In [16]:
import numpy as np

np.mean(pipe.fit(X, y).predict(X) == y)

0.9303152633602461

In [17]:
from sklearn.metrics import precision_score, accuracy_score, recall_score, make_scorer
from sklearn.model_selection import cross_validate

scorers = {
    "accuracy": make_scorer(accuracy_score), 
    "precision": make_scorer(precision_score), 
    # "recall": make_scorer(recall_score)
}
cross_validate(pipe, X, y, cv=5, scoring=scorers)

{'fit_time': array([0.08240533, 0.05641603, 0.03403306, 0.04501104, 0.04488516]),
 'score_time': array([0.0064199 , 0.00530672, 0.00515604, 0.00545287, 0.00459003]),
 'test_accuracy': array([0.93176358, 0.9292007 , 0.93079141, 0.92950977, 0.93095162]),
 'test_precision': array([1.        , 0.93333333, 1.        , 0.94117647, 1.        ])}

Ah yes. These numbers all give us confidence ... but alas ... this is a dataleak that can become a black hole. 

In [18]:
from datetime import datetime, timedelta

def dataset_generator(df, info_period=90, checking_period=30, start_date=datetime(2007, 1, 1), end_date=datetime(2007, 12, 31), step="1mo", time_col="datetime"):
    cutoff_start = pl.datetime_range(start_date, end_date, "1mo", eager=True).alias(time_col);
    
    for start in cutoff_start.to_list():
        train_info = df.filter(pl.col(time_col) < start, pl.col(time_col) < (start - timedelta(days=info_period)))
        valid_info = df.filter(pl.col(time_col) > start, pl.col(time_col) < (start + timedelta(days=checking_period)))
    
        target = valid_info.select("player_id").unique().with_columns(target=True)
    
        ml_df = (train_info
         .group_by("player_id")
         .agg(
             pl.col("race").first(), 
             pl.col("class").first(), 
             pl.col("level").max(), 
             pl.len())
         .join(target, on="player_id", how="left")
         .with_columns(target=pl.when(pl.col("target")).then(True).otherwise(False)))
        
        X = ml_df.drop("target", "player_id")
        y = np.array(ml_df["target"]).astype(int)
        
        yield X, y

pipe = make_pipeline(
    make_union(
        make_pipeline(
            SelectCols(["race", "class"]),
            OneHotEncoder(min_frequency=10, handle_unknown="ignore")
        ),
        SelectCols(["level", "len"])
    ),
    LogisticRegression(max_iter=1000)
)

for X, y in dataset_generator(df):
    scorers = {
        "accuracy": make_scorer(accuracy_score), 
        "precision": make_scorer(precision_score), 
        "recall": make_scorer(recall_score)
    }
    print(pl.DataFrame(cross_validate(pipe, X, y, cv=5, scoring=scorers)))
    

shape: (5, 5)
┌──────────┬────────────┬───────────────┬────────────────┬─────────────┐
│ fit_time ┆ score_time ┆ test_accuracy ┆ test_precision ┆ test_recall │
│ ---      ┆ ---        ┆ ---           ┆ ---            ┆ ---         │
│ f64      ┆ f64        ┆ f64           ┆ f64            ┆ f64         │
╞══════════╪════════════╪═══════════════╪════════════════╪═════════════╡
│ 0.213968 ┆ 0.006101   ┆ 0.897559      ┆ 0.617284       ┆ 0.214592    │
│ 0.11321  ┆ 0.005774   ┆ 0.890352      ┆ 0.528169       ┆ 0.160944    │
│ 0.161097 ┆ 0.005769   ┆ 0.895858      ┆ 0.622047       ┆ 0.169528    │
│ 0.07757  ┆ 0.00541    ┆ 0.893464      ┆ 0.566879       ┆ 0.190987    │
│ 0.178321 ┆ 0.005549   ┆ 0.892985      ┆ 0.565517       ┆ 0.175966    │
└──────────┴────────────┴───────────────┴────────────────┴─────────────┘
shape: (5, 5)
┌──────────┬────────────┬───────────────┬────────────────┬─────────────┐
│ fit_time ┆ score_time ┆ test_accuracy ┆ test_precision ┆ test_recall │
│ ---      ┆ ---       

In [20]:
from datetime import datetime, timedelta

def churn_dataset_generator(df, user_id, feature_pipeline, input_period=90, checking_period=30, start_date=datetime(2007, 1, 1), end_date=datetime(2007, 12, 31), step="1mo", time_col="datetime"):
    """
    Generates X,y pairs for churn related machine learning, with way less temporal data leaks to worry about. 

    Arguments:

    - df: a Polars dataframe that contains logs over time for users
    - user_id: the column name that depicts the user id
    - feature_pipeline: a Polars compatible function that generatres ML features to go in `X`
    - input_period: the number of days that the input period lasts
    - checking_period: the number of days that the checking period lasts
    - start_date: the start date for X,y-pair generation
    - end_date: the end date for X,y-pair generation
    - step: stepsize over time for new X,y-pairs. defaults to a month. 
    - time_col: column name that depicts the datetime stamp
    """
    cutoff_start = pl.datetime_range(start_date, end_date, step, eager=True).alias(time_col);
    
    for start in cutoff_start.to_list():
        train_info = df.filter(pl.col(time_col) < start, pl.col(time_col) < (start - timedelta(days=input_period)))
        valid_info = df.filter(pl.col(time_col) > start, pl.col(time_col) < (start + timedelta(days=checking_period)))
    
        target = valid_info.select("player_id").unique().with_columns(target=True)

        ml_df = (train_info
                 .pipe(feature_pipeline)
                 .join(target, on=user_id, how="left")
                 .with_columns(target=pl.when(pl.col("target")).then(True).otherwise(False)))
        
        X = ml_df.drop("target", "player_id")
        y = np.array(ml_df["target"]).astype(int)
        
        yield X, y

In [22]:
def feature_pipeline(dataf):
    return (dataf
             .group_by("player_id")
             .agg(
                 pl.col("race").first(), 
                 pl.col("class").first(),   
                 pl.col("level").max(), 
                 pl.len()))

pipe = make_pipeline(
    make_union(
        make_pipeline(
            SelectCols(["race", "class"]),
            OneHotEncoder(min_frequency=10, handle_unknown="ignore")
        ),
        SelectCols(["level", "len"])
    ),
    LogisticRegression(max_iter=1000)
)

gen = dataset_generator(df, user_id="player_id", feature_pipeline=feature_pipeline)

for X, y in gen:
    scorers = {
        "accuracy": make_scorer(accuracy_score), 
        "precision": make_scorer(precision_score), 
        "recall": make_scorer(recall_score)
    }
    print(pl.DataFrame(cross_validate(pipe, X, y, cv=5, scoring=scorers)))

shape: (5, 5)
┌──────────┬────────────┬───────────────┬────────────────┬─────────────┐
│ fit_time ┆ score_time ┆ test_accuracy ┆ test_precision ┆ test_recall │
│ ---      ┆ ---        ┆ ---           ┆ ---            ┆ ---         │
│ f64      ┆ f64        ┆ f64           ┆ f64            ┆ f64         │
╞══════════╪════════════╪═══════════════╪════════════════╪═════════════╡
│ 0.276897 ┆ 0.0059     ┆ 0.896601      ┆ 0.6            ┆ 0.218884    │
│ 0.18577  ┆ 0.007319   ┆ 0.891549      ┆ 0.546763       ┆ 0.16309     │
│ 0.153764 ┆ 0.00577    ┆ 0.897055      ┆ 0.652542       ┆ 0.165236    │
│ 0.077998 ┆ 0.005549   ┆ 0.892746      ┆ 0.554217       ┆ 0.197425    │
│ 0.190491 ┆ 0.005674   ┆ 0.89514       ┆ 0.601449       ┆ 0.178112    │
└──────────┴────────────┴───────────────┴────────────────┴─────────────┘
shape: (5, 5)
┌──────────┬────────────┬───────────────┬────────────────┬─────────────┐
│ fit_time ┆ score_time ┆ test_accuracy ┆ test_precision ┆ test_recall │
│ ---      ┆ ---       