In [1]:
import pandas as pd
import numpy as np 
import plotnine as p9 

In [2]:
# Load the raw dataset from CSV; replace the placeholder with the real file path before running.
df = pd.read_csv("<PATH TO YOUR FILE>")

In [None]:
def start_pipeline(dataf):
    """Return a copy of ``dataf`` so that later pipeline steps work on their own frame."""
    duplicate = dataf.copy()
    return duplicate

def clean_dataset(dataf):
    """Normalise the column names and drop the ``zone`` column.

    Column names are lower-cased and every space is removed
    (``"Char Class"`` becomes ``"charclass"``).

    The input frame is left untouched: the renaming is done on a new frame
    instead of assigning to ``dataf.columns``, so the function is safe to
    call on a frame that is still used elsewhere.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Frame whose normalised column names include ``zone``.

    Returns
    -------
    pandas.DataFrame
        New frame with normalised column names and without ``zone``.

    Raises
    ------
    KeyError
        If no column is named ``zone`` after normalisation.
    """
    return (dataf
            .rename(columns=lambda c: c.lower().replace(" ", ""))
            .drop(columns="zone"))

def parse_types(dataf):
    """Lower-case the ``charclass`` and ``race`` strings and parse ``timestamp``.

    ``timestamp`` is parsed with the fixed format ``%m/%d/%y %H:%M:%S``.
    A new frame is returned; ``dataf`` itself is not modified.
    """
    timestamp_format = "%m/%d/%y %H:%M:%S"
    return dataf.assign(
        charclass=lambda d: d['charclass'].str.lower(),
        race=lambda d: d['race'].str.lower(),
        timestamp=lambda d: pd.to_datetime(d['timestamp'], format=timestamp_format),
    )

# Cleaning pipeline: copy the raw frame, normalise/drop columns, then parse dtypes.
clean_df = df.pipe(start_pipeline).pipe(clean_dataset).pipe(parse_types)

In [None]:
def add_churn_label(dataf, before_period=("2008-01-01", "2008-03-01"),
                    after_period=("2008-04-01", "2008-06-01"), min_rows=10):
    """Label the characters seen in ``before_period`` as churned or not.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Frame with at least ``char``, ``level`` and ``timestamp`` columns.
    before_period, after_period : pair of date-likes
        Half-open ``[start, end)`` windows, parsed with ``pd.to_datetime``.
    min_rows : int
        A character is kept only when it has strictly more than ``min_rows``
        non-null ``level`` values inside ``before_period``.

    Returns
    -------
    pandas.DataFrame
        The ``before_period`` rows of the kept characters plus a boolean
        ``churned`` column, which is True when the character has no row in
        ``after_period``.
    """
    stamps = dataf['timestamp']

    in_before = (stamps >= pd.to_datetime(before_period[0])) & (stamps < pd.to_datetime(before_period[1]))
    in_after = (stamps >= pd.to_datetime(after_period[0])) & (stamps < pd.to_datetime(after_period[1]))
    before_df = dataf.loc[in_before]
    after_df = dataf.loc[in_after]

    # Characters with enough activity in the "before" window.
    level_counts = before_df.groupby("char")['level'].count()
    before_chars = level_counts[level_counts > min_rows].index

    # Every character that shows up at all in the "after" window.
    after_chars = after_df.groupby("char").size().index

    active_rows = before_df.loc[before_df['char'].isin(before_chars)]
    return active_rows.assign(churned=~active_rows['char'].isin(after_chars))

def prep_ml_features(dataf):
    """Aggregate labelled rows into one feature row per character.

    Rows are grouped by ``(char, churned, race, charclass)`` and each group
    is reduced to:

    * ``guild`` - 1.0 if any row has ``guild > 0``, else 0.0
      (presumably non-members carry a non-positive guild id - TODO confirm).
    * ``min_timestamp`` - earliest ``timestamp`` in the group.
    * ``time_played`` - number of rows in the group.
    * ``max_level`` / ``min_level`` - extremes of ``level``.

    Derived columns added afterwards:

    * ``gamer`` - 1.0 when ``min_timestamp`` is before 2008-01-01 05:00:00.
    * ``level_speed`` - ``(max_level - min_level) / time_played``.
    * ``churned`` - the label cast to float.

    The ``char`` identifier is dropped from the result.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Frame with ``char``, ``churned``, ``race``, ``charclass``, ``guild``,
        ``level`` and ``timestamp`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per group with the columns described above.
    """
    return (dataf
            .groupby(['char', 'churned', 'race', 'charclass'])
            .apply(lambda d: pd.Series({
                'guild': float((d['guild'] > 0).max()),
                # Use the true minimum instead of the first row, so the result
                # does not depend on the rows being sorted by time.
                'min_timestamp': d['timestamp'].min(),
                'time_played': float(d.shape[0]),
                'max_level': float(d['level'].max()),
                'min_level': float(d['level'].min())
            }))
            .reset_index()
            .assign(gamer = lambda d: (d['min_timestamp'] < pd.to_datetime("2008-01-01 05:00:00")).astype(float))
            # time_played is at least 1 for every group, so the division is safe.
            .assign(level_speed = lambda d: (d['max_level'] - d['min_level'])/(d['time_played']))
            .assign(churned = lambda d: d['churned'].astype(float))
            .drop(columns=['char']))

# Label churn (keeping characters with more than 20 rows in the "before" window),
# then collapse the rows to one feature row per character.
ml_df = clean_df.pipe(add_churn_label, min_rows=20).pipe(prep_ml_features)

# Preview the first two feature rows.
ml_df.head(2)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

In [None]:
# Colour per churn label: 0.0 (not churned) -> blue, 1.0 (churned) -> red.
color_function = {0.0: "blue", 1.0: "red"}
colors = ml_df['churned'].map(color_function.get)

# Numeric feature columns to show in the pairwise scatter plot.
pltr = ml_df[['time_played', 'max_level', 'min_level', 'level_speed', 'gamer', 'guild']]

# The trailing semicolon suppresses the repr of the returned axes array.
pd.plotting.scatter_matrix(pltr, c=colors, alpha=0.2, figsize=(6, 6));

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, make_scorer, precision_score, recall_score
from sklearn.model_selection import GridSearchCV

from sklego.datasets import load_chicken
from sklego.preprocessing import ColumnSelector

In [None]:
import itertools as it

In [None]:
# Base pipeline: select feature columns, standardise them, then classify.
# The "grab" and "model" steps are swapped out by the grid search's param_grid.
pipe = Pipeline(steps=[
    (
        "grab",
        ColumnSelector(['time_played', 'max_level', 'min_level', 'level_speed', 'gamer', 'guild']),
    ),
    ("standardize", StandardScaler()),
    ("model", KNeighborsClassifier()),
])

In [None]:
# Scratch check: join a selector's column names into a single "-"-separated label.
"-".join(ColumnSelector(['time_played']).columns)

In [None]:
# Candidate feature subsets for the "grab" step, from one feature up to all six.
grab = [ColumnSelector(['time_played']),
        ColumnSelector(['time_played', 'guild']),
        ColumnSelector(['time_played', 'guild', 'gamer']),
        ColumnSelector(['time_played', 'guild', 'gamer', 'max_level']),
        ColumnSelector(['time_played', 'max_level', 'min_level', 'level_speed', 'gamer', 'guild'])]

# Candidate estimators for the "model" step: k in 14..17 crossed with both weightings.
models = [KNeighborsClassifier(n, weights=d) for n, d in it.product(range(14, 18), ['uniform', 'distance'])]

# NOTE: the former `iid=True` argument is gone on purpose. It was deprecated in
# scikit-learn 0.22 and removed in 0.24, where passing it raises a TypeError.
mod = GridSearchCV(estimator=pipe,
                   n_jobs=8,
                   return_train_score=True,
                   scoring={'precision': make_scorer(precision_score),
                            'recall': make_scorer(recall_score)},
                   # With several scorers, refit names the one used to pick best_estimator_.
                   refit='precision',
                   cv=20,
                   param_grid={"model": models, 'grab': grab})

# The whole frame is passed as X; the "grab" step selects the feature columns,
# so the `churned` label column never reaches the scaler or the classifier.
mod.fit(ml_df, ml_df['churned']);

In [None]:
# Flatten the grid-search results and add readable columns describing each
# candidate: number of selected features, k, and the KNN weighting scheme.
output_df = pd.DataFrame(mod.cv_results_).assign(
    features=lambda d: [len(selector.columns) for selector in d['param_grab']],
    n_neighbors=lambda d: [knn.n_neighbors for knn in d['param_model']],
    knn_weights=lambda d: [knn.weights for knn in d['param_model']],
)

In [None]:
# Mean vs. standard deviation of test precision across folds, one point per
# grid candidate; shape encodes the KNN weighting, colour the feature count.
(p9.ggplot()
 + p9.geom_point(
     data=output_df,
     mapping=p9.aes(
         x="mean_test_precision",
         y="std_test_precision",
         shape="knn_weights",
         color="features",
     ),
 ))

In [None]:
# n_neighbors of the last pipeline step (the "model" estimator) in the refit best pipeline.
mod.best_estimator_.steps[-1][1].n_neighbors

In [None]:
# Four best candidates by precision rank, transposed so each candidate is a column.
# The estimator objects are replaced by a short "<k>-<weights>" label for display.
(output_df
 .assign(param_model=lambda d: [f"{knn.n_neighbors}-{knn.weights}" for knn in d['param_model']])
 .sort_values(by="rank_test_precision")
 .head(4)
 .transpose())