In [1]:
import datetime, time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn, sklearn.ensemble, sklearn.linear_model, sklearn.metrics
import tensorflow as tf

import modutils

# Month (YYYYMM) of the user-log extract analysed in this notebook.
used_log = '201701'

# Root folder of the churn data set and the per-month log file template;
# `{0}` is filled with a month key such as `used_log`.
data_dir = '../DataSets/Churn/'
logm_fmt = '{0}user_logs/uldtm_{{0}}.csv'.format(data_dir)

In [94]:
# Read the user-log extract for the month selected by `used_log`.
src_path = logm_fmt.format(used_log)
src0 = pd.read_csv(src_path)

In [58]:
# Columns of src0 used as model inputs, and the label column to predict.
features = [
    'used', 'ln_tot', 'ln_sec', 'pct_low',
    'pct_high', 'pct_100', 'pct_unq', 'avg_sec_n',
]
target = 'nxt_used'

# Only rows with a known (non-NaN) label can be used for development.
labelled = src0[~np.isnan(src0[target])]
devX = labelled[features].values
devY = labelled[target].values

# Presumably a 70% / 30% train/validation split -- splitSample lives in the
# project-local modutils module, so confirm its semantics (and seeding) there.
(trainX, trainY), (validX, validY) = modutils.splitSample((devX, devY), pcts=[0.7, 0.3])

def eval_model(m, tX, tY):
    """Return the Gini coefficient (2 * ROC AUC - 1) of a fitted classifier.

    Parameters
    ----------
    m : fitted classifier exposing ``predict_proba``; column 1 of its output
        is used as the score.
    tX : feature matrix handed to ``m.predict_proba``.
    tY : true binary labels aligned with the rows of ``tX``.

    Returns
    -------
    float
        ``2 * AUC - 1``: 0 for a random ranking, 1 for a perfect one.
    """
    tP = m.predict_proba(tX)[:, 1]
    # ROC AUC only depends on the ranking of the scores, so the probabilities
    # are scored directly. Converting them to log-odds first (log(p / (1 - p)))
    # adds nothing to the ranking but yields +/-inf whenever a probability is
    # exactly 0 or 1, which recent scikit-learn versions reject as non-finite.
    return sklearn.metrics.roc_auc_score(tY, tP) * 2 - 1

def set_prefixes(prefix):
    """Build a column-rename mapping for one aggregation window.

    Maps each flattened aggregate column name (e.g. ``'used_sum'``) to its
    window-specific name, formed as ``prefix`` followed by a fixed suffix
    (e.g. ``prefix + '_used'``). The result is suitable for
    ``DataFrame.rename(columns=...)``.
    """
    suffix_by_column = (
        ('used_sum', '_used'),
        ('used_mean', '_avg_used'),
        ('f_lgt_mean', '_lgt_mean'),
        ('f_lgt_min', '_lgt_min'),
        ('f_lgt_max', '_lgt_max'),
    )
    return {column: prefix + suffix for column, suffix in suffix_by_column}

In [15]:
%%time
# Baseline: logistic regression with scikit-learn's default settings.
m_logreg = sklearn.linear_model.LogisticRegression()
m_logreg = m_logreg.fit(trainX, trainY)

Wall time: 719 ms


In [17]:
# Score the logistic regression on the training and validation splits.
gini_t, gini_v = (
    eval_model(m_logreg, X, Y)
    for X, Y in ((trainX, trainY), (validX, validY))
)
print('LogReg: train - {0:.3f}, valid - {1:.3f}'.format(gini_t, gini_v))

LogReg: train - 0.586, valid - 0.587


In [18]:
%%time
# Gradient boosting with scikit-learn's default hyper-parameters.
m_gb01 = sklearn.ensemble.GradientBoostingClassifier()
m_gb01 = m_gb01.fit(trainX, trainY)

Wall time: 36.2 s


In [19]:
# Score the default gradient-boosting model on both splits.
gini_t, gini_v = (
    eval_model(m_gb01, X, Y)
    for X, Y in ((trainX, trainY), (validX, validY))
)
print('GradBoost (default): train - {0:.3f}, valid - {1:.3f}'.format(gini_t, gini_v))

GradBoost (default): train - 0.591, valid - 0.591


In [20]:
%%time
# Gradient boosting with deeper trees (max_depth=5); other settings default.
m_gb02 = sklearn.ensemble.GradientBoostingClassifier(max_depth=5)
m_gb02 = m_gb02.fit(trainX, trainY)

Wall time: 1min 7s


In [21]:
# Score the max_depth=5 gradient-boosting model on both splits.
gini_t, gini_v = (
    eval_model(m_gb02, X, Y)
    for X, Y in ((trainX, trainY), (validX, validY))
)
print('GradBoost (depth=5): train - {0:.3f}, valid - {1:.3f}'.format(gini_t, gini_v))

GradBoost (depth=5): train - 0.595, valid - 0.590


In [22]:
# Per-input importances of the default gradient-boosting model. The inputs
# were built from src0[features], so entries presumably follow the order of
# the `features` list (assuming modutils.splitSample keeps column order).
m_gb01.feature_importances_

array([ 0.        ,  0.0528529 ,  0.30351616,  0.06045091,  0.08258534,
        0.2086368 ,  0.07210676,  0.21985112])

In [23]:
# Per-input importances of the max_depth=5 gradient-boosting model. The inputs
# were built from src0[features], so entries presumably follow the order of
# the `features` list (assuming modutils.splitSample keeps column order).
m_gb02.feature_importances_

array([  7.20952763e-07,   6.70972353e-02,   3.10718574e-01,
         4.99309199e-02,   6.21331482e-02,   1.97939096e-01,
         7.53295736e-02,   2.36850731e-01])

In [95]:
# Score every log row with the first-stage logistic regression.
# `.values` is passed because the model was fitted on a plain ndarray; handing
# it a DataFrame triggers a feature-name mismatch warning on recent
# scikit-learn versions.
src_X = src0[features].values
src_p = m_logreg.predict_proba(src_X)[:, 1]

# 'f_lgt' is the log-odds of the positive class. For a binary
# LogisticRegression with default settings this is exactly what
# decision_function returns (predict_proba is its sigmoid), so it is taken
# from there instead of computing log(p / (1 - p)) by hand, which loses
# precision and becomes +/-inf once a probability saturates at 0 or 1.
# NOTE(review): assumes `nxt_used` is binary (1-D decision_function) -- confirm.
src0['f_lgt'] = m_logreg.decision_function(src_X)

In [96]:
# Per-user aggregates computed for every look-back window.
WINDOW_AGGS = {'used': ['sum', 'mean'], 'f_lgt': ['mean', 'min', 'max']}


def window_features(logs, prefix, last_day, after_day=None):
    """Aggregate the log rows of one date window into per-user features.

    Parameters
    ----------
    logs : DataFrame with at least the columns 'uid', 'date', 'used', 'f_lgt'.
    prefix : window tag prepended to the output column names via set_prefixes.
    last_day : last date included in the window (``date <= last_day``).
    after_day : optional exclusive lower bound (``date > after_day``);
        ``None`` leaves the window open at the start.

    Returns
    -------
    DataFrame indexed by uid with one column per aggregate.
    """
    in_window = logs.date <= last_day
    if after_day is not None:
        in_window &= logs.date > after_day
    per_user = logs[in_window].groupby('uid').agg(WINDOW_AGGS)
    # Flatten the (column, aggregate) MultiIndex into 'column_aggregate' names.
    # Iterating the MultiIndex yields the tuples directly, so the deprecated
    # Index.ravel() call is not needed.
    per_user.columns = ['_'.join(col) for col in per_user.columns]
    return per_user.rename(columns=set_prefixes(prefix))


# Dates are integers in YYYYMMDD form; windows are (after_day, last_day].
srcg05 = window_features(src0, 'g05', after_day=20170125, last_day=20170130)
srcg15 = window_features(src0, 'g15', after_day=20170115, last_day=20170125)
srcg30 = window_features(src0, 'g30', last_day=20170115)

# Target rows: usage flag on the day after the last window.
srcgt = src0[src0.date == 20170131][['uid', 'used']]

In [97]:
# Row counts of the three window aggregates and of the target frame.
tuple(len(frame) for frame in (srcg05, srcg15, srcg30, srcgt))

(1136003, 1136003, 1136003, 1136003)

In [122]:
# Expose the group key as an ordinary 'uid' column for merging on 'uid'.
# groupby('uid') leaves the index *named* 'uid'; keeping that name next to a
# 'uid' column makes pd.merge(..., on='uid') fail on pandas >= 0.24 with
# "'uid' is both an index level and a column label, which is ambiguous",
# so the index name is cleared. The index values are untouched, which keeps
# this cell safe to re-run.
for agg_frame in (srcg05, srcg15, srcg30):
    agg_frame['uid'] = agg_frame.index
    agg_frame.index.name = None

In [123]:
# Attach the three window aggregates to the target rows, one inner join on
# 'uid' at a time; only users present in every frame survive.
s1 = srcgt.merge(srcg05, how='inner', on='uid')
s2 = s1.merge(srcg15, how='inner', on='uid')
srcgg = s2.merge(srcg30, how='inner', on='uid')

In [126]:
# Second-stage inputs: every window prefix combined with every aggregate
# suffix (window-major order), e.g. 'g05_used', 'g05_avg_used', ...
gfeatures1 = ['_used', '_avg_used', '_lgt_mean', '_lgt_min', '_lgt_max']
gfeatures2 = ['g05', 'g15', 'g30']
gfeatures = []
for window in gfeatures2:
    gfeatures.extend(window + suffix for suffix in gfeatures1)
gtarget = 'used'

gdevX = srcgg[gfeatures].values
gdevY = srcgg[gtarget].values

# Presumably a 70% / 30% train/validation split -- splitSample lives in the
# project-local modutils module, so confirm its semantics (and seeding) there.
(gtrainX, gtrainY), (gvalidX, gvalidY) = modutils.splitSample((gdevX, gdevY), pcts=[0.7, 0.3])

In [127]:
%%time
# Second-stage baseline: default logistic regression on the window features.
gm_logreg = sklearn.linear_model.LogisticRegression()
gm_logreg = gm_logreg.fit(gtrainX, gtrainY)

Wall time: 7.16 s


In [128]:
# Score the second-stage logistic regression on both splits.
gini_t, gini_v = (
    eval_model(gm_logreg, X, Y)
    for X, Y in ((gtrainX, gtrainY), (gvalidX, gvalidY))
)
print('LogReg: train - {0:.3f}, valid - {1:.3f}'.format(gini_t, gini_v))

LogReg: train - 0.695, valid - 0.695


In [129]:
%%time
# Second-stage gradient boosting (default hyper-parameters) on the window features.
gm_gb01 = sklearn.ensemble.GradientBoostingClassifier()
gm_gb01 = gm_gb01.fit(gtrainX, gtrainY)

Wall time: 4min 19s


In [130]:
# Score the second-stage gradient-boosting model on both splits.
gini_t, gini_v = (
    eval_model(gm_gb01, X, Y)
    for X, Y in ((gtrainX, gtrainY), (gvalidX, gvalidY))
)
print('GradBoost: train - {0:.3f}, valid - {1:.3f}'.format(gini_t, gini_v))

GradBoost: train - 0.719, valid - 0.719
