In [1]:
# !pip install imblearn
# !pip install eli5

In [2]:
import time
from datetime import datetime, timedelta
import pandas as pd

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from scipy import interp
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_selection import chi2, mutual_info_classif, RFECV
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_curve, roc_curve, auc, \
                            log_loss, roc_auc_score, average_precision_score, confusion_matrix
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

import xgboost as xgb, lightgbm as lgbm, catboost as catb
import seaborn as sns

In [3]:
def time_format(sec):
    """Return a duration given in seconds as text, e.g. ``3661`` -> ``'1:01:01'``.

    The text is whatever ``str(datetime.timedelta)`` produces, so fractional
    seconds show up as microseconds and spans of a day or more are prefixed
    with the day count.
    """
    elapsed = timedelta(seconds=sec)
    return str(elapsed)

In [4]:
# Follows from the source data
CHURNED_START_DATE, CHURNED_END_DATE = '2019-09-01', '2019-10-01'

# Four consecutive (first, last) pairs: 1-7, 8-14, 15-21, 22-28.
# Presumably day offsets of the four weekly feature periods - TODO confirm
# against the code that builds the raw dataset.
INTER_LIST = [(first, first + 6) for first in range(1, 29, 7)]
INTER_1, INTER_2, INTER_3, INTER_4 = INTER_LIST

In [5]:
def evaluation(y_true, y_pred, y_prob):
    """Compute, print and return five binary-classification metrics.

    Parameters
    ----------
    y_true
        Ground-truth labels.
    y_pred
        Predicted class labels; used for precision, recall and F1.
    y_prob
        Predicted scores/probabilities; used for log loss and ROC AUC.

    Returns
    -------
    tuple
        ``(precision, recall, f1, log_loss, roc_auc)`` in that order.
    """
    # All metrics are computed first and printed afterwards, so a failing
    # metric raises before anything is written to the output.
    report = [
        ('Precision: {}', precision_score(y_true=y_true, y_pred=y_pred)),
        ('Recall: {}', recall_score(y_true=y_true, y_pred=y_pred)),
        ('F1: {}', f1_score(y_true=y_true, y_pred=y_pred)),
        ('Log Loss: {}', log_loss(y_true=y_true, y_pred=y_prob)),
        ('ROC AUC: {}', roc_auc_score(y_true=y_true, y_score=y_prob)),
    ]
    for template, value in report:
        print(template.format(value))
    return tuple(value for _, value in report)

def xgb_fit_predict(X_train, y_train, X_test, y_test):
    """Fit an XGBoost classifier with fixed hyper-parameters and print test metrics.

    The model is trained on ``(X_train, y_train)``; class labels and
    class-1 probabilities for ``X_test`` are then passed to ``evaluation``
    together with ``y_test``, which prints the metrics.

    Returns
    -------
    The fitted ``xgb.XGBClassifier``.
    """
    params = dict(
        max_depth=3,
        n_estimators=100,
        learning_rate=0.1,
        nthread=5,
        subsample=1.,
        colsample_bytree=0.5,
        min_child_weight=3,
        reg_alpha=0.,
        reg_lambda=0.,
        seed=42,
        missing=1e10,  # the value XGBoost should treat as "missing"
    )
    model = xgb.XGBClassifier(**params)

    # NOTE(review): no eval_set is supplied, so eval_metric / verbose
    # presumably have nothing to report on - confirm against the installed
    # xgboost version.
    model.fit(X_train, y_train, eval_metric='aucpr', verbose=10)

    test_proba = model.predict_proba(X_test)
    test_labels = model.predict(X_test)
    # Column 1 of predict_proba holds the positive-class probability.
    evaluation(y_test, test_labels, test_proba[:, 1])
    return model

def plot_importance(importance, features, title='name'):
    """Draw a bar chart of feature importances sorted in descending order.

    Parameters
    ----------
    importance
        Iterable of importance values, aligned with ``features``.
    features
        Iterable of feature names.
    title
        Chart title. Defaults to ``'name'``, the value that used to be
        hard-coded, so existing calls render exactly as before.

    Returns
    -------
    pandas.DataFrame
        Two columns labelled ``0`` (feature name) and ``1`` (importance),
        sorted by importance, largest first.
    """
    # Explicit column labels keep column ``1`` present even when the inputs
    # are empty; without them sort_values(by=1) raises KeyError.
    fi = pd.DataFrame(list(zip(features, importance)), columns=[0, 1])
    fi = fi.sort_values(by=1, ascending=False)

    positions = range(fi.shape[0])
    plt.figure(figsize=(16, 6))
    plt.bar(positions, fi[1], align='center')
    plt.xticks(positions, fi[0], rotation=90)
    plt.title(title)
    plt.show()
    return fi

# LightGBM variant of the fit-and-report helper
def lightgbm_fit_predict(X_train, y_train, X_test, y_test):
    """Fit a default ``LGBMClassifier`` (``random_state=21``) and print test metrics.

    Class labels and class-1 probabilities for ``X_test`` are handed to
    ``evaluation`` together with ``y_test``.

    Returns
    -------
    The fitted ``lgbm.LGBMClassifier``.
    """
    model = lgbm.LGBMClassifier(random_state=21)
    # Previously tried and left disabled: eval_metric='aucpr', verbose=10
    model.fit(X_train, y_train)

    test_proba = model.predict_proba(X_test)
    test_labels = model.predict(X_test)
    # Column 1 of predict_proba holds the positive-class probability.
    evaluation(y_test, test_labels, test_proba[:, 1])
    return model

In [6]:
def group(df, columns):
    """Add a 4-period total and per-period activity flags, in place.

    For every base name ``c`` in ``columns`` the frame must already hold
    ``c_1`` .. ``c_4``. The function adds:

    * ``c`` - the sum ``c_1 + c_2 + c_3 + c_4``;
    * ``c_1_gr`` .. ``c_4_gr`` - 1 where the period value is > 0, else 0
      (a NaN compares as not-greater, so it yields 0).

    Nothing is returned; ``df`` itself is modified.
    """
    for base in columns:
        period_cols = [base + '_' + str(period) for period in range(1, 5)]

        # Accumulate left to right, matching c_1 + c_2 + c_3 + c_4.
        total = df[period_cols[0]]
        for col in period_cols[1:]:
            total = total + df[col]
        df[base] = total

        for col in period_cols:
            df[col + '_gr'] = (df[col] > 0).astype('int64')

In [7]:
def around(df, columns):
    """Round the four period columns of each listed feature, in place.

    ``columns`` is a sequence of ``[base_name, decimals]`` pairs; for each
    pair the columns ``base_name_1`` .. ``base_name_4`` are replaced by
    their values rounded to ``decimals`` places with ``np.round``.
    Nothing is returned.
    """
    for spec in columns:
        base, decimals = spec[0], spec[1]
        for suffix in ('_1', '_2', '_3', '_4'):
            col = base + suffix
            df[col] = np.round(df[col], decimals)

In [8]:
def get_3std(df, columns):
    """Return upper outlier limits for the four period columns of each feature.

    For every base name ``c`` in ``columns`` and every period ``j`` in 1..4
    the limit is ``round(mean(c_j)) + 3 * round(std(c_j))``. Note that mean
    and standard deviation are each rounded to an integer *before* being
    combined.

    Returns
    -------
    list
        Limits in the order ``c_1, c_2, c_3, c_4`` for the first name, then
        the same for the next name, and so on.
    """
    limits = []
    for base in columns:
        for period in range(1, 5):
            values = df[base + '_' + str(period)]
            limits.append(np.round(values.mean()) + 3 * np.round(values.std()))
    return limits

def cutter(df, columns):
    """Replace high outliers in each feature's period columns by the column mean.

    ``columns`` is a sequence whose items carry the base feature name at
    index 0 (any further items are ignored here). For each base name the
    limits come from ``get_3std``; values above the limit in ``name_1`` ..
    ``name_4`` are overwritten with that column's mean, computed while the
    outliers are still present. ``df`` is modified in place.
    """
    for spec in columns:
        base = spec[0]
        limits = get_3std(df, [base])
        for offset in range(4):
            col = base + '_' + str(offset + 1)
            df.loc[df[col] > limits[offset], col] = df[col].mean()
        
def has_game(df, columns):
    """Add 0/1 indicators of positive activity per period and overall, in place.

    For every base name ``c`` in ``columns`` this adds ``c_1_b`` .. ``c_4_b``
    (1 where the matching period column is > 0) followed by ``c_b`` (1 where
    at least one of the four period columns is > 0). Nothing is returned.
    """
    for base in columns:
        period_masks = []
        for period in range(1, 5):
            col = base + '_' + str(period)
            is_positive = df[col] > 0
            df[col + '_b'] = is_positive.astype('int64')
            period_masks.append(is_positive)

        any_positive = period_masks[0] | period_masks[1] | period_masks[2] | period_masks[3]
        df[base + '_b'] = any_positive.astype('int64')

In [9]:
def prepare_dataset(dataset,
                    dataset_type='train',
                    dataset_path='dataset/'):
    """Clean ``dataset`` in place and save it as a semicolon-separated CSV.

    Steps, in order:

    1. ``age``: NaN -> column mean; values > 70 or < 7 -> rounded mean.
    2. ``gender``: NaN and anything other than 'M'/'F' -> mode; then encoded
       as M=1, F=0.
    3. ``donate_total`` rounded to 2 decimals; ``donate_total_gr`` = 1 where
       it is positive.
    4. ``level_gr``: 1 for level 10-19, 2 for 20-29, 3 for 30-39, 4 for
       40-50, and 0 for any other or missing level.
    5. ``days_between_fl_df`` (> 300 or < -1) and ``days_between_reg_fl``
       (> 300) are replaced by the rounded column mean.
    6. Period columns are rounded (``around``) and aggregated / flagged
       (``group``).
    7. Negative or missing ``avg_min_ping_<k>`` values are replaced by the
       mean of the non-negative ones; every remaining NaN becomes 0.
    8. The frame is written to ``<dataset_path>dataset_<dataset_type>.csv``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Raw feature table; modified in place.
    dataset_type : str
        Label used in the log output and in the output file name.
    dataset_path : str
        Prefix of the output file; it is concatenated as-is, so it should
        end with a path separator.

    Returns
    -------
    None
    """
    print(dataset_type)
    start_t = time.time()
    print('Dealing with missing values, outliers, categorical features...')

    # Profiles
    dataset['age'] = dataset['age'].fillna(dataset['age'].mean()) #mean?
    dataset.loc[(dataset['age'] > 70) | (dataset['age'] < 7), 'age'] = round(dataset['age'].mean())

    dataset['gender'] = dataset['gender'].fillna(dataset['gender'].mode()[0])
    dataset.loc[~dataset['gender'].isin(['M', 'F']), 'gender'] = dataset['gender'].mode()[0]
    dataset['gender'] = dataset['gender'].map({'M': 1, 'F':0})
    dataset['gender'] = dataset['gender'].astype(int)

    dataset['donate_total'] = np.around(dataset['donate_total'],decimals=2) # trim once more?

    dataset['donate_total_gr'] = 0
    dataset.loc[(dataset['donate_total']>0), 'donate_total_gr'] = 1

    # Start every row in group 0. Without this, rows whose level is missing
    # or outside 10..50 keep NaN and the int cast below raises.
    dataset['level_gr'] = 0
    dataset.loc[(dataset['level']>=10)&(dataset['level']<20) , 'level_gr'] = 1
    dataset.loc[(dataset['level']>=20)&(dataset['level']<30) , 'level_gr'] = 2
    dataset.loc[(dataset['level']>=30)&(dataset['level']<40) , 'level_gr'] = 3
    dataset.loc[(dataset['level']>=40)&(dataset['level']<=50) , 'level_gr'] = 4
    dataset['level_gr'] = dataset['level_gr'].astype(int)

    dataset.loc[(dataset['days_between_fl_df'] > 300) | (dataset['days_between_fl_df'] < -1), 'days_between_fl_df'] = \
                                                                round(dataset['days_between_fl_df'].mean())

    dataset.loc[(dataset['days_between_reg_fl'] > 300), 'days_between_reg_fl'] = \
                                                                round(dataset['days_between_reg_fl'].mean())

# Author's note: clipping makes things worse (presumably model quality)
#     cutter(dataset, [['session_player',10000]])

# Author's note: rounding these makes things worse: ,['silver_spent',3] ,['session_amt',0],['trans_amt',0]

    around(dataset,[['sess_with_abusers_amt',0], ['session_player',2],['reports_amt',0],['disconnect_amt',0],
                    ['avg_min_ping',0],['kd',0],['leavings_rate',0],['win_rate',0],['gold_spent',0],
                    ['pay_amt',0]])

# Author's note: grouping these makes things worse: 'session_amt''pay_amt''trans_amt' 'silver_spent'
    group(dataset,['sess_with_abusers_amt', 'session_player','reports_amt','disconnect_amt','avg_min_ping','kd',
                  'leavings_rate','win_rate','gold_spent'])

    # Pings
    # NOTE(review): this clean-up runs after around()/group(), so the
    # aggregate 'avg_min_ping' and its *_gr flags are built from the
    # uncleaned values - confirm whether that ordering is intended.
    for period in range(1,len(INTER_LIST)+1):
        col = 'avg_min_ping_{}'.format(period)
        dataset.loc[(dataset[col] < 0) |
                    (dataset[col].isnull()), col] = dataset.loc[dataset[col] >= 0][col].mean()
    # Sessions and everything else
    dataset.fillna(0, inplace=True)

# Author's note: makes things worse
#     has_game(dataset, ['session_player'])

    dataset.to_csv('{}dataset_{}.csv'.format(dataset_path, dataset_type), sep=';', index=False)

    print('Dataset is successfully prepared and saved to {}, run time (dealing with bad values): {}'.\
          format(dataset_path, time_format(time.time()-start_t)))

In [10]:
# Raw training table (semicolon-separated CSV).
train = pd.read_csv(filepath_or_buffer='dataset/dataset_raw_train.csv', sep=';')

In [11]:
# Cleans `train` in place and writes the result to the default 'dataset/' prefix.
prepare_dataset(train, dataset_type='train')

train
Dealing with missing values, outliers, categorical features...
d:\GB course\cl-ser\train\dataset\dataset_train.csv
Dataset is successfully prepared and saved to d:\GB course\cl-ser\, run time (dealing with bad values): 0:00:56.791834


In [12]:
%%time
dataset = pd.read_csv('dataset/dataset_train.csv', sep=';')
X = dataset.drop(['user_id', 'is_churned'], axis=1)
y = dataset['is_churned']

X_mm = StandardScaler().fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_mm, 
                                                    y, 
                                                    test_size=0.3,
                                                    shuffle=True, 
                                                    stratify=y, 
                                                    random_state=100)

# Снизим дизбаланс классов
smote_on_1 = int(X_train.shape[0]*3/10) 
X_train_balanced, y_train_balanced = SMOTE(random_state=42, sampling_strategy={1: smote_on_1}). \
                                        fit_sample(X_train, y_train) 

print('До:', Counter(y_train.values))
print('После:', Counter(y_train_balanced))

До: Counter({0: 318955, 1: 9677})
После: Counter({0: 318955, 1: 98589})
Wall time: 28.1 s


In [13]:
%%time
fitted_clf = xgb_fit_predict(X_train_balanced, y_train_balanced, X_test, y_test)

Precision: 0.3310996563573883
Recall: 0.4645612343297975
F1: 0.38663723916532905
Log Loss: 0.12016098764676494
ROC AUC: 0.9145047856755335
Wall time: 46.7 s


In [14]:
# Raw test table (semicolon-separated CSV).
test = pd.read_csv(filepath_or_buffer='dataset/dataset_raw_test.csv', sep=';')

In [15]:
# Cleans `test` in place and writes the result to the default 'dataset/' prefix.
prepare_dataset(test, dataset_type='test')

test
Dealing with missing values, outliers, categorical features...
d:\GB course\cl-ser\test\dataset\dataset_test.csv
Dataset is successfully prepared and saved to d:\GB course\cl-ser\, run time (dealing with bad values): 0:00:05.794155


In [16]:
%%time
dataset = pd.read_csv('dataset/dataset_test.csv', sep=';')
X = dataset.drop(['user_id'], axis=1)

X_test = StandardScaler().fit_transform(X)


Wall time: 862 ms


In [17]:
# Class probabilities and hard class labels for the scaled test features.
predict_proba_test = fitted_clf.predict_proba(X_test)
predict_test = fitted_clf.predict(X_test)

In [18]:
# Submission table: one row per test user with the predicted churn label.
result = pd.DataFrame({'user_id': dataset['user_id'], 'is_churned': predict_test})
result.to_csv('IMarchenko_predictions.csv', index=False)