In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time 
%matplotlib inline

from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [44]:
"""
    Step 0 : Get Data
    
"""

def GetData(data_path):
    """Load the raw competition CSV files located under ``data_path``.

    Parameters
    ----------
    data_path : str
        Directory prefix. It is concatenated directly with each file name,
        so it must already end with a path separator (e.g. ``"../Data/"``).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(operation_train, transaction_train, tag_train, operation_test,
        transaction_test, tag_test)`` -- six frames, in that order.
    """
    def read(file_name):
        return pd.read_csv(data_path + file_name)

    operation_train = read("operation_train_new.csv")
    transaction_train = read("transaction_train_new.csv")
    operation_test = read("operation_round1_new.csv")
    transaction_test = read("transaction_round1_new.csv")
    tag_train = read("tag_train_new.csv")
    # The submission template used to be read from a hard-coded "../Data/"
    # path, which silently ignored data_path; it now lives with the others.
    tag_test = read("submission_example.csv")
    return operation_train, transaction_train, tag_train, operation_test, transaction_test, tag_test

In [380]:
""" 
    Step 1 : Data Preprocessing 

    Drop Duplicates
    Fill Na
    Drop Outliers: After Data Exploration 
    
"""

def CreateTagTest(operation_test, transaction_test):
    """Build a placeholder tag table for the test users.

    Collects every distinct ``UID`` that appears in either test table,
    sorts them ascending and pairs each with the dummy label ``-1``.

    Returns a DataFrame with the columns ``UID`` and ``Tag``.
    """
    all_uids = pd.concat([operation_test["UID"], transaction_test["UID"]])
    sorted_unique_uids = np.sort(all_uids.unique())
    return pd.DataFrame({"UID": sorted_unique_uids, "Tag": -1})

def MergeData(operation, transaction, tag):
    """Attach each user's label to the operation and transaction tables.

    A ``UID -> Tag`` lookup is built from ``tag`` and mapped onto the ``UID``
    column of both tables, writing the result to a new ``tag`` column.
    UIDs without an entry in ``tag`` get NaN.

    Both input frames are modified in place and also returned.
    """
    tag_by_uid = {uid: label for uid, label in zip(tag["UID"], tag["Tag"])}
    for frame in (operation, transaction):
        frame["tag"] = frame["UID"].map(tag_by_uid)
    return operation, transaction

def DropDuplicates(data):
    """Return ``data`` without fully duplicated rows.

    The shape before and after de-duplication is printed so the amount of
    removed rows is visible in the notebook output. The input frame itself
    is not modified.
    """
    print("before: ")
    print("data.shape: ", data.shape)
    deduplicated = data.drop_duplicates()
    print("after: ")
    print("data.shape: ", deduplicated.shape)
    return deduplicated

def DataPreprocessing(operation_train, transaction_train, tag_train, operation_test, transaction_test):
    """Label and de-duplicate the train and test tables.

    Steps, in order:
      1. build a placeholder tag table for the test users (CreateTagTest);
      2. add a ``tag`` column to all four tables (MergeData);
      3. drop duplicated rows from all four tables (DropDuplicates).

    Returns ``(operation_train, transaction_train, tag_train,
    operation_test, transaction_test, tag_test)``.
    """
    tag_test = CreateTagTest(operation_test, transaction_test)

    operation_train, transaction_train = MergeData(operation_train, transaction_train, tag_train)
    operation_test, transaction_test = MergeData(operation_test, transaction_test, tag_test)

    # Same order as the original sequence of calls, so the printed shapes
    # appear as: operation_train, transaction_train, operation_test, transaction_test.
    labelled_tables = (operation_train, transaction_train, operation_test, transaction_test)
    deduplicated = [DropDuplicates(table) for table in labelled_tables]
    operation_train, transaction_train, operation_test, transaction_test = deduplicated

    return operation_train, transaction_train, tag_train, operation_test, transaction_test, tag_test

In [381]:
"""
    Step 2 : Data Exploration
    
    Categorical Columns : countplot, barplot
    Numerical Columns : regplot
    
"""

def _PlotLowCardinalityColumns(data, title, suptitle, n_cols, max_unique, target = None, sample_size = None):
    """Draw one subplot per low-cardinality column of ``data`` in a single figure.

    A column qualifies when it has fewer than ``max_unique`` distinct values
    (NaN counts as a value) and is not ``target``. With ``target = None`` a
    countplot is drawn; otherwise a barplot of ``target`` against the column,
    computed on a random sample of at most ``sample_size`` rows.
    """
    print(title)
    columns = [column for column in data.columns
               if data[column].unique().shape[0] < max_unique and column != target]
    # Keep the original two-row layout whenever it is large enough and add
    # rows otherwise, so plt.subplot never receives an out-of-range position.
    n_rows = max(2, -(-len(columns) // n_cols))
    plt.figure(figsize = (15, 10))
    for position, column in enumerate(columns, start = 1):
        print("Plot " + column + "...")
        plt.subplot(n_rows, n_cols, position)
        if target is None:
            sns.countplot(x = column, data = data)
        else:
            # DataFrame.sample raises when asked for more rows than exist,
            # so the sample size is capped at the frame length.
            sampled = data.sample(min(len(data), sample_size))
            sns.barplot(x = column, y = target, data = sampled)
    plt.suptitle(suptitle)
    plt.show()


def DataExploration(operation, transaction, train_mode = False, max_unique = 50, sample_size = 100000):
    """Plot the low-cardinality columns of the operation and transaction tables.

    Parameters
    ----------
    operation, transaction : pandas.DataFrame
        Tables to explore. When ``train_mode`` is true both must contain a
        ``tag`` column.
    train_mode : bool
        If true, additionally draw barplots of ``tag`` against each
        low-cardinality column.
    max_unique : int
        A column is plotted only if it has fewer than this many distinct values.
    sample_size : int
        Upper bound on the number of rows sampled for each barplot.

    Returns None; the figures are shown with ``plt.show()``.
    """
    _PlotLowCardinalityColumns(operation, "Operation Countplot", "Operation Countplot", 3, max_unique)

    if train_mode:
        print("#" * 100)
        _PlotLowCardinalityColumns(operation, "Operation Barplot", "Operation Barplot", 2, max_unique,
                                   target = "tag", sample_size = sample_size)

    print("#" * 100)
    _PlotLowCardinalityColumns(transaction, "Transaction Countplot", "Transaction Countplot", 4, max_unique)

    if train_mode:
        print("#" * 100)
        _PlotLowCardinalityColumns(transaction, "Transaction Barplot", "transaction Barplot", 3, max_unique,
                                   target = "tag", sample_size = sample_size)

# operation table columns
# Date/time: day, time
# Action: mode, success
# Operating system: os
# Version: version
# Device: device1, device2, device_code1, device_code2, device_code3
# IP: ip1, ip2, ip1_sub, ip2_sub
# MAC: mac1, mac2
# Wi-Fi: wifi
# Geolocation: geo_code

# transaction table columns
# Platform: channel
# Date/time: day, time
# Funds: trans_amt, bal, amt_src1, amt_src2
# Merchant: merchant, code1, code2
# Transaction type: trans_type1, trans_type2
# Account: acc_id1, acc_id2
# Device: device_code1, device_code2, device_code3, device1, device2
# IP: ip1, ip1_sub
# MAC: mac1
# Geolocation: geo_code
# Marketing campaign: market_code, market_type

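# Conclusions from the operation countplots
# 1 Records are spread fairly evenly over the days; days 1, 8, 15, 22 and 29 have the most operation records.
# 2 The vast majority of operation records succeeded.
# 3 The operating-system distribution is uneven: 102 is the most common, 101 and 107 the least common.
# 4 Among versions, 7.0.9 and 7.0.5 have the most operation records.
# 5 Records are imbalanced: black records / white records = 137371 / 1096976 = 0.125; users are imbalanced too: black samples / white samples = 4285 / 26894 = 0.16.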

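# Conclusions from the operation barplots
# 1 Records are spread fairly evenly over the days; the share of black samples is higher around day 27.
# 2 Successful operations have a higher share of black samples.
# 3 All samples with operating system 107 are white; all samples with operating system 105 are black.
# 4 All samples with version 6.1.0 or 4.1.7 are black; versions 6.5.0, 7.0.0 and 6.6.3 have a higher share of black samples; for many versions all samples are white.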

In [382]:
"""
    Step 3 : Feature Engineering
    
    Feature Creation
    Feature Selection
    
"""

def FeatureCreation(data, tag):
    """Aggregate a record-level table into one feature row per ``UID``.

    Parameters
    ----------
    data : pandas.DataFrame
        Record-level table with at least the columns ``UID``, ``time`` and
        ``tag``. ``time`` is sliced as ``[:2]``, ``[3:5]`` and ``[6:]`` to
        obtain hour, minute and second, so an ``HH?MM?SS`` text layout is
        assumed. The frame is NOT modified; a derived copy is used.
    tag : pandas.DataFrame
        Table with a ``UID`` column; its columns are carried into the result.

    Returns
    -------
    pandas.DataFrame
        ``tag`` outer-merged on ``UID`` with:
          * ``uid_count`` -- number of rows with a non-null ``tag`` per UID;
          * ``<column>_nunique`` for every column except ``UID`` / ``tag``;
          * ``<column>_<value>_count`` for every non-null value of each
            column that has fewer than 50 distinct non-null values.
        Because the value-count columns depend on the values present in
        ``data``, two different inputs can yield different column sets.
    """
    # assign() builds a new frame, so the caller's table keeps its original
    # columns (the previous version added hour/minute/second in place).
    time_text = data["time"]
    data = data.assign(
        hour = time_text.str[:2].astype(int),
        minute = time_text.str[3:5].astype(int),
        second = time_text.str[6:].astype(int),
    )

    grouped = data.groupby("UID")
    uid_count = grouped["tag"].count().reset_index().rename(columns = {"tag" : "uid_count"})
    data_features = pd.merge(left = tag, right = uid_count, on = "UID", how = "outer")

    for column in data.columns:
        if column == "UID" or column == "tag":
            continue
        print("Create " + column + "_nunique...")
        # Select the column before aggregating: aggregating the whole frame
        # and then picking one column repeated the full computation for
        # every column of the table.
        column_nunique = grouped[column].nunique().reset_index().rename(columns = {column : column + "_nunique"})
        data_features = pd.merge(left = data_features, right = column_nunique, on = "UID", how = "outer")

        if data[column].nunique() >= 50:
            continue
        for value in data[column].unique():
            if pd.isna(value):
                continue
            feature_name = column + "_" + str(value) + "_count"
            print("Create " + feature_name + "...")
            matching_rows = data[data[column] == value]
            value_count = matching_rows.groupby("UID")[column].count().reset_index().rename(columns = {column : feature_name})
            data_features = pd.merge(data_features, value_count, on = "UID", how = "outer")
    return data_features

def FeatureMerge(operation_features, transaction_features):
    """Join the operation and transaction feature tables into ``(x, y)``.

    The two tables are outer-merged on ``UID``. Both are expected to carry a
    ``Tag`` column, which the merge suffixes to ``Tag_x`` / ``Tag_y``.

    Returns
    -------
    x : pandas.DataFrame
        Merged features without ``UID``, ``Tag_x`` and ``Tag_y``.
    y : pandas.Series
        The ``Tag_x`` column, i.e. the label taken from the operation side.
    """
    merged = operation_features.merge(transaction_features, on = "UID", how = "outer")
    y = merged["Tag_x"]
    x = merged.drop(columns = ["UID", "Tag_x", "Tag_y"])
    return x, y

def FeatureFillNa(x):
    """Return a copy of ``x`` in which every missing value is replaced by -1."""
    return x.fillna(-1)

def FeatureSelection(x_train, y_train, x_test, random_state = None):
    """Keep only the features chosen by a gradient-boosting based selector.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features; the selector is fitted on these.
    y_train : array-like
        Training labels.
    x_test : pandas.DataFrame
        Test features; may lack some of the training columns.
    random_state : int or None
        Seed forwarded to the GradientBoostingClassifier so the selection
        can be reproduced. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    (x_train, x_test) : tuple of pandas.DataFrame
        Both restricted to the selected features, in the same column order.
        A selected feature that is absent from ``x_test`` is filled with -1,
        the same sentinel FeatureFillNa uses for missing values (it used to
        be left as an all-NaN column).
    """
    selector = SelectFromModel(GradientBoostingClassifier(random_state = random_state))
    selector.fit(x_train, y_train)
    # get_support() is a boolean mask aligned with x_train's columns.
    selected_features = x_train.columns[selector.get_support()]
    x_train = x_train.loc[:, selected_features]
    x_test = x_test.reindex(columns = selected_features, fill_value = -1)
    return x_train, x_test

def FeatureEngineering(operation_train, transaction_train, tag_train, operation_test, transaction_test, tag_test):
    """Build the model-ready train and test feature matrices.

    Per-UID features are created for each of the four record tables
    (FeatureCreation), the operation and transaction features are joined
    (FeatureMerge) and missing values are replaced by -1 (FeatureFillNa).

    Returns
    -------
    x_train : pandas.DataFrame
    y_train : pandas.Series
    x_test : pandas.DataFrame
        Has exactly the columns of ``x_train``, in the same order.
    """
    operation_train_features = FeatureCreation(operation_train, tag_train)
    transaction_train_features = FeatureCreation(transaction_train, tag_train)
    operation_test_features = FeatureCreation(operation_test, tag_test)
    transaction_test_features = FeatureCreation(transaction_test, tag_test)

    x_train, y_train = FeatureMerge(operation_train_features, transaction_train_features)
    # The placeholder test labels are not needed.
    x_test, _ = FeatureMerge(operation_test_features, transaction_test_features)

    x_train = FeatureFillNa(x_train)
    # FeatureCreation derives "<column>_<value>_count" columns from the values
    # present in each table, so train and test can differ in both column set
    # and column order. Align the test matrix to the training one before
    # filling: columns missing from test become NaN and then -1, and columns
    # unknown to the training data are dropped.
    x_test = FeatureFillNa(x_test.reindex(columns = x_train.columns))

    # Feature selection is currently disabled:
#     x_train, x_test = FeatureSelection(x_train, y_train, x_test)
    return x_train, y_train, x_test

In [383]:
"""
    Step 4 : Model Optimization
    
    Models : lr, gbdt, xgb, lgbm
    
"""

def ModelOptimization(model, params, x_train, y_train):
    """Grid-search each parameter grid in ``params`` independently.

    Every element of ``params`` is passed as its own ``param_grid`` to a
    separate GridSearchCV (ROC AUC scoring, 3-fold CV, all cores). The grids
    are therefore tuned one at a time, with every parameter outside the
    current grid left at whatever ``model`` was constructed with.

    Returns a list with the ``best_params_`` dict of each search, in the
    order of ``params``.
    """
    def search(param_grid):
        print("Optimize param", param_grid, "...")
        searcher = GridSearchCV(estimator = model, param_grid = param_grid, scoring = "roc_auc", cv = 3, n_jobs = -1)
        searcher.fit(x_train, y_train)
        return searcher.best_params_

    return [search(param_grid) for param_grid in params]

In [70]:
"""
    Step 5 : Model Evaluation
    
"""

def tpr_weight_function(y_true, y_predict):
    """Weighted true-positive rate at three low false-positive rates.

    Samples are ranked by descending ``y_predict``. Along that ranking the
    cumulative share of positives and a cumulative negative share are
    tracked, and the positive share is read off at the rank whose negative
    share is closest to 0.001, 0.005 and 0.01.

    Returns ``0.4 * TR(0.001) + 0.3 * TR(0.005) + 0.3 * TR(0.01)``.

    ``y_true`` must contain both label 0 and label 1; otherwise the class
    count lookup raises KeyError.
    """
    ranked = pd.DataFrame({"prob": list(y_predict), "y": list(y_true)})
    ranked = ranked.sort_values("prob", ascending = False)
    labels = ranked["y"]

    class_counts = labels.value_counts()
    positives_total = class_counts[1]
    negatives_total = class_counts[0]

    positives_seen = labels.cumsum()
    # Kept exactly as in the original formula: (rank index) - positives + 1.
    negatives_seen = np.arange(len(labels)) - positives_seen + 1
    positive_share = positives_seen / positives_total
    negative_share = negatives_seen / negatives_total

    def share_at(target):
        # idxmin returns the label of the first rank closest to the target.
        return positive_share[(negative_share - target).abs().idxmin()]

    return 0.4 * share_at(0.001) + 0.3 * share_at(0.005) + 0.3 * share_at(0.01)

def ModelEvaluation(model, x_train, y_train):
    """Cross-validate ``model`` with ROC AUC and the weighted-TPR metric.

    Parameters
    ----------
    model : estimator
        Must provide ``fit`` and ``predict_proba``. It is refitted on every
        fold, so it is left fitted on the last training fold.
    x_train : pandas.DataFrame
    y_train : pandas.Series

    Returns
    -------
    (roc_auc, tpr_weight) : tuple of float
        Mean 3-fold ``roc_auc`` from ``cross_val_score`` and mean
        ``tpr_weight_function`` score over an unshuffled 3-fold KFold.
    """
    # roc_auc
    print("Compute roc_auc_score...")
    roc_auc = np.mean(cross_val_score(estimator = model,
                                      X = x_train,
                                      y = y_train,
                                      scoring = "roc_auc",
                                      cv = 3,
                                      n_jobs = -1,
                                      verbose = 10))

    # tpr_weight
    print("Compute tpr_weight_score...")
    kf = KFold(n_splits = 3)
    x_values, y_values = x_train.values, y_train.values
    model_scores = []
    for train_index, test_index in kf.split(x_train):
        print("Split data...")
        x_tr, x_te = x_values[train_index], x_values[test_index]
        y_tr, y_te = y_values[train_index], y_values[test_index]
        model.fit(x_tr, y_tr)
        # tpr_weight_function ranks samples by score, so it needs the
        # positive-class probability (as PredictSubmit submits), not the
        # hard 0/1 labels returned by predict().
        y_score = model.predict_proba(x_te)[:, 1]
        model_scores.append(tpr_weight_function(y_te, y_score))
    tpr_weight = np.mean(model_scores)

    return roc_auc, tpr_weight

def Record(x_train, model, roc_auc_score, tpr_weight_score):
    """Append one experiment entry to ``RECORDS_PATH + "record.txt"``.

    The entry lists the feature names (columns of ``x_train``), the repr of
    ``model`` and both scores, followed by a separator line of 100 ``#``.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Only its column names are used.
    model : estimator
        The evaluated model; ``str(model)`` is written.
    roc_auc_score, tpr_weight_score : float
        Scores to record.
    """
    feature_names = ", ".join(map(str, x_train.columns.values))
    with open(RECORDS_PATH + "record.txt", "a") as f:
        f.write("features:\t")
        f.write("[" + feature_names + "]")
        f.write("\n\n")
        f.write("model:\t")
        # Write the model that was passed in; the global `gbdt` used to be
        # recorded here regardless of which model had been evaluated.
        f.write(str(model))
        f.write("\n\n")
        f.write("roc_auc_score:\t")
        f.write(str(roc_auc_score))
        f.write("\n\n")
        f.write("tpr_weight_score:\t")
        f.write(str(tpr_weight_score))
        f.write("\n")
        f.write("#" * 100)
        f.write("\n")

In [385]:
"""
    Step 6 : Predict and Submit
    
"""

def PredictSubmit(model, x_train, y_train, x_test, tag_test, file_name):
    """Fit ``model`` on the training data and write a submission file.

    The positive-class probability (column 1 of ``predict_proba``) for
    ``x_test`` is paired with ``tag_test["UID"]`` and saved as CSV to
    ``SUBMISSION_PATH + file_name`` without the index.

    The pairing is positional: rows of ``x_test`` are assumed to be in the
    same order as ``tag_test`` -- nothing here checks it.

    Returns ``(y_pred, submission)``.
    """
    model.fit(x_train, y_train)
    class_probabilities = model.predict_proba(x_test)
    y_pred = class_probabilities[:, 1]
    submission = pd.DataFrame({"UID": tag_test["UID"], "Tag": y_pred})
    submission.to_csv(SUBMISSION_PATH + file_name, index = False)
    return y_pred, submission

In [386]:
"""
    Step 7 : Ensembling Predict Submit

"""

def EnsemblingPredictSubmit(y_pred_list, tag_test, file_name):
    """Average several prediction vectors and write a submission file.

    ``y_pred_list`` is a sequence of equally long prediction arrays; their
    element-wise mean is paired with ``tag_test["UID"]`` and saved as CSV to
    ``SUBMISSION_PATH + file_name`` without the index.

    Returns ``(y_pred, submission)``.
    """
    stacked_predictions = np.array(y_pred_list)
    y_pred = np.mean(stacked_predictions, axis = 0)
    submission = pd.DataFrame({"UID": tag_test["UID"], "Tag": y_pred})
    submission.to_csv(SUBMISSION_PATH + file_name, index = False)
    return y_pred, submission

In [387]:
# Constant Variables
# Path constants are concatenated directly with file names, so each must end with "/".
DATA_PATH = "../Data/"  # prefix handed to GetData for the input CSV files
SUBMISSION_PATH = "../Submission/"  # prefix used by PredictSubmit / EnsemblingPredictSubmit for output CSVs
RECORDS_PATH = "../Records/"  # prefix used by Record for record.txt
RANDOM_STATE = 2018  # seed passed to the estimators in the Model Optimization cell

In [388]:
# Get Data
# GetData returns six frames; unpacking only five names raised
# "ValueError: too many values to unpack". The tag_test loaded here (the
# submission template) is replaced by DataPreprocessing in the next step.
operation_train, transaction_train, tag_train, operation_test, transaction_test, tag_test = GetData(DATA_PATH)
# Uncomment to iterate quickly on the first 10000 rows of each record table:
# operation_train, transaction_train, operation_test, transaction_test = operation_train[:10000], transaction_train[:10000], operation_test[:10000], transaction_test[:10000]

  if (yield from self.run_code(code, result)):


In [389]:
# Data Preprocessing
# Adds a "tag" column to the four record tables, drops duplicated rows and
# creates tag_test (one row per test UID, Tag = -1). The input names are
# rebound to the processed frames, so the raw frames are no longer reachable
# after this cell has run.
operation_train, transaction_train, tag_train, operation_test, transaction_test, tag_test = DataPreprocessing(operation_train, transaction_train, tag_train, operation_test, transaction_test)

before: 
data.shape:  (1460843, 21)
after: 
data.shape:  (1234347, 21)
before: 
data.shape:  (264654, 28)
after: 
data.shape:  (264622, 28)
before: 
data.shape:  (1769049, 21)
after: 
data.shape:  (1578036, 21)
before: 
data.shape:  (168981, 28)
after: 
data.shape:  (168452, 28)


In [390]:
# Data Exploration
# DataExploration(operation_train, transaction_train, train_mode = True)
# DataExploration(operation_test, transaction_test, train_mode = False)

In [391]:
# Feature Engineering
# Builds one feature row per UID for train and test. FeatureCreation prints a
# "Create ..." line for every generated feature, so the output is long.
x_train, y_train, x_test = FeatureEngineering(operation_train, transaction_train, tag_train, operation_test, transaction_test, tag_test)

Create day_nunique...
Create day_30_count...
Create day_16_count...
Create day_8_count...
Create day_23_count...
Create day_26_count...
Create day_27_count...
Create day_19_count...
Create day_5_count...
Create day_13_count...
Create day_10_count...
Create day_1_count...
Create day_3_count...
Create day_20_count...
Create day_18_count...
Create day_22_count...
Create day_28_count...
Create day_12_count...
Create day_4_count...
Create day_15_count...
Create day_9_count...
Create day_6_count...
Create day_2_count...
Create day_25_count...
Create day_21_count...
Create day_11_count...
Create day_17_count...
Create day_29_count...
Create day_24_count...
Create day_7_count...
Create day_14_count...
Create mode_nunique...
Create success_nunique...
Create success_1.0_count...
Create success_0.0_count...
Create time_nunique...
Create os_nunique...
Create os_102_count...
Create os_200_count...
Create os_103_count...
Create os_101_count...
Create os_107_count...
Create os_104_count...
Create os_

Create time_nunique...
Create os_nunique...
Create os_102_count...
Create os_103_count...
Create os_200_count...
Create os_101_count...
Create os_107_count...
Create os_104_count...
Create os_105_count...
Create version_nunique...
Create version_7.0.9_count...
Create version_7.1.3_count...
Create version_7.0.5_count...
Create version_7.0.7_count...
Create version_7.1.2_count...
Create version_7.0.2_count...
Create version_6.6.2_count...
Create version_7.0.0_count...
Create version_4.1.7_count...
Create version_6.6.3_count...
Create version_5.8.21_count...
Create version_6.0.4_count...
Create version_1.3.0_count...
Create version_6.0.5_count...
Create version_7.0.1_count...
Create version_1.0.0_count...
Create version_1.2.0_count...
Create version_1.1.0_count...
Create version_6.6.0_count...
Create version_6.5.0_count...
Create version_6.1.0_count...
Create version_5.8.24_count...
Create version_5.8.15_count...
Create version_0.0.2_count...
Create version_7.1.0_count...
Create version_5

In [None]:
# Model Optimization
# params can be set as constant variable at the head of program
# Each *_params list holds one single-parameter grid per entry. ModelOptimization
# searches every entry on its own, so each parameter is tuned while all other
# parameters stay at the values the estimator was constructed with.
# NOTE(review): whether penalty = "l1" works with the estimator's default solver
# depends on the installed scikit-learn version -- confirm before re-running.
lr = LogisticRegression(random_state = RANDOM_STATE)
lr_params = [{"C": [0.01, 0.03, 0.1, 0.3, 1.0, 3.0, 10]}, 
             {"class_weight": [None, "balanced"]}, 
             {"max_iter": [100, 300, 500, 1000]}, 
             {"penalty": ["l1", "l2"]},
             {"solver": ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}]
lr_best_params = ModelOptimization(lr, lr_params, x_train, y_train)

gbdt = GradientBoostingClassifier(random_state = RANDOM_STATE)
gbdt_params = [{"n_estimators": [100, 300, 500, 1000]}, 
               {"learning_rate": [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0]}, 
               {"max_features": [None, "log2", "sqrt"]}, 
               {"max_depth": [3, 5, 7, 9]}, 
               {"min_samples_split": [2, 4, 6, 8]}, 
               {"min_samples_leaf": [1, 3, 5, 7]}]
gbdt_best_params = ModelOptimization(gbdt, gbdt_params, x_train, y_train)

xgb = XGBClassifier(random_state = RANDOM_STATE)
xgb_params = [{"learning_rate": [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1]}, 
              {"n_estimators": [100, 300, 500, 1000]}, 
              {"max_depth": range(3,10,2)}, 
              {"min_child_weight": range(1,6,2)}, 
              {"gamma": [i/10.0 for i in range(0,5)]}, 
              {"subsample": [i/10.0 for i in range(6,10)]},
              {"colsample_bytree": [i/10.0 for i in range(6,10)]}, 
              {"reg_alpha": [1e-5, 1e-2, 0.1, 1, 100]}]
xgb_best_params = ModelOptimization(xgb, xgb_params, x_train, y_train)

Optimize param {'C': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0, 10]} ...




Optimize param {'class_weight': [None, 'balanced']} ...




Optimize param {'max_iter': [100, 300, 500, 1000]} ...


In [None]:
# Model Evaluation
# The estimators are rebuilt with hand-picked hyper-parameters. Each one is
# seeded with RANDOM_STATE, as in the Model Optimization cell, so the recorded
# scores and the later submissions can be reproduced.
lr = LogisticRegression(C = 3.0, class_weight = "balanced", max_iter = 100, penalty = "l2", solver = "newton-cg",
                        random_state = RANDOM_STATE)
lr_roc_auc_score, lr_tpr_weight_score = ModelEvaluation(lr, x_train, y_train)
Record(x_train, lr, lr_roc_auc_score, lr_tpr_weight_score)

gbdt = GradientBoostingClassifier(n_estimators = 1000, learning_rate = 0.3, max_features = None,
                                  max_depth = 5, min_samples_split = 6, min_samples_leaf = 1,
                                  random_state = RANDOM_STATE)
gbdt_roc_auc_score, gbdt_tpr_weight_score = ModelEvaluation(gbdt, x_train, y_train)
Record(x_train, gbdt, gbdt_roc_auc_score, gbdt_tpr_weight_score)

xgb = XGBClassifier(n_estimators = 500, learning_rate = 0.3, max_depth = 9, min_child_weight = 5,
                    gamma = 0.3, subsample = 0.7, colsample_bytree = 0.7, reg_alpha = 1e-05,
                    random_state = RANDOM_STATE)
xgb_roc_auc_score, xgb_tpr_weight_score = ModelEvaluation(xgb, x_train, y_train)
Record(x_train, xgb, xgb_roc_auc_score, xgb_tpr_weight_score)

In [None]:
# Model Ensemble 
# Record the feature, model, params, score

In [None]:
# Predict and Submit
# Uses the lr, gbdt and xgb estimators defined in the Model Evaluation cell;
# PredictSubmit refits each of them on the full training set.
# Variant writing to the plain file names (disabled):
# lr_pred, lr_submission = PredictSubmit(lr, x_train, y_train, x_test, tag_test, "lr.csv")
# gbdt_pred, gbdt_submission = PredictSubmit(gbdt, x_train, y_train, x_test, tag_test, "gbdt.csv")
# xgb_pred, xgb_submission = PredictSubmit(xgb, x_train, y_train, x_test, tag_test, "xgb.csv")

# No Feature Selection
lr_pred, lr_submission = PredictSubmit(lr, x_train, y_train, x_test, tag_test, "lr_no_feature_selection.csv")
gbdt_pred, gbdt_submission = PredictSubmit(gbdt, x_train, y_train, x_test, tag_test, "gbdt_no_feature_selection.csv")
xgb_pred, xgb_submission = PredictSubmit(xgb, x_train, y_train, x_test, tag_test, "xgb_no_feature_selection.csv")

In [None]:
# Ensembling Predict and Submit
# Averages the three prediction vectors produced by the Predict and Submit cell.
# Variant writing to the plain file name (disabled):
# ensemble_pred, ensemble_submission = EnsemblingPredictSubmit([lr_pred, gbdt_pred, xgb_pred], tag_test, "ensemble.csv")

# No Feature Selection
ensemble_pred, ensemble_submission = EnsemblingPredictSubmit([lr_pred, gbdt_pred, xgb_pred], tag_test, "ensemble_no_feature_selection.csv")