In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time 
%matplotlib inline

from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [2]:
"""
    Step 0 : Get Data
    
"""

def GetData(data_path):
    """Load the six competition CSV files from one directory.

    Parameters
    ----------
    data_path : str
        Directory that contains the CSV files. A trailing path separator is
        optional.

    Returns
    -------
    tuple of pandas.DataFrame
        (operation_train, transaction_train, tag_train,
         operation_test, transaction_test, tag_test), in that order.
        tag_test is read from "submission_example.csv".
    """
    # Local import keeps this cell runnable without touching the import cell.
    import os

    def read(file_name):
        # os.path.join tolerates a data_path with or without a trailing separator.
        return pd.read_csv(os.path.join(data_path, file_name))

    print("Get Data Start")
    operation_train = read("operation_train_new.csv")
    transaction_train = read("transaction_train_new.csv")
    operation_test = read("operation_round1_new.csv")
    transaction_test = read("transaction_round1_new.csv")
    tag_train = read("tag_train_new.csv")
    # Previously hard-coded to "../Data/submission_example.csv", which ignored data_path.
    tag_test = read("submission_example.csv")
    print("Get Data Done")
    return operation_train, transaction_train, tag_train, operation_test, transaction_test, tag_test

In [3]:
""" 
    Step 1 : Data Preprocessing 

    Drop Duplicates
    Drop Outliers: After Data Exploration 
    
"""

def DataPreprocessing(data):
    """Return `data` with fully duplicated rows removed.

    The input frame is not modified; the surviving rows keep their original
    index labels. Only de-duplication is performed here.
    """
    print("Data Preprocessing Start")
    deduplicated = data.drop_duplicates()
    print("Data Preprocessing Done")
    return deduplicated

In [4]:
"""
    Step 2 : Data Exploration
    
    Categorical Columns : countplot, barplot
    Numerical Columns : regplot
    
"""

def _low_cardinality_columns(data, max_categories, excluded):
    """List the columns of `data` (minus `excluded`) with fewer than `max_categories` distinct values."""
    return [column for column in data.columns
            if column not in excluded and data[column].nunique() < max_categories]


def _plot_column_grid(columns, title, draw):
    """Call `draw(column)` once per column, each in its own subplot of a 4-wide grid."""
    n_cols = 4
    # At least 2 rows (the original fixed layout); grow when more than 8 columns qualify.
    n_rows = max(2, (len(columns) + n_cols - 1) // n_cols)
    plt.figure(figsize = (15, 5 * n_rows))
    for position, column in enumerate(columns, start = 1):
        plt.subplot(n_rows, n_cols, position)
        draw(column)
    plt.suptitle(title)
    plt.show()


def DataExploration(data, tag = None, max_categories = 50, sample_size = 100000):
    """Plot value distributions of the low-cardinality columns of `data`.

    A countplot is drawn for every column except "UID" that has fewer than
    `max_categories` distinct values. When `tag` is given, a barplot of the
    mean label per category is drawn as well.

    Parameters
    ----------
    data : pandas.DataFrame
        Record-level table with a "UID" column. It is not modified.
    tag : pandas.DataFrame, optional
        Table with "UID" and "Tag" columns used to label each record.
    max_categories : int, default 50
        Columns with at least this many distinct values are skipped.
    sample_size : int, default 100000
        Maximum number of rows drawn at random for the barplots.

    Returns
    -------
    None
    """
    print("Data Exploration Start")
    print("Countplot")
    count_columns = _low_cardinality_columns(data, max_categories, {"UID"})
    _plot_column_grid(count_columns, "Countplot",
                      lambda column: sns.countplot(x = column, data = data))

    if tag is not None:
        uid_tag_dict = dict(zip(tag["UID"], tag["Tag"]))
        # Map with the lookup table (the original passed the builtin `dict` here),
        # and build a new frame so the caller's data does not gain a "tag" column.
        labelled = data.assign(tag = data["UID"].map(uid_tag_dict))
        print("#" * 100)
        print("Barplot")
        bar_columns = _low_cardinality_columns(labelled, max_categories, {"UID", "tag"})
        # Cap the sample at the frame length; sampling more rows than exist raises.
        plot_sample = labelled.sample(min(len(labelled), sample_size))

        def draw_bar(column):
            print("Plot " + column + "...")
            sns.barplot(x = column, y = "tag", data = plot_sample)

        _plot_column_grid(bar_columns, "Barplot", draw_bar)

    print("Data Exploration Done")
    
# operation
# 时间日期：day, time
# 操作：mode, success
# 操作系统：os
# 版本：version
# 设备：device1, device2, device_code1, device_code2, device_code3
# ip：ip1, ip2, ip1_sub, ip2_sub
# mac：mac1, mac2
# wifi：wifi
# 地理位置：geo_code

# transaction
# 平台：channel
# 日期时间：day, time
# 资金：trans_amt, bal, amt_src1, amt_src2
# 商户：merchant, code1, code2
# 交易类型：trans_type1, trans_type2
# 账户：acc_id1, acc_id2
# 设备：device_code1, device_code2, device_code3, device1, device2
# ip：ip1, ip1_sub
# mac：mac1
# 地理位置：geo_code
# 营销活动：market_code, market_type

# operation countplot结论
# 1 日期分布较为均匀，操作记录较多的日期有1，8，15，22，29
# 2 绝大多数操作记录都成功了
# 3 操作系统分布不均，最多的是102，最少的是101和107
# 4 版本中，操作记录最多的是7.0.9和7.0.5
# 5 黑白样本记录不平衡，黑记录 / 白记录 = 137371 / 1096976 = 0.125；黑白样本比例不平衡，黑样本 / 白样本 = 4285 / 26894 = 0.16

# operation barplot结论
# 1 日期分布较为均匀，27附近的黑样本比例较高
# 2 操作成功的黑样本比例较高
# 3 操作系统为107的样本均为白样本，105的样本均为黑样本
# 4 版本为6.1.0，4.1.7的样本均为黑样本，6.5.0，7.0.0，6.6.3的黑样本比例较高，许多版本的样本均为白样本

In [5]:
"""
    Step 3 : Feature Engineering
    
    Feature Creation
    Feature Selection
    
"""

def FeatureCreation(data, tag):
    """Aggregate record-level rows into one feature row per UID listed in `tag`.

    Three feature families are produced:

    1. ``<col>_nunique``: distinct values of each column per UID.
    2. ``<col>_nunique_UID_{max,min,mean}`` and ``<col>_count_UID_{max,min,mean}``:
       for each value of a column, how many distinct UIDs / how many rows share
       it, summarised per UID.
    3. ``day_frequency_*`` and ``hour_frequency_*``: max / min / mean number of
       rows per (UID, day) and per (UID, day, hour).

    Parameters
    ----------
    data : pandas.DataFrame
        Record-level table containing at least "UID", "day" and "time".
        "time" values must be strings whose first two characters are the hour.
        The frame is not modified.
    tag : pandas.DataFrame
        Table whose "UID" column defines the rows (and row order) of the result.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``tag["UID"]``; UIDs absent from `data` get NaN
        in every feature column.
    """
    print("Feature Creation Start")
    # assign() builds a new frame, so the caller's DataFrame does not silently
    # gain an "hour" column (the original wrote it into the argument in place).
    data = data.assign(hour = data["time"].apply(lambda x : int(x[:2])))
    features = pd.DataFrame(tag["UID"])
    value_columns = [column for column in data.columns if column != "UID"]

    # feature1: column_nunique
    for column in value_columns:
        print("Create " + column + "_nunique...")
        column_nunique = (data.groupby("UID")[column]
                              .agg(["nunique"])
                              .reset_index()
                              .rename(columns = {"nunique": column + "_nunique"}))
        features = features.merge(column_nunique, on = "UID", how = "left")

    # feature2: column_nunique_UID and column_count_UID
    # Each left merge returns a new frame, so no defensive copy of `data` is needed.
    enriched = data
    for column in value_columns:
        print("Create " + column + "_nunique_UID and " + column + "_count_UID...")
        column_nunique_count = (data.groupby(column)["UID"]
                                    .agg(["nunique", "count"])
                                    .reset_index()
                                    .rename(columns = {"nunique": column + "_nunique_UID",
                                                       "count": column + "_count_UID"}))
        enriched = enriched.merge(column_nunique_count, on = column, how = "left")

    # All *_nunique_UID summaries come first, then all *_count_UID summaries.
    summary_columns = ([column + "_nunique_UID" for column in value_columns]
                       + [column + "_count_UID" for column in value_columns])
    for column in summary_columns:
        print("Create " + column + "...")
        column_summary = (enriched.groupby("UID")[column]
                                  .agg(["max", "min", "mean"])
                                  .reset_index()
                                  .rename(columns = {"max": column + "_max",
                                                     "min": column + "_min",
                                                     "mean": column + "_mean"}))
        features = features.merge(column_summary, on = "UID", how = "left")

    # feature3: day_frequency, hour_frequency
    print("Create day_frequency...")
    day_frequency = (data.groupby(["UID", "day"])["time"]
                         .agg(["count"])
                         .reset_index()
                         .groupby("UID")["count"]
                         .agg(["max", "min", "mean"])
                         .reset_index()  # make UID a column so the merge key is explicit
                         .rename(columns = {"max": "day_frequency_max",
                                            "min": "day_frequency_min",
                                            "mean": "day_frequency_mean"}))
    features = features.merge(day_frequency, on = "UID", how = "left")
    hour_frequency = (data.groupby(["UID", "day", "hour"])["time"]
                          .agg(["count"])
                          .reset_index()
                          .groupby("UID")["count"]
                          .agg(["max", "min", "mean"])
                          .reset_index()
                          .rename(columns = {"max": "hour_frequency_max",
                                             "min": "hour_frequency_min",
                                             "mean": "hour_frequency_mean"}))
    features = features.merge(hour_frequency, on = "UID", how = "left")

    print("Feature Creation Done")

    return features

def FeatureSelection(x_train, y_train, x_test, random_state = None):
    """Keep only the features a gradient-boosting model considers important.

    A GradientBoostingClassifier is fitted on the training data inside
    SelectFromModel, and both frames are reduced to the columns it selects.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features.
    y_train : array-like
        Training labels.
    x_test : pandas.DataFrame
        Test features.
    random_state : int or None, default None
        Seed forwarded to the GradientBoostingClassifier. The default keeps the
        estimator's own default; pass an int for a reproducible selection.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        x_train and x_test restricted to the selected columns, in the same
        column order. A selected column that is missing from x_test is
        returned as an all-NaN column.
    """
    print("Feature Selection Start")
    sfm = SelectFromModel(GradientBoostingClassifier(random_state = random_state))
    sfm.fit(x_train, y_train)
    # get_support() is a boolean mask over the columns; index with it directly
    # instead of converting it to a list of positions by hand.
    selected_features = x_train.columns[sfm.get_support()]
    x_train = x_train.loc[:, selected_features]
    # reindex gives the same columns in the same order and creates numeric NaN
    # columns for any missing feature (the column-by-column copy into an empty
    # frame left those as object-dtype columns).
    x_test = x_test.reindex(columns = selected_features)
    print("Feature Selection Done")
    return x_train, x_test

def FeatureEngineering(operation_train, transaction_train, tag_train, operation_test, transaction_test, tag_test):
    """Build the train and test feature matrices and apply feature selection.

    Features are created separately for the operation and transaction tables,
    joined on "UID" (operation features on the left), missing values are
    replaced with -1, and FeatureSelection is applied with tag_train["Tag"]
    as the target.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The selected training and test feature frames.
    """
    print("Feature Engineering Start")
    train_operation = FeatureCreation(operation_train, tag_train)
    train_transaction = FeatureCreation(transaction_train, tag_train)
    test_operation = FeatureCreation(operation_test, tag_test)
    test_transaction = FeatureCreation(transaction_test, tag_test)

    def join_and_fill(operation_features, transaction_features):
        # Left join keeps one row per UID of the operation-side frame;
        # -1 marks values that are missing after the join.
        joined = operation_features.merge(transaction_features, on = "UID", how = "left")
        return joined.fillna(-1)

    train_matrix = join_and_fill(train_operation, train_transaction)
    train_labels = tag_train["Tag"]
    test_matrix = join_and_fill(test_operation, test_transaction)

    # NOTE(review): the "UID" column is still part of both matrices at this
    # point, so it is offered to the selector as a feature. Confirm intended.
    selected_train, selected_test = FeatureSelection(train_matrix, train_labels, test_matrix)
    print("Feature Engineering Done")
    return selected_train, selected_test

In [6]:
"""
    Step 4 : Model Optimization
    
    Models : lr, gbdt, xgb
    
"""

def ModelOptimization(model, params, x_train, y_train):
    """Grid-search each parameter grid in `params` independently.

    Every element of `params` is a separate ``param_grid`` for GridSearchCV
    (3-fold, scored by ROC AUC). The grids are searched one after another
    against the same `model`, so parameters in different grids are not tuned
    jointly.

    Parameters
    ----------
    model : estimator
        Estimator passed to GridSearchCV.
    params : list of dict
        One ``param_grid`` per search.
    x_train : pandas.DataFrame
        Training features; NaN values are replaced with -1 before searching.
    y_train : array-like
        Training labels.

    Returns
    -------
    list of dict
        ``best_params_`` of each search, in the order of `params`.
    """
    print("Model Optimizatioin Start")
    filled_features = x_train.fillna(-1)

    def best_for(grid):
        print("Optimize param", grid, "...")
        search = GridSearchCV(estimator = model, param_grid = grid, scoring = "roc_auc", cv = 3, n_jobs = -1)
        search.fit(filled_features, y_train)
        return search.best_params_

    best_params = [best_for(grid) for grid in params]
    print("Model Optimizatioin Done")
    return best_params

In [7]:
"""
    Step 5 : Model Evaluation
    
"""

def tpr_weight_function(y_true, y_predict):
    """Weighted true-positive rate at three low false-positive rates.

    Samples are ranked by predicted score, highest first. For each target
    false-positive rate (0.001, 0.005, 0.01) the true-positive rate is read at
    the first ranked position whose cumulative FPR is closest to the target.
    The result is ``0.4 * TPR@0.001 + 0.3 * TPR@0.005 + 0.3 * TPR@0.01``.

    Parameters
    ----------
    y_true : iterable
        Labels; both 0 and 1 must be present.
    y_predict : iterable
        Predicted scores, same length as `y_true`.

    Returns
    -------
    float
    """
    ranked = pd.DataFrame({"prob": list(y_predict), "y": list(y_true)})
    ranked = ranked.sort_values("prob", ascending = False)
    labels = ranked["y"]

    class_counts = labels.value_counts()
    positives_total = class_counts[1]
    negatives_total = class_counts[0]

    # Running counts of positives / negatives among the top-k ranked samples.
    positives_seen = labels.cumsum()
    negatives_seen = np.arange(1, len(labels) + 1) - positives_seen
    tpr = positives_seen / positives_total
    fpr = negatives_seen / negatives_total

    def tpr_at(target_fpr):
        # idxmin returns an index label, so the lookup into tpr is label-based.
        return tpr[(fpr - target_fpr).abs().idxmin()]

    return 0.4 * tpr_at(0.001) + 0.3 * tpr_at(0.005) + 0.3 * tpr_at(0.01)

def _positive_class_scores(model, features):
    """Return a ranking score for the positive class from a fitted classifier."""
    if hasattr(model, "predict_proba"):
        return model.predict_proba(features)[:, 1]
    # Estimators without probabilities still expose a usable ranking score.
    return model.decision_function(features)


def ModelEvaluation(model, x_train, y_train):
    """Cross-validate `model` with ROC AUC and the weighted-TPR metric.

    Parameters
    ----------
    model : estimator
        Classifier to evaluate. It is re-fitted in place on each fold of the
        weighted-TPR loop.
    x_train : pandas.DataFrame
        Training features; NaN values are replaced with -1.
    y_train : pandas.Series or array-like
        Binary training labels.

    Returns
    -------
    (float, float)
        Mean 3-fold ROC AUC and mean 3-fold ``tpr_weight_function`` score.
    """
    print("Model Evaluation Start")
    x_train = x_train.fillna(-1)
    # roc_auc
    print("Compute roc_auc_score...")
    roc_auc = np.mean(cross_val_score(estimator = model,
                                      X = x_train,
                                      y = y_train,
                                      scoring = "roc_auc",
                                      cv = 3,
                                      n_jobs = -1,
                                      verbose = 10))

    # tpr_weight
    print("Compute tpr_weight_score...")
    features = x_train.values
    labels = np.asarray(y_train)
    # NOTE(review): these folds are unshuffled and unstratified; a fold that
    # lacks one class makes tpr_weight_function fail.
    kf = KFold(n_splits = 3)
    model_scores = []
    for train_index, test_index in kf.split(features):
        print("Split data...")
        x_tr, x_te = features[train_index], features[test_index]
        y_tr, y_te = labels[train_index], labels[test_index]
        model.fit(x_tr, y_tr)
        # The metric ranks samples by score, so it needs positive-class scores;
        # hard labels from predict() collapse the ranking into two tied groups.
        y_score = _positive_class_scores(model, x_te)
        model_scores.append(tpr_weight_function(y_te, y_score))
    tpr_weight = np.mean(model_scores)
    print("Model Evaluation Done")
    return roc_auc, tpr_weight

def Record(x_train, model, roc_auc_score, tpr_weight_score):
    """Append one experiment entry to ../Records/record.txt.

    The entry lists the feature names of `x_train`, the string form of
    `model`, and the two scores, followed by a line of 100 '#' characters.
    Column names must be strings because they are joined with ", ".
    """
    print("Record Start")
    # (header, value, trailing separator) for the fields written via str().
    labelled_fields = (
        ("model:\t", model, "\n\n"),
        ("roc_auc_score:\t", roc_auc_score, "\n\n"),
        ("tpr_weight_score:\t", tpr_weight_score, "\n"),
    )
    with open("../Records/record.txt", "a") as record_file:
        record_file.write("features:\t")
        record_file.write("[" + ", ".join(x_train.columns.values) + "]")
        record_file.write("\n\n")
        for header, value, separator in labelled_fields:
            record_file.write(header)
            record_file.write(str(value))
            record_file.write(separator)
        record_file.write("#" * 100)
        record_file.write("\n")
    print("Record Done")

In [8]:
"""
    Step 6 : Fit and Predict
    
"""

def FitPredict(model, x_train, y_train, x_test):
    """Fit `model` on the training data and score the test data.

    Returns column 1 of ``model.predict_proba(x_test)``. The model object is
    fitted in place.
    """
    print("Fit Predict Start")
    model.fit(x_train, y_train)
    class_probabilities = model.predict_proba(x_test)
    positive_scores = class_probabilities[:, 1]
    print("Fit Predict Done")
    return positive_scores

In [9]:
"""
    Step 7 : Ensembling

"""

def Ensembling(y_pred_list, tag_test):
    """Average several prediction vectors element-wise.

    Parameters
    ----------
    y_pred_list : sequence of array-like
        Equal-length prediction vectors, one per model.
    tag_test : any
        Unused; kept so the call signature stays the same.

    Returns
    -------
    numpy.ndarray
        Mean prediction per sample.
    """
    print("Ensembling Start")
    stacked_predictions = np.asarray(y_pred_list)
    ensembling_y_pred = np.mean(stacked_predictions, axis = 0)
    print("Ensembling Done")
    return ensembling_y_pred

In [10]:
"""
    Step 8 : Submit
    
"""
def Submit(y_pred, tag_test):
    """Build the submission table from predictions and the test UIDs.

    Returns a DataFrame with the columns "UID" (taken from tag_test) and
    "Tag" (the predictions). Nothing is written to disk here.
    """
    print("Submit Start")
    submission_columns = {"UID": tag_test["UID"], "Tag": y_pred}
    submission = pd.DataFrame(submission_columns)
    print("Submit Done")
    return submission

In [190]:
# End-to-end pipeline: load -> de-duplicate -> build features -> tune ->
# evaluate -> fit/predict -> ensemble -> write submissions.
# Each stage is wrapped in time.time() calls; the durations are printed at the end.

# Get Data: read the six CSV files from ../Data/
t_get_data_start = time.time()
operation_train, transaction_train, tag_train, operation_test, transaction_test, tag_test = GetData("../Data/")
t_get_data_end = time.time()

# Data Preprocessing: drop duplicated rows; each input name is rebound to the
# de-duplicated frame.
t_data_preprocessing_start = time.time()
operation_train = DataPreprocessing(operation_train)
transaction_train = DataPreprocessing(transaction_train)
operation_test = DataPreprocessing(operation_test)
transaction_test = DataPreprocessing(transaction_test)
t_data_preprocessing_end = time.time()

# # Data Exploration (disabled; plots only, nothing downstream depends on it)
# t_data_exploration_start = time.time()
# DataExploration(operation_train, tag_train)
# DataExploration(transaction_train, tag_train)
# DataExploration(operation_test)
# DataExploration(transaction_test)
# t_data_exploration_end = time.time()

# Feature Engineering: per-UID features for train and test, then feature selection
t_feature_engineering_start = time.time()
x_train, x_test = FeatureEngineering(operation_train, transaction_train, tag_train, operation_test, transaction_test, tag_test)
t_feature_engineering_end = time.time()

# Model Optimization: each dict below is its own grid, searched independently
# by ModelOptimization (one hyperparameter at a time).
# NOTE(review): lr_best_params, gbdt_best_params and xgb_best_params are not
# referenced again in this cell; the estimators in the evaluation stage use
# hard-coded hyperparameter values.
t_model_optimization_start = time.time()
lr_params = [{"C": [0.01, 0.03, 0.1, 0.3, 1.0, 3.0, 10]}, 
             {"class_weight": [None, "balanced"]}, 
             {"max_iter": [100, 300, 500, 1000]}, 
             {"penalty": ["l1", "l2"]},
             {"solver": ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}]
lr_best_params = ModelOptimization(LogisticRegression(), lr_params, x_train, tag_train["Tag"])

gbdt_params = [{"n_estimators": [100, 300, 500, 1000]}, 
               {"learning_rate": [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0]}, 
               {"max_features": [None, "log2", "sqrt"]}, 
               {"max_depth": [3, 5, 7, 9]}, 
               {"min_samples_split": [2, 4, 6, 8]}, 
               {"min_samples_leaf": [1, 3, 5, 7]}]
gbdt_best_params = ModelOptimization(GradientBoostingClassifier(), gbdt_params, x_train, tag_train["Tag"])

xgb_params = [{"learning_rate": [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1]}, 
              {"n_estimators": [100, 300, 500, 1000]}, 
              {"max_depth": range(3,10,2)}, 
              {"min_child_weight": range(1,6,2)}, 
              {"gamma": [i/10.0 for i in range(0,5)]}, 
              {"subsample": [i/10.0 for i in range(6,10)]},
              {"colsample_bytree": [i/10.0 for i in range(6,10)]}, 
              {"reg_alpha": [1e-5, 1e-2, 0.1, 1, 100]}]
xgb_best_params = ModelOptimization(XGBClassifier(), xgb_params, x_train, tag_train["Tag"])
t_model_optimization_end = time.time()

# Model Evaluation: cross-validated ROC AUC and weighted TPR for each model;
# every result is appended to ../Records/record.txt by Record().
t_model_evaluation_start = time.time()
lr = LogisticRegression(C = 10, class_weight = "balanced", max_iter = 1000, penalty = "l2", solver = "newton-cg")
lr_roc_auc_score, lr_tpr_weight_score = ModelEvaluation(lr, x_train, tag_train["Tag"])
Record(x_train, lr, lr_roc_auc_score, lr_tpr_weight_score)

gbdt = GradientBoostingClassifier(n_estimators = 1000, learning_rate = 0.3, max_features = None, 
                                  max_depth = 3, min_samples_split = 4, min_samples_leaf = 7)
gbdt_roc_auc_score, gbdt_tpr_weight_score = ModelEvaluation(gbdt, x_train, tag_train["Tag"])
Record(x_train, gbdt, gbdt_roc_auc_score, gbdt_tpr_weight_score)

xgb = XGBClassifier(n_estimators = 300, learning_rate = 0.3, max_depth = 5, min_child_weight = 1, 
                    gamma = 0.1, subsample = 0.8, colsample_bytree = 0.8, reg_alpha = 1e-05)
xgb_roc_auc_score, xgb_tpr_weight_score = ModelEvaluation(xgb, x_train, tag_train["Tag"])
Record(x_train, xgb, xgb_roc_auc_score, xgb_tpr_weight_score)
t_model_evaluation_end = time.time()

# Fit and Predict: refit each model on the full training set and score x_test.
# This stage's duration is printed right here rather than in the summary below.
t_fit_predict_start = time.time()
lr_y_pred = FitPredict(lr, x_train, tag_train["Tag"], x_test)
gbdt_y_pred = FitPredict(gbdt, x_train, tag_train["Tag"], x_test)
xgb_y_pred = FitPredict(xgb, x_train, tag_train["Tag"], x_test)
t_fit_predict_end = time.time()
print("t_fit_predict: ", t_fit_predict_end - t_fit_predict_start)

# Ensembling: unweighted mean of the three prediction vectors
t_ensembling_start = time.time()
y_pred_list = [lr_y_pred, gbdt_y_pred, xgb_y_pred]
ensembling_y_pred = Ensembling(y_pred_list, tag_test)
t_ensembling_end = time.time()

# Submit: one CSV per model plus one for the ensemble, written to ../Submission/
t_submit_start = time.time()
lr_submission = Submit(lr_y_pred, tag_test)
gbdt_submission = Submit(gbdt_y_pred, tag_test)
xgb_submission = Submit(xgb_y_pred, tag_test)
ensembling_submission = Submit(ensembling_y_pred, tag_test)
lr_submission.to_csv("../Submission/lr_submission.csv", index = False)
gbdt_submission.to_csv("../Submission/gbdt_submission.csv", index = False)
xgb_submission.to_csv("../Submission/xgb_submission.csv", index = False)
ensembling_submission.to_csv("../Submission/ensembling_submission.csv", index = False)
t_submit_end = time.time()

# Timing summary (seconds per stage, then wall-clock total from load to submit)
print("t_get_data: ", t_get_data_end - t_get_data_start)
print("t_data_preprocessing: ", t_data_preprocessing_end - t_data_preprocessing_start)
# print("t_data_exploration: ", t_data_exploration_end - t_data_exploration_start)
print("t_feature_engineering: ", t_feature_engineering_end - t_feature_engineering_start)
print("t_model_optimization: ", t_model_optimization_end - t_model_optimization_start)
print("t_model_evaluation: ", t_model_evaluation_end - t_model_evaluation_start)
print("t_ensembling: ", t_ensembling_end - t_ensembling_start)
print("t_submit: ", t_submit_end - t_submit_start)
print("total_time: ", t_submit_end - t_get_data_start)

Get Data Start


  if (yield from self.run_code(code, result)):


Get Data Done
Data Preprocessing Start
Data Preprocessing Done
Data Preprocessing Start
Data Preprocessing Done
Data Preprocessing Start
Data Preprocessing Done
Data Preprocessing Start
Data Preprocessing Done
Feature Engineering Start
Feature Creation Start
Create day_nunique...
Create mode_nunique...
Create success_nunique...
Create time_nunique...
Create os_nunique...
Create version_nunique...
Create device1_nunique...
Create device2_nunique...
Create device_code1_nunique...
Create device_code2_nunique...
Create device_code3_nunique...
Create mac1_nunique...
Create mac2_nunique...
Create ip1_nunique...
Create ip2_nunique...
Create wifi_nunique...
Create geo_code_nunique...
Create ip1_sub_nunique...
Create ip2_sub_nunique...
Create hour_nunique...
Create day_nunique_UID and day_count_UID...
Create mode_nunique_UID and mode_count_UID...
Create success_nunique_UID and success_count_UID...
Create time_nunique_UID and time_count_UID...
Create os_nunique_UID and os_count_UID...
Create ver

Create hour_nunique_UID and hour_count_UID...
Create day_nunique_UID...
Create mode_nunique_UID...
Create success_nunique_UID...
Create time_nunique_UID...
Create os_nunique_UID...
Create version_nunique_UID...
Create device1_nunique_UID...
Create device2_nunique_UID...
Create device_code1_nunique_UID...
Create device_code2_nunique_UID...
Create device_code3_nunique_UID...
Create mac1_nunique_UID...
Create mac2_nunique_UID...
Create ip1_nunique_UID...
Create ip2_nunique_UID...
Create wifi_nunique_UID...
Create geo_code_nunique_UID...
Create ip1_sub_nunique_UID...
Create ip2_sub_nunique_UID...
Create hour_nunique_UID...
Create day_count_UID...
Create mode_count_UID...
Create success_count_UID...
Create time_count_UID...
Create os_count_UID...
Create version_count_UID...
Create device1_count_UID...
Create device2_count_UID...
Create device_code1_count_UID...
Create device_code2_count_UID...
Create device_code3_count_UID...
Create mac1_count_UID...
Create mac2_count_UID...
Create ip1_coun



Optimize param {'class_weight': [None, 'balanced']} ...




Optimize param {'max_iter': [100, 300, 500, 1000]} ...




Optimize param {'penalty': ['l1', 'l2']} ...




Optimize param {'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']} ...
Model Optimizatioin Done
Model Optimizatioin Start
Optimize param {'n_estimators': [100, 300, 500, 1000]} ...
Optimize param {'learning_rate': [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0]} ...
Optimize param {'max_features': [None, 'log2', 'sqrt']} ...
Optimize param {'max_depth': [3, 5, 7, 9]} ...
Optimize param {'min_samples_split': [2, 4, 6, 8]} ...
Optimize param {'min_samples_leaf': [1, 3, 5, 7]} ...
Model Optimizatioin Done
Model Optimizatioin Start
Optimize param {'learning_rate': [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1]} ...
Optimize param {'n_estimators': [100, 300, 500, 1000]} ...
Optimize param {'max_depth': range(3, 10, 2)} ...
Optimize param {'min_child_weight': range(1, 6, 2)} ...
Optimize param {'gamma': [0.0, 0.1, 0.2, 0.3, 0.4]} ...
Optimize param {'subsample': [0.6, 0.7, 0.8, 0.9]} ...
Optimize param {'colsample_bytree': [0.6, 0.7, 0.8, 0.9]} ...
Optimize param {'reg_alpha': [1e-05, 0.01, 0.

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   43.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   43.5s finished


Compute tpr_weight_score...
Split data...




Split data...




Split data...




Model Evaluation Done
Record Start
Record Done
Model Evaluation Start
Compute roc_auc_score...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   27.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   27.2s finished


Compute tpr_weight_score...
Split data...
Split data...
Split data...
Model Evaluation Done
Record Start
Record Done
Model Evaluation Start
Compute roc_auc_score...


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   14.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   14.0s finished


Compute tpr_weight_score...
Split data...
Split data...
Split data...
Model Evaluation Done
Record Start
Record Done
Fit Predict Start




Fit Predict Done
Fit Predict Start
Fit Predict Done
Fit Predict Start
Fit Predict Done
t_fit_predict:  80.91845035552979
Ensembling Start
Ensembling Done
Submit Start
Submit Done
Submit Start
Submit Done
Submit Start
Submit Done
Submit Start
Submit Done
t_get_data:  28.466918230056763
t_data_preprocessing:  10.8610098361969
t_feature_engineering:  319.28134751319885
t_model_optimization:  568.4811298847198
t_model_evaluation:  257.55098581314087
t_ensembling:  0.0019953250885009766
t_submit:  0.46979618072509766
total_time:  1266.0316331386566
