In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import bisect

import matplotlib.pyplot as plt
%matplotlib inline  

from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from tqdm import tqdm

# Display settings: only the column limit is raised (to 4000) so wide feature
# frames are not truncated when shown; the row and sequence-item overrides
# below are intentionally left disabled.
# pd.options.display.max_rows = 4000
pd.options.display.max_columns = 4000
# pd.options.display.max_seq_items = 2000

In [2]:
# Read the training / dev / test feature tables and replace every missing
# value with 0 as each file is loaded.
training_df = pd.read_csv('df_log/228/training_self/training_self_0323.csv').fillna(0)
dev_df = pd.read_csv('df_log/228/training_self/dev_self_0323.csv').fillna(0)
test_df = pd.read_csv('df_log/228/training_self/test_self_0323.csv').fillna(0)

In [3]:
# Leave-one-fold-out splits over fold ids 0..4. Each entry is
# [ids used for fitting, [held-out id]], ordered from held-out 4 down to 0.
five_folds = [
    [[fold_id for fold_id in range(5) if fold_id != held_out], [held_out]]
    for held_out in reversed(range(5))
]

In [4]:
import lightgbm as lgb

In [5]:
# Empty frame that collects one column of predictions per first-layer model
# (filled in by later cells).
layer1_df = pd.DataFrame()

In [6]:
# Feature columns fed to every model in this notebook.
useful_columns = [
    'bin_0_affect_user_ratio',
    'first_24h_each_user_event_avg',
    'bin_ratio_max',
    'customer_has_static_b3_count',
    'hour_occupy_ratio',
    'bin_9_affect_user_ratio',
    'dayofweek_ratio_std',
    'event_diff_time_rstd',
    'each_user_event_std',
    'event_diff_time_lessone_ratio2',
    'bin_7_affect_user_ratio',
    'first_24h_affect_user_ratio',
    'all_event_diff_time_mean',
    'product_virus_ratio_max',
    'customer_virus_zero_count',
    'customer_spread_time_mean',
    'customer_virus_ratio_rstd',
    'first_24h_affect_user_count',
    'hour_ratio_max',
    'hour_ratio_std',
    'customer_has_static_b3_ratio',
    'first_24h_ratio',
    'customer_has_static_ratio',
    'customer_has_static_count',
    'event_diff_time_median',
    'hour_ratio_mean',
    'first_24h_count',
    'customer_virus_ratio_median',
    'event_diff_time_median2',
    'all_event_diff_time_median',
    'customer_virus_ratio_std',
    'customer_virus_ratio_avg',
    'customer_virus_ratio_max',
    'each_user_event_avg',
    'customer_virus_zero_ratio',
    'customer_virus_zero_ratio2',
    'customer_virus_ratio_wavg',
]

# Alternative (disabled): use every column of training_df except the ones
# listed in drop_features.
# drop_features = ['fold', 'label', 'FileID', 'first_time_occur', 'last_time_occur']
# useful_columns = [item for item in training_df.columns.tolist() if item not in drop_features]

In [7]:
# LightGBM settings for the binary classifier; both l2 and AUC are reported
# on the validation set at every boosting round.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='binary',
    metric={'l2', 'auc'},
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.6,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
    lambda_l2=5.0,
    min_gain_to_split=0,
    min_data_in_leaf=8,
    max_depth=15,
)

# Fit on the training split; the dev split is only monitored during boosting.
lgb_train = lgb.Dataset(training_df[useful_columns].values, training_df['label'].values)
lgb_eval = lgb.Dataset(
    dev_df[useful_columns].values,
    dev_df['label'].values,
    reference=lgb_train,
)
gbm = lgb.train(params, lgb_train, num_boost_round=105, valid_sets=lgb_eval)

# Print the ROC AUC on train, dev and test, in that order.
# NOTE(review): no early stopping is requested above, so confirm what
# gbm.best_iteration resolves to in the installed LightGBM version.
for split_df in (training_df, dev_df, test_df):
    split_scores = gbm.predict(split_df[useful_columns].values, num_iteration=gbm.best_iteration)
    fpr, tpr, thresholds = roc_curve(split_df['label'], split_scores)
    print(auc(fpr, tpr))

[1]	valid_0's auc: 0.867406	valid_0's l2: 0.229709
[2]	valid_0's auc: 0.884967	valid_0's l2: 0.211336
[3]	valid_0's auc: 0.917716	valid_0's l2: 0.19489
[4]	valid_0's auc: 0.924794	valid_0's l2: 0.179933
[5]	valid_0's auc: 0.941848	valid_0's l2: 0.166534
[6]	valid_0's auc: 0.942944	valid_0's l2: 0.154266
[7]	valid_0's auc: 0.942937	valid_0's l2: 0.143221
[8]	valid_0's auc: 0.943616	valid_0's l2: 0.133344
[9]	valid_0's auc: 0.94344	valid_0's l2: 0.124365
[10]	valid_0's auc: 0.943383	valid_0's l2: 0.11622
[11]	valid_0's auc: 0.946323	valid_0's l2: 0.108895
[12]	valid_0's auc: 0.946273	valid_0's l2: 0.102197
[13]	valid_0's auc: 0.946594	valid_0's l2: 0.096143
[14]	valid_0's auc: 0.946836	valid_0's l2: 0.0908348
[15]	valid_0's auc: 0.947006	valid_0's l2: 0.0858576
[16]	valid_0's auc: 0.948286	valid_0's l2: 0.0814451
[17]	valid_0's auc: 0.94975	valid_0's l2: 0.0773883
[18]	valid_0's auc: 0.950259	valid_0's l2: 0.0737298
[19]	valid_0's auc: 0.950417	valid_0's l2: 0.0704479
[20]	valid_0's auc:

In [8]:
# Store the LightGBM scores for the dev set as the 'lgbm' first-layer column.
lgbm_dev_scores = gbm.predict(dev_df[useful_columns].values, num_iteration=gbm.best_iteration)
layer1_df['lgbm'] = lgbm_dev_scores

In [9]:
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn import preprocessing

In [10]:
# Standardisation statistics are fitted once on the full training split and
# reused for every model that needs scaled inputs (MLPs, logistic regression).
scaler = preprocessing.StandardScaler().fit(training_df[useful_columns])

In [11]:
# Train one MLP per cross-validation split and keep both the fitted models
# (clssfis) and their dev-set probabilities (predict_results).
# For each split two AUCs are printed: first on the rows it was fitted on,
# then on the dev set. The held-out fold id of each split is not used here;
# every model is scored against dev_df instead.

# The scaled dev features do not depend on the split, so compute them once.
dev_scaled = scaler.transform(dev_df[useful_columns])

predict_results = []
clssfis = []
for merge_folds, _held_out in five_folds:
    fold_training = training_df[training_df['fold'].isin(merge_folds)]
    fold_scaled = scaler.transform(fold_training[useful_columns])

    clssfi = MLPClassifier(random_state=0, hidden_layer_sizes=(10, 3, 3), alpha=0.2)
    clssfi.fit(fold_scaled, fold_training['label'])

    fpr, tpr, thresholds = roc_curve(fold_training['label'], clssfi.predict_proba(fold_scaled)[:, 1])
    print(auc(fpr, tpr))

    # Score dev once and reuse the result for both the AUC and the ensemble.
    dev_scores = clssfi.predict_proba(dev_scaled)[:, 1]
    fpr, tpr, thresholds = roc_curve(dev_df['label'], dev_scores)
    print(auc(fpr, tpr))

    clssfis.append(clssfi)
    predict_results.append(dev_scores)

0.9718558579536284
0.9587351251557061
0.9715117529363307
0.9569112471337253
0.9714789530264757
0.957393543481784
0.9708278533201468
0.957467194472625
0.973133717847573
0.9583769232409088


In [12]:
# Average the per-fold MLP probabilities, report the dev AUC of the average,
# and keep it as the 'nn' first-layer column.
nn_dev_mean = np.mean(predict_results, axis=0)
fpr, tpr, thresholds = roc_curve(dev_df['label'], nn_dev_mean)
print(auc(fpr, tpr))
layer1_df['nn'] = nn_dev_mean

0.9608372085661308


In [13]:
from sklearn.linear_model import LogisticRegression

In [14]:
def transform_x(df, scaler, degree=2):
    """Scale the features and append their element-wise powers.

    Parameters
    ----------
    df : array-like
        Feature matrix accepted by ``scaler.transform``.
    scaler : object
        Fitted transformer exposing ``transform(df)``.
    degree : int, default 2
        Highest power to include. Powers ``1..degree`` of the scaled matrix
        are stacked column-wise, so the default reproduces ``[X, X**2]``.

    Returns
    -------
    numpy.ndarray
        Array with ``degree * n_features`` columns.

    Raises
    ------
    ValueError
        If ``degree`` is smaller than 1.
    """
    if degree < 1:
        raise ValueError("degree must be >= 1")

    scaled = np.asarray(scaler.transform(df))

    # np.hstack requires a real sequence: passing a generator has been
    # deprecated since NumPy 1.16 and raises TypeError on current releases.
    return np.hstack([scaled ** power for power in range(1, degree + 1)])

In [36]:
# Class-balanced logistic regression on the transform_x features of the
# training split. The fit call stays last so the cell still displays the
# fitted estimator.
logit_train_X = transform_x(training_df[useful_columns], scaler)
clssfi = LogisticRegression(C=0.05, class_weight='balanced')
clssfi.fit(logit_train_X, training_df['label'])

LogisticRegression(C=0.05, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [37]:
# Print the logistic-regression ROC AUC on the training split, then on dev.
for split_df in (training_df, dev_df):
    split_proba = clssfi.predict_proba(transform_x(split_df[useful_columns], scaler))[:, 1]
    fpr, tpr, thresholds = roc_curve(split_df['label'], split_proba)
    print(auc(fpr, tpr))

0.9625196577029743
0.9506964724312505


In [17]:
# Store the logistic-regression probabilities for dev as the 'logit' column.
# `clssfi` must be the fitted LogisticRegression here (it is fed the
# transform_x features), not one of the MLPs kept in `clssfis`.
logit_dev_X = transform_x(dev_df[useful_columns], scaler)
layer1_df['logit'] = clssfi.predict_proba(logit_dev_X)[:, 1]

In [18]:
# Pairwise correlation between the first-layer models' dev-set predictions;
# less correlated columns are the more interesting ones to blend.
layer1_df.corr()

Unnamed: 0,lgbm,nn,logit
lgbm,1.0,0.961342,0.836455
nn,0.961342,1.0,0.857025
logit,0.836455,0.857025,1.0


In [19]:
# Dev AUC of four simple blends, printed in this order:
#   1. plain mean of all layer1_df columns
#   2. logit + lgbm (50/50)
#   3. nn + lgbm (50/50)
#   4. nn + logit (50/50)
dev_blends = (
    np.mean(layer1_df, axis=1),
    layer1_df['logit']*0.5+layer1_df['lgbm']*0.5,
    layer1_df['nn']*0.5+layer1_df['lgbm']*0.5,
    layer1_df['nn']*0.5+layer1_df['logit']*0.5,
)
for blend_scores in dev_blends:
    fpr, tpr, thresholds = roc_curve(dev_df['label'], blend_scores)
    print(auc(fpr, tpr))

0.9598294091900037
0.9582729705355398
0.964297022681867
0.9563370831170327


In [22]:
# Dev AUC of the three-model blend: 0.2 logit + 0.4 nn + 0.4 lgbm.
three_way_dev = layer1_df['logit']*0.2+layer1_df['nn']*0.4+layer1_df['lgbm']*0.4
fpr, tpr, thresholds = roc_curve(dev_df['label'], three_way_dev)
print(auc(fpr, tpr))

0.9617155304985332


In [25]:
# Dev AUC of the two-model blend: 0.4 nn + 0.6 lgbm.
nn_lgbm_dev = layer1_df['nn']*0.4+layer1_df['lgbm']*0.6
fpr, tpr, thresholds = roc_curve(dev_df['label'], nn_lgbm_dev)
print(auc(fpr, tpr))

0.9643768053137656


In [26]:
# Start over with an empty frame: from this point layer1_df is refilled with
# test-set predictions, replacing the dev-set columns built above.
layer1_df = pd.DataFrame()

In [27]:
# Score the test set with every per-fold MLP stored in `clssfis`.
# A comprehension with its own variable is used on purpose: a plain
# `for clssfi in clssfis` loop would rebind the module-level `clssfi`, which
# holds the fitted LogisticRegression that is still needed for the test-set
# 'logit' column, and would break a top-to-bottom run of the notebook.
test_scaled = scaler.transform(test_df[useful_columns])  # same for every model
predict_results = [fold_clf.predict_proba(test_scaled)[:, 1] for fold_clf in clssfis]

In [28]:
# Average the per-fold MLP probabilities on the test set, report the AUC of
# the average, and keep it as the 'nn' column.
nn_test_mean = np.mean(predict_results, axis=0)
fpr, tpr, thresholds = roc_curve(test_df['label'], nn_test_mean)
print(auc(fpr, tpr))
layer1_df['nn'] = nn_test_mean

0.9541313091558354


In [30]:
# Score the test set with the LightGBM model, print its AUC, and keep the
# scores as the 'lgbm' column.
lgbm_test_scores = gbm.predict(test_df[useful_columns].values, num_iteration=gbm.best_iteration)
fpr, tpr, thresholds = roc_curve(test_df['label'], lgbm_test_scores)
print(auc(fpr, tpr))
layer1_df['lgbm'] = lgbm_test_scores

0.9609144316240137


In [43]:
# Test AUC of the two-model blend: 0.4 nn + 0.6 lgbm.
nn_lgbm_test = layer1_df['nn']*0.4+layer1_df['lgbm']*0.6
fpr, tpr, thresholds = roc_curve(test_df['label'], nn_lgbm_test)
print(auc(fpr, tpr))

0.9611493750839941


In [38]:
# Store the logistic-regression probabilities for the test set as 'logit'.
# `clssfi` must be the fitted LogisticRegression here (it is fed the
# transform_x features), not one of the MLPs kept in `clssfis`.
logit_test_X = transform_x(test_df[useful_columns], scaler)
layer1_df['logit'] = clssfi.predict_proba(logit_test_X)[:, 1]

In [40]:
# Test AUC of the three-model blend: 0.2 logit + 0.4 nn + 0.4 lgbm.
three_way_test = layer1_df['logit']*0.2+layer1_df['nn']*0.4+layer1_df['lgbm']*0.4
fpr, tpr, thresholds = roc_curve(test_df['label'], three_way_test)
print(auc(fpr, tpr))

0.9574315568183999
