In [1]:
import os
import numpy as np
import pandas as pd
import xgboost as xgb
from time import time
from tqdm.notebook import tqdm
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def to_one_hot(train_data, test_data, category_columns, numeric_columns, columns):
    trans_index = [columns.index(cat) for cat in category_columns]
    numeric_index = [columns.index(num) for num in numeric_columns]
    
    enc_list = [None]*len(category_columns)
    train_data_list = [None]*len(category_columns)
    test_data_list = [None]*len(category_columns)
    new_columns = []
    
    for i, index in enumerate(trans_index):
        enc_list[i] = OneHotEncoder(handle_unknown='ignore')
        train_data_list[i] = enc_list[i].fit_transform(train_data[:, [index]]).toarray()
        test_data_list[i] = enc_list[i].transform(test_data[:, [index]]).toarray()
        
        new_columns.extend([columns[index]+'_'+str(j) for j in range(len(enc_list[i].get_feature_names()))])

    train_data_list.append(train_data[:, numeric_index])
    test_data_list.append(test_data[:, numeric_index])
    new_columns.extend(numeric_columns)
    
    trans_train_data = np.hstack(train_data_list)
    trans_test_data = np.hstack(test_data_list)
    
    return trans_train_data, trans_test_data, enc_list, new_columns

In [3]:
def build_xgb_model(dtrain, dtest, params, iteration=1000):
    
    t0 = time()
    evals_result = dict()

    xg_reg = xgb.train(params=params, 
                       dtrain=dtrain, 
                       num_boost_round=iteration,  
                       early_stopping_rounds=100, 
                       evals=[(dtrain,'train'), (dtest,'test')], 
                       evals_result = evals_result,
                       verbose_eval=50)
    
    print('training cost time:', time() - t0)
    
    return xg_reg, evals_result

In [4]:
def cal_regession_score(true, pred):
    
    scores = {
        'RMSE': round(mean_squared_error(true, pred, squared=False), 2), 
        'MAE(mean)': round(mean_absolute_error(true, pred), 2), 
        'MAE(median)': round(median_absolute_error(true, pred), 2)
    }    
    
    return scores

def cal_classification_score(true, pred):
    
    scores = {
        'AccuracyScore': round(accuracy_score(true, pred), 4), 
        'RecallScore': round(recall_score(true, pred, average='macro'), 4),
        'PrecisionScore': round(precision_score(true, pred, average='macro'), 4),
        'F1Score': round(f1_score(true, pred, average='macro'), 4),
    }      
    
    return scores

In [5]:
sample_path = './data/sample_50k'

x_train = np.load(os.path.join(sample_path, 'Normal', 'x_train.npy'), allow_pickle=True)
x_test = np.load(os.path.join(sample_path, 'Normal', 'x_test.npy'), allow_pickle=True)

Y_train = np.load(os.path.join(sample_path, 'Normal', 'y_train.npy'), allow_pickle=True)
Y_test = np.load(os.path.join(sample_path, 'Normal', 'y_test.npy'), allow_pickle=True)

chid_mapper = np.load(os.path.join(sample_path, 'sample_50k_chid_idx_map.npy'), allow_pickle=True).item()
feat_mapper = np.load(os.path.join(sample_path, 'Normal', 'feature_map.npy'), allow_pickle=True).item()

columns = np.load(os.path.join(sample_path, 'Normal', 'columns.npy'), allow_pickle=True).item()

print(x_train.shape, x_test.shape, Y_train.shape, Y_test.shape, len(chid_mapper))
print([(k, len(v)) for k, v in feat_mapper.items()])

(1033871, 541) (100000, 541) (1033871, 6) (100000, 6) 50000
[('masts', 3), ('educd', 6), ('naty', 2), ('trdtp', 27), ('poscd', 9), ('cuorg', 30)]


In [6]:
print(columns['y_columns'])

['chid', 'data_dt', 'objam_sum', 'objam_mean', 'trans_count', 'shop_count']


In [8]:
category_cols = columns['x_columns'][:7]
numeric_cols = columns['x_columns'][7:-1]

print(category_cols, len(numeric_cols))

['chid', 'masts', 'educd', 'naty', 'trdtp', 'poscd', 'cuorg'] 534


In [9]:
trans_x_train, trans_x_test, enc_list, trans_columns = to_one_hot(x_train, x_test, category_cols[1:], numeric_cols[:], 
                                                                  columns['x_columns'][:-1])
trans_x_train.shape, trans_x_test.shape, len(trans_columns)

((1033871, 611), (100000, 611), 611)

## objam_sum

In [10]:
## ['reg:reg:squarederror', 'reg:squaredlogerror']
## ['multi:softmax', 'multi:softprob']

params = {'objective':'reg:squarederror', 'learning_rate': 0.05, 'max_depth': 5, 
          'subsample':0.7, 'min_child_weight':4, 'eval_metric':'rmse', 'n_jobs':40}

In [11]:
# regession
index = columns['y_columns'].index('objam_sum')
train_objsum = Y_train[:, [index]].astype(np.float64)
test_objsum = Y_test[:, [index]].astype(np.float64)

dtrain = xgb.DMatrix(data=trans_x_train, label=train_objsum, feature_names=trans_columns)
dtest  = xgb.DMatrix(data=trans_x_test, label=test_objsum, feature_names=trans_columns) 

In [12]:
xg_objsum, evals_result = build_xgb_model(dtrain, dtest, params)

[0]	train-rmse:847072.56250	test-rmse:612736.43750
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 100 rounds.
[50]	train-rmse:776249.06250	test-rmse:505209.62500
[100]	train-rmse:765608.12500	test-rmse:495661.59375
[150]	train-rmse:758501.75000	test-rmse:488269.18750
[200]	train-rmse:753843.31250	test-rmse:488051.56250
[250]	train-rmse:749900.06250	test-rmse:487316.46875
[300]	train-rmse:746428.50000	test-rmse:484258.90625
[350]	train-rmse:743447.25000	test-rmse:483936.12500
[400]	train-rmse:735649.00000	test-rmse:484672.06250
Stopping. Best iteration:
[334]	train-rmse:744354.87500	test-rmse:482252.00000

training cost time: 492.8558645248413


In [13]:
train_pred = xg_objsum.predict(dtrain, ntree_limit=xg_objsum.best_ntree_limit)
test_pred = xg_objsum.predict(dtest, ntree_limit=xg_objsum.best_ntree_limit)

train_pred.shape, test_pred.shape

((1033871,), (100000,))

In [14]:
train_scores = cal_regession_score(train_pred, dtrain.get_label())
test_scores = cal_regession_score(test_pred, dtest.get_label())

train_scores, test_scores

({'RMSE': 744367.25, 'MAE(mean)': 77246.84, 'MAE(median)': 29321.8},
 {'RMSE': 482252.1, 'MAE(mean)': 85155.9, 'MAE(median)': 29321.8})

## trans_count

In [15]:
## ['reg:reg:squarederror', 'reg:squaredlogerror']
## ['multi:softmax', 'multi:softprob']

params = {'objective':'reg:squarederror', 'learning_rate': 0.05, 'max_depth': 5, 
          'subsample':0.7, 'min_child_weight':4, 'eval_metric':'rmse', 'n_jobs':40}

In [16]:
# regession
index = columns['y_columns'].index('trans_count')
train_tscnt = Y_train[:, [index]].astype(np.float64)
test_tscnt = Y_test[:, [index]].astype(np.float64)

dtrain = xgb.DMatrix(data=trans_x_train, label=train_tscnt, feature_names=trans_columns)
dtest  = xgb.DMatrix(data=trans_x_test, label=test_tscnt, feature_names=trans_columns) 

In [17]:
xg_tscnt, evals_result = build_xgb_model(dtrain, dtest, params)

[0]	train-rmse:9.79032	test-rmse:18.08149
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 100 rounds.
[50]	train-rmse:4.78807	test-rmse:10.16167
[100]	train-rmse:4.59638	test-rmse:9.56393
[150]	train-rmse:4.53524	test-rmse:9.54441
[200]	train-rmse:4.48769	test-rmse:9.56729
[250]	train-rmse:4.44222	test-rmse:9.58942
Stopping. Best iteration:
[157]	train-rmse:4.52940	test-rmse:9.49582

training cost time: 278.20790338516235


In [18]:
train_pred = xg_tscnt.predict(dtrain, ntree_limit=xg_tscnt.best_ntree_limit)
test_pred = xg_tscnt.predict(dtest, ntree_limit=xg_tscnt.best_ntree_limit)

train_pred.shape, test_pred.shape

((1033871,), (100000,))

In [19]:
train_scores = cal_regession_score(train_pred, dtrain.get_label())
test_scores = cal_regession_score(test_pred, dtest.get_label())

train_scores, test_scores

({'RMSE': 4.53, 'MAE(mean)': 2.42, 'MAE(median)': 1.29},
 {'RMSE': 9.5, 'MAE(mean)': 2.81, 'MAE(median)': 1.3})

## shop_count

In [20]:
## ['reg:reg:squarederror', 'reg:squaredlogerror']
## ['multi:softmax', 'multi:softprob']

params = {'objective':'reg:squarederror', 'learning_rate': 0.1, 'max_depth': 5, 
          'subsample':0.7, 'min_child_weight':4, 'eval_metric':'rmse', 'n_jobs':40}

In [21]:
# regession
index = columns['y_columns'].index('shop_count')
train_spcnt = Y_train[:, [index]].astype(np.float64)
test_spcnt = Y_test[:, [index]].astype(np.float64)

dtrain = xgb.DMatrix(data=trans_x_train, label=train_spcnt, feature_names=trans_columns)
dtest  = xgb.DMatrix(data=trans_x_test, label=test_spcnt, feature_names=trans_columns) 

In [22]:
xg_spcnt, evals_result = build_xgb_model(dtrain, dtest, params)

[0]	train-rmse:3.81768	test-rmse:4.83613
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 100 rounds.
[50]	train-rmse:2.07166	test-rmse:2.32030
[100]	train-rmse:2.05293	test-rmse:2.30616
[150]	train-rmse:2.04326	test-rmse:2.30398
[200]	train-rmse:2.03537	test-rmse:2.30258
[250]	train-rmse:2.02962	test-rmse:2.30102
[300]	train-rmse:2.02414	test-rmse:2.30076
[350]	train-rmse:2.01934	test-rmse:2.30025
[400]	train-rmse:2.01482	test-rmse:2.30031
Stopping. Best iteration:
[345]	train-rmse:2.01974	test-rmse:2.30004

training cost time: 485.8045542240143


In [23]:
train_pred = xg_spcnt.predict(dtrain, ntree_limit=xg_spcnt.best_ntree_limit)
test_pred = xg_spcnt.predict(dtest, ntree_limit=xg_spcnt.best_ntree_limit)

train_pred.shape, test_pred.shape

((1033871,), (100000,))

In [24]:
train_scores = cal_regession_score(train_pred, dtrain.get_label())
test_scores = cal_regession_score(test_pred, dtest.get_label())

train_scores, test_scores

({'RMSE': 2.02, 'MAE(mean)': 1.32, 'MAE(median)': 0.81},
 {'RMSE': 2.3, 'MAE(mean)': 1.44, 'MAE(median)': 0.81})

## label_0

In [25]:
## ['reg:reg:squarederror', 'reg:squaredlogerror']
## ['binary:logistic', 'multi:softmax', 'multi:softprob']

params = {'objective':'multi:softmax', 'learning_rate': 0.1, 'max_depth': 5, 
          'subsample':0.7, 'min_child_weight':4, 'eval_metric':'merror', 'num_class':2, 'n_jobs':40}

In [26]:
#classfication
bounds = [0]
lable_trans = np.vectorize(lambda x: sum([x > bound for bound in bounds]))

train_label_0 = lable_trans(train_objsum)
test_label_0 = lable_trans(test_objsum)

dtrain = xgb.DMatrix(data=trans_x_train, label=train_label_0, feature_names=trans_columns)
dtest  = xgb.DMatrix(data=trans_x_test, label=test_label_0, feature_names=trans_columns) 

print(np.unique(train_label_0).shape[0], np.unique(test_label_0).shape[0], train_label_0.shape, test_label_0.shape)

2 2 (1033871, 1) (100000, 1)


In [27]:
xg_label_0, evals_result = build_xgb_model(dtrain, dtest, params, iteration=1000)

[0]	train-merror:0.18071	test-merror:0.15964
Multiple eval metrics have been passed: 'test-merror' will be used for early stopping.

Will train until test-merror hasn't improved in 100 rounds.
[50]	train-merror:0.17893	test-merror:0.15876
[100]	train-merror:0.17824	test-merror:0.15833
[150]	train-merror:0.17778	test-merror:0.15817
[200]	train-merror:0.17732	test-merror:0.15787
[250]	train-merror:0.17706	test-merror:0.15775
[300]	train-merror:0.17686	test-merror:0.15766
[350]	train-merror:0.17661	test-merror:0.15752
[400]	train-merror:0.17645	test-merror:0.15752
[450]	train-merror:0.17626	test-merror:0.15759
Stopping. Best iteration:
[362]	train-merror:0.17660	test-merror:0.15743

training cost time: 996.2755992412567


In [28]:
train_pred = xg_label_0.predict(dtrain, ntree_limit=xg_label_0.best_ntree_limit)
test_pred = xg_label_0.predict(dtest, ntree_limit=xg_label_0.best_ntree_limit)

train_pred.shape, test_pred.shape

((1033871,), (100000,))

In [29]:
train_scores = cal_classification_score(train_pred, dtrain.get_label())
test_scores = cal_classification_score(test_pred, dtest.get_label())

train_scores, test_scores

({'AccuracyScore': 0.8234,
  'RecallScore': 0.7886,
  'PrecisionScore': 0.7797,
  'F1Score': 0.7839},
 {'AccuracyScore': 0.8426,
  'RecallScore': 0.8128,
  'PrecisionScore': 0.8062,
  'F1Score': 0.8093})

## label_multiple

In [30]:
## ['reg:reg:squarederror', 'reg:squaredlogerror']
## ['binary:logistic', 'multi:softmax', 'multi:softprob']

params = {'objective':'multi:softmax', 'learning_rate': 0.08, 'max_depth': 5, 
          'subsample':0.7, 'min_child_weight':4, 'eval_metric':'merror', 'num_class':6, 'n_jobs':40}

In [31]:
bounds = [0, 1e4, 5e4, 1e5, 3e5]
lable_trans = np.vectorize(lambda x: sum([x > bound for bound in bounds]))
train_label_mul = lable_trans(train_objsum)
test_label_mul = lable_trans(test_objsum)

dtrain = xgb.DMatrix(data=trans_x_train, label=train_label_mul, feature_names=trans_columns)
dtest  = xgb.DMatrix(data=trans_x_test, label=test_label_mul, feature_names=trans_columns) 

print(np.unique(train_label_mul).shape[0], np.unique(test_label_mul).shape[0], train_label_mul.shape, test_label_mul.shape)

6 6 (1033871, 1) (100000, 1)


In [32]:
xg_label_mul, evals_result = build_xgb_model(dtrain, dtest, params, iteration=1000)

[0]	train-merror:0.49944	test-merror:0.48353
Multiple eval metrics have been passed: 'test-merror' will be used for early stopping.

Will train until test-merror hasn't improved in 100 rounds.
[50]	train-merror:0.49168	test-merror:0.47659
[100]	train-merror:0.48823	test-merror:0.47435
[150]	train-merror:0.48600	test-merror:0.47331
[200]	train-merror:0.48409	test-merror:0.47214
[250]	train-merror:0.48269	test-merror:0.47142
[300]	train-merror:0.48140	test-merror:0.47121
[350]	train-merror:0.48036	test-merror:0.47130
[400]	train-merror:0.47921	test-merror:0.47108
[450]	train-merror:0.47816	test-merror:0.47043
[500]	train-merror:0.47727	test-merror:0.47038
[550]	train-merror:0.47633	test-merror:0.47018
[600]	train-merror:0.47543	test-merror:0.46996
Stopping. Best iteration:
[532]	train-merror:0.47667	test-merror:0.46994

training cost time: 4256.379280328751


In [33]:
train_pred = xg_label_mul.predict(dtrain, ntree_limit=xg_label_mul.best_ntree_limit)
test_pred = xg_label_mul.predict(dtest, ntree_limit=xg_label_mul.best_ntree_limit)

train_pred.shape, test_pred.shape

((1033871,), (100000,))

In [34]:
train_scores = cal_classification_score(train_pred, dtrain.get_label())
test_scores = cal_classification_score(test_pred, dtest.get_label())

train_scores, test_scores

({'AccuracyScore': 0.5233,
  'RecallScore': 0.5044,
  'PrecisionScore': 0.3988,
  'F1Score': 0.4083},
 {'AccuracyScore': 0.5301,
  'RecallScore': 0.4927,
  'PrecisionScore': 0.4119,
  'F1Score': 0.4176})