In [1]:
import pandas as pd 
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import lightgbm as lgb 
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import log_loss
import warnings
warnings.filterwarnings('ignore')
import gc
from scipy import sparse

In [2]:
df_train = pd.read_csv("/usr/local/codeData/RS-J2/6--train.csv")
df_test = pd.read_csv("/usr/local/codeData/RS-J2/6--test.csv")
print('read data end')
df_train.drop(['Id'], axis = 1, inplace = True)
df_test.drop(['Id'], axis = 1, inplace = True)

df_test['Label'] = -1

data = pd.concat([df_train, df_test])
data = data.fillna(-1)
data.to_csv("/usr/local/codeData/RS-J2/6--data.csv", index = False)

read data end


In [3]:
continuous_feature = ['I'] * 13
continuous_feature = [col + str(i + 1) for i, col in enumerate(continuous_feature)]
print('continuous_feature',continuous_feature)
category_feature = ['C'] * 26
category_feature = [col + str(i + 1) for i, col in enumerate(category_feature)]
print('category_feature',category_feature)
# discrite one-hot encoding
print('begin one-hot:')
for col in category_feature:
    onehot_feats = pd.get_dummies(data[col], prefix = col)
    data.drop([col], axis = 1, inplace = True)
    data = pd.concat([data, onehot_feats], axis = 1)
print('one-hot end')

continuous_feature ['I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9', 'I10', 'I11', 'I12', 'I13']
category_feature ['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26']
begin one-hot:
one-hot end


In [4]:
train = data[data['Label'] != -1]
target = train.pop('Label')
test = data[data['Label'] == -1]
test.drop(['Label'], axis = 1, inplace = True)

print('split train and testset:')
x_train, x_val, y_train, y_val = train_test_split(train, target, test_size = 0.2, random_state = 2022)

split train and testset:


In [5]:
 print('begin train gbdt:')
gbm = lgb.LGBMRegressor(objective='binary',
                        subsample= 0.8,
                        min_child_weight= 0.5,
                        colsample_bytree= 0.7,
                        num_leaves=100,
                        max_depth = 12,
                        learning_rate=0.05,
                        n_estimators=10,
                        )

gbm.fit(x_train, y_train,
        eval_set = [(x_train, y_train), (x_val, y_val)],
        eval_names = ['train', 'val'],
        eval_metric = 'binary_logloss',
        # early_stopping_rounds = 100,
        )
model = gbm.booster_

begin train gbdt:
[1]	train's binary_logloss: 0.507483	val's binary_logloss: 0.483406
[2]	train's binary_logloss: 0.49782	val's binary_logloss: 0.481068
[3]	train's binary_logloss: 0.487312	val's binary_logloss: 0.477497
[4]	train's binary_logloss: 0.47779	val's binary_logloss: 0.475343
[5]	train's binary_logloss: 0.469364	val's binary_logloss: 0.473924
[6]	train's binary_logloss: 0.460938	val's binary_logloss: 0.471526
[7]	train's binary_logloss: 0.452637	val's binary_logloss: 0.468564
[8]	train's binary_logloss: 0.445008	val's binary_logloss: 0.466346
[9]	train's binary_logloss: 0.437874	val's binary_logloss: 0.465221
[10]	train's binary_logloss: 0.431197	val's binary_logloss: 0.463057


In [6]:
print('train to get leaf:')
gbdt_feats_train = model.predict(train, pred_leaf = True)
gbdt_feats_test = model.predict(test, pred_leaf = True)
gbdt_feats_name = ['gbdt_leaf_' + str(i) for i in range(gbdt_feats_train.shape[1])]
df_train_gbdt_feats = pd.DataFrame(gbdt_feats_train, columns = gbdt_feats_name) 
print('df_train_gbdt_feats',df_train_gbdt_feats)
df_test_gbdt_feats = pd.DataFrame(gbdt_feats_test, columns = gbdt_feats_name)

train to get leaf:
df_train_gbdt_feats       gbdt_leaf_0  gbdt_leaf_1  gbdt_leaf_2  gbdt_leaf_3  gbdt_leaf_4  \
0              10           20           25           36            7   
1              16           40            7           15           17   
2               2           13            6            6           13   
3              39            4           36           35           34   
4              18           30            5            5           14   
...           ...          ...          ...          ...          ...   
1594           32           37           37           37           38   
1595           17           21           45           19           18   
1596           20           32           27           22            8   
1597           32           15           17           37           22   
1598           32            0           37           37           45   

      gbdt_leaf_5  gbdt_leaf_6  gbdt_leaf_7  gbdt_leaf_8  gbdt_leaf_9  
0           

In [7]:
print('create new dataset:')
train = pd.concat([train, df_train_gbdt_feats], axis = 1)
test = pd.concat([test, df_test_gbdt_feats], axis = 1)
train_len = train.shape[0]
data = pd.concat([train, test])
del train
del test
gc.collect()

create new dataset:


64

In [8]:
# leafs one-hot
print('begin one-hot:')
for col in gbdt_feats_name:
    print('this is feature:', col)
    onehot_feats = pd.get_dummies(data[col], prefix = col)
    data.drop([col], axis = 1, inplace = True)
    data = pd.concat([data, onehot_feats], axis = 1)
print('one-hot ending')

train = data[: train_len]
test = data[train_len:]
del data
gc.collect()

begin one-hot:
this is feature: gbdt_leaf_0
this is feature: gbdt_leaf_1
this is feature: gbdt_leaf_2
this is feature: gbdt_leaf_3
this is feature: gbdt_leaf_4
this is feature: gbdt_leaf_5
this is feature: gbdt_leaf_6
this is feature: gbdt_leaf_7
this is feature: gbdt_leaf_8
this is feature: gbdt_leaf_9
one-hot ending


0

In [9]:
x_train, x_val, y_train, y_val = train_test_split(train, target, test_size = 0.3, random_state = 2022)
    # lr
print('beging train lr:')
lr = LogisticRegression()
lr.fit(x_train, y_train)
tr_logloss = log_loss(y_train, lr.predict_proba(x_train)[:, 1])
print('tr-logloss: ', tr_logloss)
val_logloss = log_loss(y_val, lr.predict_proba(x_val)[:, 1])
print('val-logloss: ', val_logloss)
print('begin predict:')
y_pred = lr.predict_proba(test)[:, 1]
print('write log:')
res = pd.read_csv("/usr/local/codeData/RS-J2/6--test.csv")
log = pd.DataFrame({'Id': res['Id'], 'Label': y_pred})
log.to_csv('log/log_gbdt+lr_trlogloss_%s_vallogloss_%s.csv' % (tr_logloss, val_logloss), index = False)
print('end')

beging train lr:
tr-logloss:  0.5250307266747974
val-logloss:  0.5126283534188775
begin predict:
write log:
end
