In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from catboost import CatBoostClassifier, CatBoostRegressor
from lightgbm import LGBMClassifier
from scipy import stats
import warnings
import lightgbm
import catboost
# Install a global 'ignore' filter so no warnings are shown by later cells.
warnings.filterwarnings('ignore')
print('lgb', lightgbm.__version__)# 2.3.0 was used
print('cbt', catboost.__version__)# 0.18 was used

lgb 2.3.0
cbt 0.18.1


In [4]:
def read_data(model, use_P9=True):
    """Load the training and test CSV files and drop the unused columns.

    The three files ``first_round_training_data.csv``,
    ``second_round_training_data.csv`` and ``second_round_testing_data.csv``
    are read from the module-level ``path`` prefix.  The two training files
    are stacked, then Parameter1-4 are dropped from both frames and
    Attribute1-3 from the training frame.

    Parameters
    ----------
    model : str
        ``'cbt'`` or ``'lgb'``.  Any other value only prints
        ``'not supported model!'`` and returns the frames with Parameter9
        left untouched.
    use_P9 : bool, default True
        Only consulted when ``model == 'cbt'``.  If True, Parameter9 is kept
        and the test frame is restricted to rows where it is present.  If
        False, Parameter9 is dropped from both frames and the test frame is
        restricted to rows where it is missing.  For ``'lgb'`` Parameter9 is
        always dropped and no test rows are filtered.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(data, test_data)``.  ``data`` carries a fresh 0..n-1 index;
        ``test_data`` keeps the row labels it had in the CSV, so predictions
        made on the two Parameter9 subsets can be re-ordered by index later.
    """
    print('Reading data ...')
    data_1 = pd.read_csv(path + 'first_round_training_data.csv')
    data_2 = pd.read_csv(path + 'second_round_training_data.csv')
    test_data = pd.read_csv(path + 'second_round_testing_data.csv')

    # DataFrame.append was removed in pandas 2.0; concat is the supported
    # equivalent.  ignore_index replaces the manual re-indexing and
    # sort=False keeps the column order append() produced.
    data = pd.concat([data_1, data_2], ignore_index=True, sort=False)

    # Discard Parameter1-4 and Attribute1-3.
    drop_columns_P = ['Parameter%d' % x for x in range(1, 5)]
    drop_columns_A = ['Attribute%d' % x for x in range(1, 4)]
    data = data.drop(drop_columns_P + drop_columns_A, axis=1)
    test_data = test_data.drop(drop_columns_P, axis=1)

    # Two CatBoost models are trained (with / without Parameter9); the
    # LightGBM model never uses Parameter9.
    if model == 'cbt':
        has_p9 = test_data.Parameter9.notnull()
        if use_P9:
            # Explicit copy: callers add columns to this frame, which must
            # not be a view onto the unfiltered test set.
            test_data = test_data[has_p9].copy()
        else:
            test_data = test_data[~has_p9].drop('Parameter9', axis=1)
            data = data.drop('Parameter9', axis=1)

    elif model == 'lgb':
        data = data.drop('Parameter9', axis=1)
        test_data = test_data.drop('Parameter9', axis=1)

    else:
        print('not supported model!')
    return data, test_data

def preprocessing(model, data, test_data):
    """Encode the label, Box-Cox transform the attributes, add Parameter11.

    Both frames are modified in place and the same objects are returned.

    Parameters
    ----------
    model : str
        When equal to ``'cbt'`` the extra feature
        ``Parameter11 = Parameter5 + 1.05 * Parameter6`` is added to both
        frames; for any other value no feature is added.
    data : pandas.DataFrame
        Training frame holding ``Quality_label`` and ``Attribute4`` ..
        ``Attribute10``.
    test_data : pandas.DataFrame
        Test frame; only touched for the Parameter11 feature.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(data, test_data)``.
    """
    print('Preprocessing ...')

    # Integer class codes: Excellent=0, Good=1, Pass=2, Fail=3.
    label_codes = {'Excellent': 0, 'Good': 1, 'Pass': 2, 'Fail': 3}
    data['Quality_label'] = data['Quality_label'].map(label_codes)

    # Box-Cox transform every Attribute column of the training frame; the
    # fitted lambda returned alongside the values is not kept.
    attribute_cols = ['Attribute%d' % idx for idx in range(4, 11)]
    for name in attribute_cols:
        transformed, _fitted_lambda = stats.boxcox(data[name])
        data[name] = transformed

    # The engineered feature is only used by the CatBoost pipeline.
    if model != 'cbt':
        return data, test_data
    for frame in (data, test_data):
        frame['Parameter11'] = frame.Parameter5 + 1.05 * frame.Parameter6
    return data, test_data

def train_val_split(data):
    """Split ``data`` into a 90 % training part and a 10 % validation part.

    The split is stratified on ``data['Quality_label']`` and uses a fixed
    ``random_state`` of 2019.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_data, val_data)``, each an independent copy of the selected
        rows.
    """
    print('Train test spliting ...')
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=.1, random_state=2019)
    # n_splits=1, so the generator yields exactly one (train, val) index pair.
    train_ix, val_ix = next(splitter.split(data, data['Quality_label']))
    return data.iloc[train_ix].copy(), data.iloc[val_ix].copy()

def _attribute_regressor(model):
    """Return an unfitted CatBoostRegressor configured for ``model``."""
    if model == 'cbt':
        return CatBoostRegressor(iterations=5000,
                                 learning_rate=0.03,
                                 verbose=False,
                                 loss_function='RMSE',
                                 task_type='CPU',
                                 bootstrap_type='MVS')
    return CatBoostRegressor(iterations=1500,
                             learning_rate=0.03,
                             loss_function='RMSE',
                             verbose=False,
                             task_type="CPU",
                             random_state=0,
                             boosting_type='Ordered',
                             boost_from_average=False,
                             bootstrap_type='Bayesian',
                             )


def generate_Attribute_features(model, train_data, val_data, test_data):
    """Replace every Attribute column with a prediction made from the Parameters.

    For each column in the module-level ``Attribute_list`` a CatBoost
    regressor is fitted on ``train_data[Parameter_list]`` and the column is
    overwritten (or created, for ``test_data``) with that regressor's
    predictions.  The frames are modified in place.

    Parameters
    ----------
    model : str
        ``'cbt'``: fit with ``val_data`` as the early-stopping evaluation
        set, print the fitted parameters, and also overwrite the columns of
        ``val_data``.
        ``'lgb'``: fit on ``train_data`` only; ``val_data`` is ignored.
    train_data, test_data : pandas.DataFrame
    val_data : pandas.DataFrame or None
        Only used when ``model == 'cbt'``.

    Returns
    -------
    tuple
        ``(train_data, val_data, test_data)`` for ``'cbt'``;
        ``(train_data, test_data)`` for ``'lgb'``.

    Raises
    ------
    ValueError
        If ``model`` is neither ``'cbt'`` nor ``'lgb'``.  Previously this
        case fell through and returned ``None``.
    """
    if model not in ('cbt', 'lgb'):
        raise ValueError("not supported model: %r (expected 'cbt' or 'lgb')" % (model,))

    # Predict Attribute4-10 from the Parameter columns.
    print('Generating attribute features ...')
    use_validation = model == 'cbt'
    for col in Attribute_list:
        cat_reg = _attribute_regressor(model)
        if use_validation:
            cat_reg.fit(train_data[Parameter_list],
                        train_data[col],
                        eval_set=(val_data[Parameter_list], val_data[col]),
                        early_stopping_rounds=300,
                        use_best_model=True)
            print(cat_reg.get_all_params())
            val_data[col] = cat_reg.predict(val_data[Parameter_list])
        else:
            cat_reg.fit(train_data[Parameter_list], train_data[col])
        # Predictions only read Parameter_list, so overwriting the target
        # column here does not affect the regressors for later columns.
        train_data[col] = cat_reg.predict(train_data[Parameter_list])
        test_data[col] = cat_reg.predict(test_data[Parameter_list])

    if use_validation:
        return train_data, val_data, test_data
    return train_data, test_data

def modeling(model):
    """Train the final classifier and predict class probabilities for the test set.

    This function takes no data arguments: it reads ``train_data``,
    ``val_data`` (``'cbt'`` only), ``test_data``, ``Parameter_list`` and
    ``Attribute_list`` from module scope, so those must be set before calling.

    Parameters
    ----------
    model : str
        ``'cbt'``: CatBoostClassifier on Parameters + ``Parameter11`` +
        Attributes, early-stopped on ``val_data``.
        ``'lgb'``: LGBMClassifier on Parameters + Attributes, fitted on
        ``train_data`` with no validation set.

    Returns
    -------
    pandas.DataFrame
        The classifier's ``predict_proba`` output for ``test_data``, indexed
        like ``test_data``.

    Raises
    ------
    ValueError
        If ``model`` is neither ``'cbt'`` nor ``'lgb'``.  Previously this
        surfaced as an ``UnboundLocalError`` on the return statement.
    """
    if model not in ('cbt', 'lgb'):
        raise ValueError("not supported model: %r (expected 'cbt' or 'lgb')" % (model,))

    print('Training model ...')
    if model == 'cbt':
        train_y, val_y = train_data.Quality_label, val_data.Quality_label
        cols = Parameter_list + ['Parameter11'] + Attribute_list
        cat_clf = CatBoostClassifier(iterations=3000,
                                     learning_rate=0.01,
                                     verbose=False,
                                     task_type='CPU',
                                     loss_function='MultiClass')
        cat_clf.fit(train_data[cols],
                    train_y,
                    eval_set=(val_data[cols], val_y),
                    early_stopping_rounds=500,
                    use_best_model=True)
        res = pd.DataFrame(cat_clf.predict_proba(test_data[cols]), index=test_data.index)
    else:
        lgb_clf = LGBMClassifier(boosting_type='gbdt',
                                 objective='multiclass',
                                 n_estimators=1600,
                                 num_class=4,
                                 learning_rate=0.01)
        cols = Parameter_list + Attribute_list
        train_y = train_data.Quality_label
        lgb_clf.fit(train_data[cols], train_y)
        res = pd.DataFrame(lgb_clf.predict_proba(test_data[cols]), index=test_data.index)

    return res

def results(model, res1, res2, n_rows=6000, group_size=50):
    """Average per-row class probabilities over consecutive groups of rows.

    Parameters
    ----------
    model : str
        ``'cbt'``: ``res1`` and ``res2`` are the predictions for the two
        disjoint test subsets; they are stacked and re-ordered by index.
        ``'lgb'``: ``res1`` already covers all rows and ``res2`` is ignored.
    res1, res2 : pandas.DataFrame
        Four-column probability frames (``res2`` may be None for ``'lgb'``).
    n_rows : int, default 6000
        Number of leading rows to aggregate.
    group_size : int, default 50
        Number of consecutive rows averaged into one output row.

    Returns
    -------
    pandas.DataFrame
        One row per group with the four input columns labelled, in order,
        ``'Excellent ratio'``, ``'Good ratio'``, ``'Pass ratio'`` and
        ``'Fail ratio'``.

    Raises
    ------
    ValueError
        If ``model`` is neither ``'cbt'`` nor ``'lgb'``.  Previously this
        surfaced as an ``UnboundLocalError``.
    """
    # Stitch the test-set predictions together and aggregate them per group.
    print('Generating results ...')
    if model == 'cbt':
        # DataFrame.append was removed in pandas 2.0; concat is the supported
        # equivalent.  sort_index restores the original test-row order.
        res = pd.concat([res1, res2]).sort_index().values
    elif model == 'lgb':
        res = np.asarray(res1)
    else:
        raise ValueError("not supported model: %r (expected 'cbt' or 'lgb')" % (model,))

    result = [list(np.mean(res[start:start + group_size], axis=0))
              for start in range(0, n_rows, group_size)]

    result = pd.DataFrame(result,
                          columns=['Excellent ratio', 'Good ratio', 'Pass ratio', 'Fail ratio'])
    return result

In [None]:
def _run_catboost_pass(use_P9):
    """Run load -> preprocess -> split -> attribute features -> classifier once.

    modeling() and generate_Attribute_features() read ``train_data``,
    ``val_data``, ``test_data``, ``Attribute_list`` and ``Parameter_list``
    from module scope, so every intermediate result is published as a global
    rather than kept local.  Returns the test-set probability frame produced
    by modeling().
    """
    global data, test_data, train_data, val_data, Attribute_list, Parameter_list
    data, test_data = read_data(model=model, use_P9=use_P9)
    data, test_data = preprocessing(model=model, data=data, test_data=test_data)
    train_data, val_data = train_val_split(data)
    Attribute_list = [name for name in train_data.columns if name[0] == 'A']
    Parameter_list = [name for name in train_data.columns
                      if name[0] == 'P' and name != 'Parameter11']
    train_data, val_data, test_data = generate_Attribute_features(
        model=model, train_data=train_data, val_data=val_data, test_data=test_data)
    return modeling(model=model)


# ---------------------- cbt ----------------------
# Pass 1: train without Parameter9 and predict the test rows whose P9 is missing.
print('--------------catboost: drop P9--------------')
path = 'dataSets/'
model = 'cbt'
res1 = _run_catboost_pass(use_P9=False)
print('catboost model 1 is finished!')

# Pass 2: train with Parameter9 and predict the test rows whose P9 is present.
print('--------------catboost: use P9--------------')
res2 = _run_catboost_pass(use_P9=True)
cbt_result = results(model=model, res1=res1, res2=res2)
print('catboost model 2 is finished!')

# ---------------------- lgb ----------------------
# Single pass without Parameter9 and without a validation split.
print('-----------------lgb: drop P9---------------')
model = 'lgb'
Parameter_list = ['Parameter%d' % idx for idx in (5, 6, 7, 8, 10)]
Attribute_list = ['Attribute%d' % idx for idx in range(4, 11)]
train_data, test_data = read_data(model=model, use_P9=False)
train_data, test_data = preprocessing(model=model, data=train_data, test_data=test_data)
train_data, test_data = generate_Attribute_features(
    model=model, train_data=train_data, val_data=None, test_data=test_data)
res = modeling(model=model)
lgb_result = results(model=model, res1=res, res2=None)

# Blend the two models with equal weight and write the submission file.
final_result = lgb_result / 2 + cbt_result / 2
final_result.to_csv('res.csv')

--------------catboost: drop P9--------------
Reading data ...
Preprocessing ...
Train test spliting ...
Generating attribute features ...
{'nan_mode': 'Min', 'eval_metric': 'RMSE', 'iterations': 5000, 'sampling_frequency': 'PerTree', 'fold_permutation_block': 0, 'leaf_estimation_method': 'Newton', 'od_pval': 0, 'boosting_type': 'Plain', 'feature_border_type': 'GreedyLogSum', 'bayesian_matrix_reg': 0.1000000015, 'l2_leaf_reg': 3, 'random_strength': 1, 'od_type': 'Iter', 'rsm': 1, 'boost_from_average': True, 'model_size_reg': 0.5, 'approx_on_full_history': False, 'subsample': 0.8000000119, 'use_best_model': True, 'od_wait': 300, 'random_seed': 0, 'depth': 6, 'has_time': False, 'fold_len_multiplier': 2, 'border_count': 254, 'classes_count': 0, 'sparse_features_conflict_fraction': 0, 'leaf_estimation_backtracking': 'AnyImprovement', 'best_model_min_trees': 1, 'model_shrink_rate': 0, 'loss_function': 'RMSE', 'learning_rate': 0.02999999933, 'score_function': 'Cosine', 'task_type': 'CPU', 'l

In [None]:
 boosting_type='Ordered',boost_from_average=False, bootstrap_type='Bayesian',