In [1]:
import os
import gc
import math

import pandas as pd
import numpy as np

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge
from sklearn.preprocessing import MinMaxScaler


from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import log_loss, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score

from tqdm import tqdm
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings('ignore')

In [2]:
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2 
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('testA.csv')
train.head()

Unnamed: 0,id,heartbeat_signals,label
0,0,"0.9912297987616655,0.9435330436439665,0.764677...",0.0
1,1,"0.9714822034884503,0.9289687459588268,0.572932...",0.0
2,2,"1.0,0.9591487564065292,0.7013782792997189,0.23...",2.0
3,3,"0.9757952826275774,0.9340884687738161,0.659636...",0.0
4,4,"0.0,0.055816398940721094,0.26129357194994196,0...",2.0


In [4]:
train = train.join(train['heartbeat_signals'].str.split(',').apply(pd.Series).astype('float'))
train.drop(columns='heartbeat_signals', inplace=True)
train = train.rename({i:f's_{i}' for i in range(205)}, axis=1)

test = test.join(test['heartbeat_signals'].str.split(',').apply(pd.Series).astype('float'))
test.drop(columns='heartbeat_signals', inplace=True)
test = test.rename({i:f's_{i}' for i in range(205)}, axis=1)

In [5]:
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

Memory usage of dataframe is 157.93 MB
Memory usage after optimization is: 39.67 MB
Decreased by 74.9%
Memory usage of dataframe is 31.43 MB
Memory usage after optimization is: 7.90 MB
Decreased by 74.9%


In [6]:
X_train = train.drop(columns=['id', 'label'])
y_train = train['label']
X_test = test.drop(columns='id')

In [23]:
seed = 23
params = {
                'boosting_type': 'gbdt',
                'objective': 'multiclass',
                'num_class': 4,
                'num_leaves': 2 ** 5,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': seed,
                'nthread': 28,
                'n_jobs':24,
                'verbose': -1,
            }

In [45]:
param_grid = {
        'num_leaves':(10, 200),
        'max_depth':(3, 20),
        'bagging_fraction':(0.5, 1.0),
        'feature_fraction':(0.5, 1.0),
        'bagging_freq':(0, 100),
        'min_data_in_leaf':(10,100),
        'min_child_weight':(0, 10),
        'min_split_gain':(0.0, 1.0),
        'reg_alpha':(0.0, 10),
        'reg_lambda':(0.0, 10),
    }

In [None]:
"""通过网格搜索确定最优参数"""
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import f1_score

# 设置5折交叉验证
cv_fold = KFold(n_splits=5, shuffle=True, random_state=2021)

model_lgb = lgb.LGBMClassifier(**params)

f1 = make_scorer(f1_score, average='micro')
grid_search = RandomizedSearchCV(estimator=model_lgb, param_distributions=param_grid, cv=cv_fold, scoring=f1)
grid_search.fit(X_train, y_train)

In [66]:
print(grid_search.best_estimator_)
print(grid_search.best_params_)
print(grid_search.best_score_)

LGBMClassifier(bagging_fraction=1.0, bagging_freq=0, feature_fraction=0.5,
               max_depth=20, min_child_weight=10, min_data_in_leaf=10,
               n_jobs=24, nthread=28, num_class=4, num_leaves=200,
               objective='multiclass', reg_alpha=10, reg_lambda=10, seed=23,
               verbose=-1)
{'reg_lambda': 10, 'reg_alpha': 10, 'num_leaves': 200, 'min_split_gain': 0.0, 'min_data_in_leaf': 10, 'min_child_weight': 10, 'max_depth': 20, 'feature_fraction': 0.5, 'bagging_freq': 0, 'bagging_fraction': 1.0}
0.9808399999999999


In [61]:
random_results = pd.DataFrame(grid_search.cv_results_).drop(columns='params').sort_values('mean_test_score', ascending=False)
random_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_reg_lambda,param_reg_alpha,param_num_leaves,param_min_split_gain,param_min_data_in_leaf,param_min_child_weight,...,param_bagging_freq,param_bagging_fraction,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,6.814429,0.391619,0.25717,0.00518,10,10,200,0,10,10,...,0,1.0,0.98025,0.97985,0.98165,0.9799,0.98255,0.98084,0.001076,1
0,5.259064,0.034182,0.19089,0.00185,0,0,200,1,100,0,...,100,0.5,0.97675,0.97795,0.97805,0.976,0.97925,0.9776,0.001125,2
8,3.898224,0.175918,0.18658,0.006944,0,10,10,1,100,10,...,0,1.0,0.9676,0.96585,0.968,0.96575,0.96955,0.96735,0.001424,3
5,5.243801,0.086056,0.176331,0.000808,10,0,10,0,10,0,...,0,0.5,0.96655,0.96685,0.96775,0.96625,0.96925,0.96733,0.001083,4
6,3.846641,0.077165,0.158775,0.014486,0,0,200,1,100,0,...,0,0.5,0.95475,0.95645,0.95635,0.95615,0.95885,0.95651,0.001322,5
1,2.968158,0.06549,0.151794,0.004059,10,10,200,0,10,0,...,100,1.0,0.9541,0.95535,0.95735,0.95475,0.9585,0.95601,0.001653,6
3,3.894835,0.065135,0.152591,0.003278,0,10,10,1,10,10,...,0,0.5,0.9544,0.9553,0.9564,0.95525,0.9575,0.95577,0.001073,7
2,3.928005,0.146244,0.150397,0.005863,0,10,10,0,10,0,...,100,1.0,0.95405,0.95535,0.9558,0.9544,0.95805,0.95553,0.001409,8
4,3.966175,0.095525,0.153589,0.003732,0,10,200,0,10,10,...,0,0.5,0.9543,0.9553,0.95605,0.9549,0.9571,0.95553,0.000969,8
7,3.860762,0.107416,0.147906,0.005996,0,10,10,0,10,10,...,0,0.5,0.9543,0.9553,0.95605,0.9549,0.9571,0.95553,0.000969,8


In [67]:
model_lgb = grid_search.best_estimator_

In [70]:
param_grid = {'num_leaves': range(10, 200, 10), 'max_depth': range(3, 20, 5)}

In [None]:
grid_search = GridSearchCV(estimator=model_lgb, param_grid=param_grid, cv=cv_fold, scoring=f1)
grid_search.fit(X_train, y_train)

In [72]:
print(grid_search.best_estimator_)
print(grid_search.best_params_)
print(grid_search.best_score_)

LGBMClassifier(bagging_fraction=1.0, bagging_freq=0, feature_fraction=0.5,
               max_depth=13, min_child_weight=10, min_data_in_leaf=10,
               n_jobs=24, nthread=28, num_class=4, num_leaves=130,
               objective='multiclass', reg_alpha=10, reg_lambda=10, seed=23,
               verbose=-1)
{'max_depth': 13, 'num_leaves': 130}
0.9810599999999999


In [73]:
model_lgb = grid_search.best_estimator_

In [74]:
model_lgb

LGBMClassifier(bagging_fraction=1.0, bagging_freq=0, feature_fraction=0.5,
               max_depth=13, min_child_weight=10, min_data_in_leaf=10,
               n_jobs=24, nthread=28, num_class=4, num_leaves=130,
               objective='multiclass', reg_alpha=10, reg_lambda=10, seed=23,
               verbose=-1)

In [None]:
lgb_score = cross_val_score(model_lgb, X_train, y_train, cv=5, scoring=f1)

In [78]:
lgb_score

array([0.98185, 0.9817 , 0.98115, 0.9812 , 0.98055])

In [156]:
param_grid = {
    'depth': [4, 7, 10],
    'learning_rate' : [0.03, 0.1, 0.15],
    'l2_leaf_reg': [1,4,9],
    'iterations': [300, 500, 1200],
    'early_stopping_rounds': [300],
    'task_type':['GPU'],
    'loss_function':['MultiClass'],
}

In [157]:
y_train = y_train.astype(int)

In [None]:
from catboost import CatBoostClassifier

model_catbst = CatBoostClassifier()
grid_search = RandomizedSearchCV(estimator=model_catbst, param_distributions=param_grid, cv=cv_fold, scoring=f1)
grid_search.fit(X_train, y_train)

In [160]:
print(grid_search.best_estimator_)
print(grid_search.best_params_)
print(grid_search.best_score_)

<catboost.core.CatBoostClassifier object at 0x000001AF8D60FC50>
{'task_type': 'GPU', 'loss_function': 'MultiClass', 'learning_rate': 0.15, 'l2_leaf_reg': 9, 'iterations': 1200, 'early_stopping_rounds': 300, 'depth': 10}
0.98633


In [161]:
model_catbst = grid_search.best_estimator_

In [131]:
from sklearn.ensemble import RandomForestClassifier

In [145]:
param_grid = {
    'bootstrap': [True, False],
     'max_depth': range(1, 20),
     'max_features': ['auto', 'sqrt'],
     'min_samples_leaf': range(5, 50, 5),
     'min_samples_split': range(2, 20, 2),
     'n_estimators': range(10, 300, 30),
             }

In [147]:
model_rfc = RandomForestClassifier()
grid_search = RandomizedSearchCV(estimator=model_rfc, param_distributions=param_grid, cv=cv_fold, n_jobs=-1, verbose=1, scoring=f1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 10.6min finished


RandomizedSearchCV(cv=KFold(n_splits=5, random_state=2021, shuffle=True),
                   estimator=RandomForestClassifier(), n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': range(1, 20),
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': range(5, 50, 5),
                                        'min_samples_split': range(2, 20, 2),
                                        'n_estimators': range(10, 300, 30)},
                   scoring=make_scorer(f1_score, average=micro), verbose=1)

In [148]:
print(grid_search.best_estimator_)
print(grid_search.best_params_)
print(grid_search.best_score_)

RandomForestClassifier(max_depth=16, min_samples_leaf=10, min_samples_split=14,
                       n_estimators=250)
{'n_estimators': 250, 'min_samples_split': 14, 'min_samples_leaf': 10, 'max_features': 'auto', 'max_depth': 16, 'bootstrap': True}
0.96485


In [153]:
model_rfc = grid_search.best_estimator_

In [187]:
pred_test_lgb = model_lgb.predict_proba(X_test)
pred_test_cat = model_catbst.predict_proba(X_test)
pred_test_rfc = model_rfc.predict_proba(X_test)

In [189]:
pred_test_all = 1/3 * pred_test_lgb + 1/3 * pred_test_cat + 1/3 * pred_test_rfc

In [184]:
# pred_test_all = np.where(pred_test_all==pred_test_all.max(axis=1, keepdims=1), 1, 0)

In [190]:
pred_test_all

array([[9.94342866e-01, 2.94359198e-03, 6.34052780e-04, 2.07948903e-03],
       [6.23273783e-03, 1.60086811e-02, 9.76923295e-01, 8.35285742e-04],
       [3.01286218e-03, 5.29911399e-05, 9.29845168e-03, 9.87635695e-01],
       ...,
       [2.48777223e-01, 8.74333133e-03, 7.31456412e-01, 1.10230340e-02],
       [9.94787842e-01, 3.27794097e-03, 1.07380088e-03, 8.60416512e-04],
       [8.14006901e-01, 3.39308022e-02, 7.38080572e-02, 7.82542396e-02]])

In [191]:
df = pd.read_csv('sample_submit.csv')

df[['label_0', 'label_1', 'label_2', 'label_3']] = pred_test_all

df.to_csv('submission_prob.csv', index=False)

# stacking

In [12]:
train = pd.read_csv('train.csv')
test = pd.read_csv('testA.csv')
train.head()

Unnamed: 0,id,heartbeat_signals,label
0,0,"0.9912297987616655,0.9435330436439665,0.764677...",0.0
1,1,"0.9714822034884503,0.9289687459588268,0.572932...",0.0
2,2,"1.0,0.9591487564065292,0.7013782792997189,0.23...",2.0
3,3,"0.9757952826275774,0.9340884687738161,0.659636...",0.0
4,4,"0.0,0.055816398940721094,0.26129357194994196,0...",2.0


In [13]:
train = train.join(train['heartbeat_signals'].str.split(',').apply(pd.Series).astype('float'))
train.drop(columns='heartbeat_signals', inplace=True)
train = train.rename({i:f's_{i}' for i in range(205)}, axis=1)

test = test.join(test['heartbeat_signals'].str.split(',').apply(pd.Series).astype('float'))
test.drop(columns='heartbeat_signals', inplace=True)
test = test.rename({i:f's_{i}' for i in range(205)}, axis=1)

In [14]:
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

Memory usage of dataframe is 157.93 MB
Memory usage after optimization is: 39.67 MB
Decreased by 74.9%
Memory usage of dataframe is 31.43 MB
Memory usage after optimization is: 7.90 MB
Decreased by 74.9%


In [15]:
train_X_data = train.drop(columns=['id', 'label'])
train_X_target = train['label'].astype(int)
X_test = test.drop(columns='id')

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
def stacking(model, train_data, train_target, test_data, n_fold):
    skf = StratifiedKFold(n_splits=n_fold, random_state=1)  # StratifiedKFold 默认分层采样
    train_pred = np.zeros((train_data.shape[0], 1), int)   # 存储训练集预测结果
    test_pred = np.zeros((test_data.shape[0], 1), int)  # 存储测试集预测结果 行数：len(test_data) ,列数：1列
    for skf_index, (train_index, val_index) in enumerate(skf.split(train_data, train_target)):
        print('第 ', skf_index+1, ' 折交叉验证开始... ')
        # 训练集划分
        x_train, x_val = train_data.iloc[train_index], train_data.iloc[val_index]
        y_train, y_val = train_target.iloc[train_index], train_target.iloc[val_index]
        # 模型构建
        y_train = np.ravel(y_train)   # 向量转成数组
        model.fit(X=x_train, y=y_train)
        # 模型预测
        accs = accuracy_score(y_val, model.predict(x_val))
        print('第 ', skf_index+1, ' 折交叉验证 :  accuracy ： ', accs)
 
        # 训练集预测结果
        val_pred = model.predict(x_val)
        for i in range(len(val_index)):
            train_pred[val_index[i]] = val_pred[i]
        # 保存测试集预测结果
        test_pred = np.column_stack((test_pred, model.predict(test_data)))  # 将矩阵按列合并
 
    test_pred_mean = np.mean(test_pred, axis=1)  # 按行计算均值(会出现小数)
    test_pred_mean = pd.DataFrame(test_pred_mean)   # 转成DataFrame
    test_pred_mean = test_pred_mean.apply(lambda x: round(x))  # 小数需要四舍五入成整数
    return np.ravel(test_pred_mean), train_pred

In [17]:
x_train, x_test, y_train, y_test = train_test_split(train_X_data.values, train_X_target.values, test_size=0.3, random_state=1)
x_train = pd.DataFrame(x_train)
y_train = pd.DataFrame(y_train)
x_test = pd.DataFrame(X_test.values)

# LGBMClassifier
rf = LGBMClassifier(bagging_fraction=1.0, bagging_freq=0, feature_fraction=0.5,
               max_depth=20, min_child_weight=10, min_data_in_leaf=10,
               n_jobs=24, nthread=28, num_class=4, num_leaves=200,
               objective='multiclass', reg_alpha=10, reg_lambda=10, seed=23,
               verbose=-1)
print('==============================LGBMClassifier==============================')
rf_test_pred, rf_train_pred = stacking(model=rf, train_data=x_train, train_target=y_train, test_data=x_test, n_fold=5)
rf_test_pred = pd.DataFrame(rf_test_pred)
print(rf_train_pred)
rf_train_pred = pd.DataFrame(rf_train_pred)

# CatBoostClassifier
dt = CatBoostClassifier(task_type='GPU', loss_function='MultiClass', learning_rate=0.15, 
                        l2_leaf_reg=9, iterations=600, early_stopping_rounds=300, depth=10)
print('==============================CatBoostClassifier==============================')
dt_test_pred, dt_train_pred = stacking(model=dt, train_data=x_train, train_target=y_train, test_data=x_test, n_fold=5)
dt_test_pred = pd.DataFrame(dt_test_pred)
dt_train_pred = pd.DataFrame(dt_train_pred)

# RandomForestClassifier
knn = RandomForestClassifier(max_depth=16, min_samples_leaf=10, min_samples_split=14,
                       n_estimators=250)
print('==============================RandomForestClassifier==============================')
knn_test_pred, knn_train_pred = stacking(model=knn, train_data=x_train, train_target=y_train, test_data=x_test, n_fold=5)
knn_test_pred = pd.DataFrame(knn_test_pred)
knn_train_pred = pd.DataFrame(knn_train_pred)

# rf_train_pred,dt_train_pred,knn_train_pred 合并生成次级训练集 train_set
# rf_test_pred,dt_test_pred,knn_test_pred 合并生成次级测试集集 test_set
train_set = pd.concat([rf_train_pred, dt_train_pred, knn_train_pred],  axis=1)
test_set = pd.concat([rf_test_pred, dt_test_pred, knn_test_pred],  axis=1)

clf = LogisticRegression(solver='lbfgs')
clf.fit(train_set, y_train)
y_submission = clf.predict(test_set)

第  1  折交叉验证开始... 
第  1  折交叉验证 :  accuracy ：  0.9772857142857143
第  2  折交叉验证开始... 
第  2  折交叉验证 :  accuracy ：  0.9775714285714285
第  3  折交叉验证开始... 
第  3  折交叉验证 :  accuracy ：  0.9797857142857143
第  4  折交叉验证开始... 
第  4  折交叉验证 :  accuracy ：  0.9778571428571429
第  5  折交叉验证开始... 
第  5  折交叉验证 :  accuracy ：  0.979
[[2]
 [2]
 [3]
 ...
 [3]
 [0]
 [2]]
第  1  折交叉验证开始... 
0:	learn: 1.0708810	total: 82ms	remaining: 49.1s
1:	learn: 0.8815917	total: 158ms	remaining: 47.2s
2:	learn: 0.7482107	total: 242ms	remaining: 48.1s
3:	learn: 0.6493685	total: 321ms	remaining: 47.9s
4:	learn: 0.5729787	total: 399ms	remaining: 47.4s
5:	learn: 0.5116835	total: 476ms	remaining: 47.1s
6:	learn: 0.4614295	total: 559ms	remaining: 47.4s
7:	learn: 0.4188251	total: 641ms	remaining: 47.4s
8:	learn: 0.3822514	total: 722ms	remaining: 47.4s
9:	learn: 0.3522395	total: 799ms	remaining: 47.1s
10:	learn: 0.3266627	total: 879ms	remaining: 47s
11:	learn: 0.3043195	total: 953ms	remaining: 46.7s
12:	learn: 0.2855765	total: 1.03s	remain

In [18]:
y_submission

array([0, 2, 2, ..., 2, 0, 0])

In [21]:
np.eye(4)[y_submission].shape

(20000, 4)

In [22]:
df = pd.read_csv('sample_submit.csv')

df[['label_0', 'label_1', 'label_2', 'label_3']] = np.eye(4)[y_submission]

df.to_csv('submission_prob.csv', index=False)