In [1]:
import pandas as pd
import numpy as np
from tslearn.metrics import dtw, dtw_path, dtw_path_from_metric
from pandas.core.frame import DataFrame

import os
import gc
import math

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge
from sklearn.preprocessing import MinMaxScaler


from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

from collections import Counter
from tsfresh import extract_features,select_features
from tsfresh.utilities.dataframe_functions import impute
from tqdm import tqdm
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings('ignore')

### 数据

In [2]:
train_csv = './train.csv' 
testA_csv = './testA.csv'
ts_test = './ts_test.pkl'

In [3]:
train = pd.read_csv(train_csv)
test = pd.read_csv(testA_csv)
ts_test = pd.read_pickle(ts_test)

* 训练集

In [4]:
train.head()

Unnamed: 0,id,heartbeat_signals,label
0,0,"0.9912297987616655,0.9435330436439665,0.764677...",0.0
1,1,"0.9714822034884503,0.9289687459588268,0.572932...",0.0
2,2,"1.0,0.9591487564065292,0.7013782792997189,0.23...",2.0
3,3,"0.9757952826275774,0.9340884687738161,0.659636...",0.0
4,4,"0.0,0.055816398940721094,0.26129357194994196,0...",2.0


* 测试集(不含label，毕竟用来预测嘛)

In [5]:
test.head()

Unnamed: 0,id,heartbeat_signals
0,100000,"0.9915713654170097,1.0,0.6318163407681274,0.13..."
1,100001,"0.6075533139615096,0.5417083883163654,0.340694..."
2,100002,"0.9752726292239277,0.6710965234906665,0.686758..."
3,100003,"0.9956348033996116,0.9170249621481004,0.521096..."
4,100004,"1.0,0.8879490481178918,0.745564725322326,0.531..."


* Tsfresh提出的特征数据：'label'=-1 为测试集数据

In [6]:
impute(ts_test)
ts_test.head()

Unnamed: 0,heartbeat_signals__variance_larger_than_standard_deviation,heartbeat_signals__has_duplicate_max,heartbeat_signals__has_duplicate_min,heartbeat_signals__has_duplicate,heartbeat_signals__sum_values,heartbeat_signals__abs_energy,heartbeat_signals__mean_abs_change,heartbeat_signals__mean_change,heartbeat_signals__mean_second_derivative_central,heartbeat_signals__median,...,heartbeat_signals__permutation_entropy__dimension_6__tau_1,heartbeat_signals__permutation_entropy__dimension_7__tau_1,heartbeat_signals__query_similarity_count__query_None__threshold_0.0,"heartbeat_signals__matrix_profile__feature_""min""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""max""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""mean""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""median""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""25""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""75""__threshold_0.98",label
0,0.0,0.0,1.0,1.0,38.927945,18.216197,0.019894,-0.004859,0.000117,0.125531,...,2.500658,2.722686,0.0,6.445546,12.165525,10.246524,10.746992,8.388625,11.48491,0.0
1,0.0,0.0,1.0,1.0,19.445634,7.705092,0.019952,-0.004762,0.000105,0.030481,...,3.065802,3.224835,0.0,3.20914,12.649111,9.031069,9.437545,6.72318,12.094899,0.0
2,0.0,0.0,1.0,1.0,21.192974,9.140423,0.009863,-0.004902,0.000101,0.0,...,1.406001,1.509478,0.0,3.054539,8.246211,7.370478,8.246211,5.966122,8.246211,2.0
3,0.0,0.0,1.0,1.0,42.113066,15.757623,0.018743,-0.004783,0.000103,0.241397,...,3.534354,3.854177,0.0,3.010557,9.797959,6.33136,6.40644,5.266743,7.091706,0.0
4,0.0,0.0,1.0,1.0,69.756786,51.229616,0.014514,0.0,-0.000137,0.0,...,2.165627,2.323993,0.0,9.181236,13.429784,9.959913,9.51629,9.286013,10.270925,2.0


- 将Tsfresh库弄出来的数据切割分出来为   
train_features 10w 行的训练集(不带label)  这里用select_features与label进行了相关性过滤了一下

In [7]:
train_features = select_features(ts_test.iloc[:100000,:787],ts_test['label'].iloc[:100000])

In [8]:
train_features.head()

Unnamed: 0,heartbeat_signals__sum_values,"heartbeat_signals__fft_coefficient__attr_""abs""__coeff_38","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_37","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_36","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_35","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_34","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_33","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_32","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_31","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_30",...,"heartbeat_signals__fft_coefficient__attr_""abs""__coeff_84","heartbeat_signals__fft_coefficient__attr_""imag""__coeff_97","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_90","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_94","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_92","heartbeat_signals__fft_coefficient__attr_""real""__coeff_97","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_75","heartbeat_signals__fft_coefficient__attr_""real""__coeff_88","heartbeat_signals__fft_coefficient__attr_""real""__coeff_92","heartbeat_signals__fft_coefficient__attr_""real""__coeff_83"
0,38.927945,0.660949,1.090709,0.848728,1.168685,0.982133,1.223496,1.2363,1.104172,1.497129,...,0.531883,-0.047438,0.55437,0.307586,0.564596,0.56296,0.591859,0.504124,0.52845,0.473568
1,19.445634,1.718217,1.280923,1.850706,1.460752,1.924501,1.925485,1.715938,2.079957,1.818636,...,0.56359,-0.109579,0.697446,0.398073,0.640969,0.270192,0.224925,0.645082,0.635135,0.297325
2,21.192974,1.814281,1.619051,1.215343,1.787166,2.146987,1.68619,1.540137,2.291031,2.403422,...,0.712487,-0.074042,0.321703,0.390386,0.716929,0.316524,0.422077,0.722742,0.68059,0.383754
3,42.113066,2.10955,0.619634,2.366413,2.071539,1.00034,2.728281,1.391727,2.017176,2.610492,...,0.601499,-0.184248,0.564669,0.623353,0.46698,0.651774,0.308915,0.550097,0.466904,0.494024
4,69.756786,0.194549,0.348882,0.092119,0.653924,0.231422,1.080003,0.711244,1.357904,1.237998,...,0.015292,0.070505,0.065835,0.05178,0.09294,0.103773,0.179405,-0.089611,0.091841,0.056867


In [9]:
# 对全部的ts_test进行相应的操作
train_cha = list(set(ts_test.columns) - set(train_features.columns))
train_cha.remove('label')
ts_test = ts_test.drop(train_cha,axis=1)

In [10]:
ts_test.head()

Unnamed: 0,heartbeat_signals__has_duplicate_max,heartbeat_signals__has_duplicate_min,heartbeat_signals__sum_values,heartbeat_signals__abs_energy,heartbeat_signals__mean_abs_change,heartbeat_signals__mean_change,heartbeat_signals__mean_second_derivative_central,heartbeat_signals__median,heartbeat_signals__mean,heartbeat_signals__standard_deviation,...,heartbeat_signals__permutation_entropy__dimension_5__tau_1,heartbeat_signals__permutation_entropy__dimension_6__tau_1,heartbeat_signals__permutation_entropy__dimension_7__tau_1,"heartbeat_signals__matrix_profile__feature_""min""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""max""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""mean""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""median""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""25""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""75""__threshold_0.98",label
0,0.0,1.0,38.927945,18.216197,0.019894,-0.004859,0.000117,0.125531,0.189892,0.229783,...,2.18442,2.500658,2.722686,6.445546,12.165525,10.246524,10.746992,8.388625,11.48491,0.0
1,0.0,1.0,19.445634,7.705092,0.019952,-0.004762,0.000105,0.030481,0.094857,0.16908,...,2.710933,3.065802,3.224835,3.20914,12.649111,9.031069,9.437545,6.72318,12.094899,0.0
2,0.0,1.0,21.192974,9.140423,0.009863,-0.004902,0.000101,0.0,0.10338,0.184119,...,1.26337,1.406001,1.509478,3.054539,8.246211,7.370478,8.246211,5.966122,8.246211,2.0
3,0.0,1.0,42.113066,15.757623,0.018743,-0.004783,0.000103,0.241397,0.20543,0.186186,...,2.986728,3.534354,3.854177,3.010557,9.797959,6.33136,6.40644,5.266743,7.091706,0.0
4,0.0,1.0,69.756786,51.229616,0.014514,0.0,-0.000137,0.0,0.340277,0.366213,...,1.914511,2.165627,2.323993,9.181236,13.429784,9.959913,9.51629,9.286013,10.270925,2.0


In [11]:
ts_test.head()

Unnamed: 0,heartbeat_signals__has_duplicate_max,heartbeat_signals__has_duplicate_min,heartbeat_signals__sum_values,heartbeat_signals__abs_energy,heartbeat_signals__mean_abs_change,heartbeat_signals__mean_change,heartbeat_signals__mean_second_derivative_central,heartbeat_signals__median,heartbeat_signals__mean,heartbeat_signals__standard_deviation,...,heartbeat_signals__permutation_entropy__dimension_5__tau_1,heartbeat_signals__permutation_entropy__dimension_6__tau_1,heartbeat_signals__permutation_entropy__dimension_7__tau_1,"heartbeat_signals__matrix_profile__feature_""min""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""max""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""mean""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""median""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""25""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""75""__threshold_0.98",label
0,0.0,1.0,38.927945,18.216197,0.019894,-0.004859,0.000117,0.125531,0.189892,0.229783,...,2.18442,2.500658,2.722686,6.445546,12.165525,10.246524,10.746992,8.388625,11.48491,0.0
1,0.0,1.0,19.445634,7.705092,0.019952,-0.004762,0.000105,0.030481,0.094857,0.16908,...,2.710933,3.065802,3.224835,3.20914,12.649111,9.031069,9.437545,6.72318,12.094899,0.0
2,0.0,1.0,21.192974,9.140423,0.009863,-0.004902,0.000101,0.0,0.10338,0.184119,...,1.26337,1.406001,1.509478,3.054539,8.246211,7.370478,8.246211,5.966122,8.246211,2.0
3,0.0,1.0,42.113066,15.757623,0.018743,-0.004783,0.000103,0.241397,0.20543,0.186186,...,2.986728,3.534354,3.854177,3.010557,9.797959,6.33136,6.40644,5.266743,7.091706,0.0
4,0.0,1.0,69.756786,51.229616,0.014514,0.0,-0.000137,0.0,0.340277,0.366213,...,1.914511,2.165627,2.323993,9.181236,13.429784,9.959913,9.51629,9.286013,10.270925,2.0


### 提取「训练集」和「测试集」中的数据

In [12]:
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2 
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [13]:
train_list = []

for items in train.values:
    train_list.append([items[0]] + [float(i) for i in items[1].split(',')] + [items[2]])

train2 = pd.DataFrame(np.array(train_list))
train2.columns = ['id'] + ['s_'+str(i) for i in range(len(train_list[0])-2)] + ['label']
train2 = reduce_mem_usage(train2)

test_list=[]
for items in test.values:
    test_list.append([items[0]] + [float(i) for i in items[1].split(',')])

test2 = pd.DataFrame(np.array(test_list))
test2.columns = ['id'] + ['s_'+str(i) for i in range(len(test_list[0])-1)]
test2 = reduce_mem_usage(test2)


Memory usage of dataframe is 157.93 MB
Memory usage after optimization is: 39.67 MB
Decreased by 74.9%
Memory usage of dataframe is 31.43 MB
Memory usage after optimization is: 7.90 MB
Decreased by 74.9%


### 特征工程
#### 特征构建

In [14]:
def  psfeatureTime(data):
    #data = pd.Series(data)
    # 均值
    df_mean = data.mean(axis=1)
    # 方差
    df_var = data.var(axis=1)
    # 标准差
    df_std = data.std(axis=1)
    # 均方根
    df_rms = pow((pow(df_mean, 2) + pow(df_std, 2)),0.5)
    # 偏度
    df_skew = data.skew(axis=1)
    # 峭度
#    df_kurt = data.kurt(axis=1)

#     df_min = data.min(axis=1)
#     df_max = data.max(axis=1)
    df_sum = data.sum(axis=1)
    
    column = ['df_mean', 'df_var', 'df_std', 'df_rms', 'df_skew' ,'df_sum']
    featuretime_list = pd.concat([df_mean, df_var, df_std, df_rms, df_skew ,df_sum],axis=1)
    
    featuretime_list.columns = column


    return featuretime_list

In [15]:
train_time_feature = psfeatureTime(train2.drop(['id','label'],axis=1))

In [16]:
train2 = pd.concat([train2,train_time_feature],axis=1)

In [17]:
test_time_feature = psfeatureTime(test2.drop(['id'],axis=1))

In [18]:
test2 = pd.concat([test2,test_time_feature],axis=1)

In [19]:
test2

Unnamed: 0,id,s_0,s_1,s_2,s_3,s_4,s_5,s_6,s_7,s_8,...,s_201,s_202,s_203,s_204,df_mean,df_var,df_std,df_rms,df_skew,df_sum
0,100000.0,0.991699,1.000000,0.631836,0.136230,0.041412,0.102722,0.120850,0.123413,0.107910,...,0.000000,0.000000,0.000000,0.00000,0.093872,0.029434,0.171509,0.195557,3.527344,19.234375
1,100001.0,0.607422,0.541504,0.340576,0.000000,0.090698,0.164917,0.195068,0.168823,0.198853,...,0.350586,0.350586,0.350586,0.36377,0.411865,0.017822,0.133545,0.433105,1.609375,84.312500
2,100002.0,0.975098,0.670898,0.686523,0.708496,0.718750,0.716797,0.720703,0.701660,0.596680,...,0.000000,0.000000,0.000000,0.00000,0.232788,0.049927,0.223389,0.322754,0.729004,47.781250
3,100003.0,0.995605,0.916992,0.520996,0.000000,0.221802,0.404053,0.490479,0.527344,0.518066,...,0.000000,0.000000,0.000000,0.00000,0.229370,0.087646,0.296143,0.374756,0.702637,47.062500
4,100004.0,1.000000,0.888184,0.745605,0.531738,0.380371,0.224609,0.091125,0.057648,0.003914,...,0.000000,0.000000,0.000000,0.00000,0.121521,0.035370,0.188110,0.223999,2.562500,24.906250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,119995.0,1.000000,0.833008,0.634277,0.639160,0.624023,0.598145,0.613770,0.624023,0.628906,...,0.000000,0.000000,0.000000,0.00000,0.210205,0.048676,0.220581,0.304688,0.874023,43.187500
19996,119996.0,1.000000,0.826172,0.452148,0.082214,0.000000,0.137085,0.201050,0.165649,0.158081,...,0.000000,0.000000,0.000000,0.00000,0.151123,0.047913,0.218872,0.265869,1.446289,31.031250
19997,119997.0,0.951660,0.916504,0.667480,0.352051,0.255371,0.197388,0.173584,0.141968,0.134521,...,0.000000,0.000000,0.000000,0.00000,0.154175,0.040100,0.200195,0.252686,1.541992,31.656250
19998,119998.0,0.927734,0.677246,0.242920,0.055359,0.102112,0.072266,0.021011,0.038300,0.048553,...,0.000000,0.000000,0.000000,0.00000,0.094299,0.023376,0.152954,0.179688,3.144531,19.312500


#### 特征选择  
all_features 是tsfresh处理700多维经过相关性,方差,树模型过滤后的数据 shape(12000,44)

In [20]:
all_features = pd.read_pickle('./205_0.5_300_62.pkl')

In [21]:
all_features.head()

Unnamed: 0,heartbeat_signals__sum_values,heartbeat_signals__abs_energy,heartbeat_signals__skewness,heartbeat_signals__kurtosis,heartbeat_signals__absolute_sum_of_changes,heartbeat_signals__longest_strike_below_mean,heartbeat_signals__longest_strike_above_mean,heartbeat_signals__count_above_mean,heartbeat_signals__count_below_mean,heartbeat_signals__sum_of_reoccurring_values,...,"heartbeat_signals__fft_coefficient__attr_""angle""__coeff_6","heartbeat_signals__fft_coefficient__attr_""angle""__coeff_8","heartbeat_signals__fft_aggregated__aggtype_""centroid""","heartbeat_signals__fft_aggregated__aggtype_""variance""","heartbeat_signals__fft_aggregated__aggtype_""kurtosis""",heartbeat_signals__value_count__value_0,"heartbeat_signals__augmented_dickey_fuller__attr_""usedlag""__autolag_""AIC""",heartbeat_signals__number_crossing_m__m_0,heartbeat_signals__permutation_entropy__dimension_6__tau_1,heartbeat_signals__permutation_entropy__dimension_7__tau_1
0,38.927945,18.216197,1.349485,1.908603,4.058359,92.0,72.0,95.0,110.0,6.827155,...,70.470093,145.307714,20.048266,640.867764,5.659187,93.0,15.0,3.0,2.500658,2.722686
1,19.445634,7.705092,3.663488,15.174346,4.070173,98.0,44.0,82.0,123.0,2.312404,...,13.357471,35.674928,24.557446,657.447951,6.283027,84.0,4.0,3.0,3.065802,3.224835
2,21.192974,9.140423,1.841456,3.868159,2.012112,148.0,51.0,55.0,150.0,4.634713,...,-87.961613,-15.800108,25.010491,723.128806,5.731839,149.0,0.0,3.0,1.406001,1.509478
3,42.113066,15.757623,1.401586,4.354385,3.823527,60.0,102.0,121.0,84.0,8.769288,...,92.320874,-0.300105,21.766021,672.013235,5.526573,61.0,4.0,3.0,3.534354,3.854177
4,69.756786,51.229616,0.254199,-1.761625,2.960919,105.0,97.0,97.0,108.0,0.0,...,170.81431,-163.080746,8.833336,276.246018,11.70839,106.0,15.0,2.0,2.165627,2.323993


In [22]:
train2 = pd.concat([train2,all_features.iloc[:100000,:]],axis = 1)

In [23]:
train2

Unnamed: 0,id,s_0,s_1,s_2,s_3,s_4,s_5,s_6,s_7,s_8,...,"heartbeat_signals__fft_coefficient__attr_""angle""__coeff_6","heartbeat_signals__fft_coefficient__attr_""angle""__coeff_8","heartbeat_signals__fft_aggregated__aggtype_""centroid""","heartbeat_signals__fft_aggregated__aggtype_""variance""","heartbeat_signals__fft_aggregated__aggtype_""kurtosis""",heartbeat_signals__value_count__value_0,"heartbeat_signals__augmented_dickey_fuller__attr_""usedlag""__autolag_""AIC""",heartbeat_signals__number_crossing_m__m_0,heartbeat_signals__permutation_entropy__dimension_6__tau_1,heartbeat_signals__permutation_entropy__dimension_7__tau_1
0,0.0,0.991211,0.943359,0.764648,0.618652,0.379639,0.190796,0.040222,0.026001,0.031708,...,70.470093,145.307714,20.048266,640.867764,5.659187,93.0,15.0,3.0,2.500658,2.722686
1,1.0,0.971680,0.929199,0.572754,0.178467,0.122986,0.132324,0.094421,0.089600,0.030487,...,13.357471,35.674928,24.557446,657.447951,6.283027,84.0,4.0,3.0,3.065802,3.224835
2,2.0,1.000000,0.958984,0.701172,0.231812,0.000000,0.080688,0.128418,0.187500,0.280762,...,-87.961613,-15.800108,25.010491,723.128806,5.731839,149.0,0.0,3.0,1.406001,1.509478
3,3.0,0.975586,0.934082,0.659668,0.249878,0.237061,0.281494,0.249878,0.249878,0.241455,...,92.320874,-0.300105,21.766021,672.013235,5.526573,61.0,4.0,3.0,3.534354,3.854177
4,4.0,0.000000,0.055817,0.261230,0.359863,0.433105,0.453613,0.499023,0.542969,0.616699,...,170.814310,-163.080746,8.833336,276.246018,11.708390,106.0,15.0,2.0,2.165627,2.323993
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99995.0,1.000000,0.677734,0.222412,0.257080,0.204712,0.054657,0.026154,0.118164,0.244873,...,-102.974924,67.069934,21.635824,725.527235,4.655643,53.0,2.0,3.0,3.391830,3.679969
99996,99996.0,0.926758,0.906250,0.637207,0.415039,0.374756,0.382568,0.358887,0.341309,0.336426,...,-70.331800,-83.949604,17.867152,619.474051,5.538888,32.0,14.0,3.0,3.728881,4.095457
99997,99997.0,0.925781,0.587402,0.633301,0.632324,0.639160,0.614258,0.599121,0.517578,0.403809,...,-71.893787,-62.051662,22.520028,836.224067,4.528923,77.0,0.0,1.0,2.996962,3.293562
99998,99998.0,1.000000,0.994629,0.829590,0.458252,0.264160,0.240234,0.213745,0.189331,0.203857,...,-57.040559,-63.885977,22.078285,686.549262,5.500729,47.0,2.0,1.0,3.793512,4.018302


In [24]:
temp = all_features.iloc[100000:,:]

In [25]:
temp = temp.reset_index(drop=True)

In [26]:
test2 = pd.concat([test2,temp],axis = 1)

In [27]:
test2['df_std']

0        0.171509
1        0.133545
2        0.223389
3        0.296143
4        0.188110
           ...   
19995    0.220581
19996    0.218872
19997    0.200195
19998    0.152954
19999    0.217529
Name: df_std, Length: 20000, dtype: float16

In [28]:
Counter(train2['label'])

Counter({0.0: 64327, 2.0: 14199, 3.0: 17912, 1.0: 3562})

In [29]:
# ts_train = ts_test[ts_test['label']>-1]
# ts_train.head()

In [30]:
# Counter(ts_train['label'])

In [31]:
# ts_test1 = ts_test[ts_test['label']==-1]
# ts_test1 = ts_test1.drop(['label'], axis=1)
# ts_test1.head()   # ts_test1 为tsfres-->相关性选择后的测试集 5rows * 707columns

### 留2/10作为本地测试样本

In [32]:
#ratio为[0, 1]
def data_split(data, ratio):
    cnt = len(data)
    num = int(cnt * ratio)
    
    loc_train = data.iloc[num:]
    loc_test = data.iloc[:num]
    
    return loc_train, loc_test

####  原始数据保留2/10

In [33]:
# # train2_train, train2_test = data_split(train2, 0.2)
# data_split(train)

In [34]:
# train2_test

In [35]:
# train2_train

In [36]:
# Counter(train2_train['label'])

### 赛题预测集数据

In [37]:
true_test_data = test2

In [38]:
# true_test_data = pd.concat([true_test_data,test2['id']],axis=1)

In [39]:
true_test_data

Unnamed: 0,id,s_0,s_1,s_2,s_3,s_4,s_5,s_6,s_7,s_8,...,"heartbeat_signals__fft_coefficient__attr_""angle""__coeff_6","heartbeat_signals__fft_coefficient__attr_""angle""__coeff_8","heartbeat_signals__fft_aggregated__aggtype_""centroid""","heartbeat_signals__fft_aggregated__aggtype_""variance""","heartbeat_signals__fft_aggregated__aggtype_""kurtosis""",heartbeat_signals__value_count__value_0,"heartbeat_signals__augmented_dickey_fuller__attr_""usedlag""__autolag_""AIC""",heartbeat_signals__number_crossing_m__m_0,heartbeat_signals__permutation_entropy__dimension_6__tau_1,heartbeat_signals__permutation_entropy__dimension_7__tau_1
0,100000.0,0.991699,1.000000,0.631836,0.136230,0.041412,0.102722,0.120850,0.123413,0.107910,...,103.190638,154.789155,24.495366,631.324530,6.619765,97.0,8.0,3.0,2.356864,2.587925
1,100001.0,0.607422,0.541504,0.340576,0.000000,0.090698,0.164917,0.195068,0.168823,0.198853,...,84.137325,173.766880,12.262886,419.784302,8.790172,1.0,5.0,2.0,4.656875,4.882383
2,100002.0,0.975098,0.670898,0.686523,0.708496,0.718750,0.716797,0.720703,0.701660,0.596680,...,-77.817824,-71.976591,21.197021,784.217329,4.706507,75.0,1.0,1.0,3.321028,3.516715
3,100003.0,0.995605,0.916992,0.520996,0.000000,0.221802,0.404053,0.490479,0.527344,0.518066,...,-77.842460,12.827032,21.249712,686.291574,5.252847,123.0,11.0,3.0,1.806294,1.979305
4,100004.0,1.000000,0.888184,0.745605,0.531738,0.380371,0.224609,0.091125,0.057648,0.003914,...,16.466894,31.166220,22.783216,700.874083,5.597247,91.0,15.0,5.0,2.960568,3.168085
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,119995.0,1.000000,0.833008,0.634277,0.639160,0.624023,0.598145,0.613770,0.624023,0.628906,...,-86.572686,-85.127448,21.318383,755.820492,4.619303,71.0,7.0,1.0,3.625398,3.843586
19996,119996.0,1.000000,0.826172,0.452148,0.082214,0.000000,0.137085,0.201050,0.165649,0.158081,...,-17.603452,-25.376913,23.354989,695.770289,5.519805,119.0,9.0,3.0,1.955659,2.081946
19997,119997.0,0.951660,0.916504,0.667480,0.352051,0.255371,0.197388,0.173584,0.141968,0.134521,...,-8.970786,-82.038373,22.138171,679.671776,5.573726,100.0,3.0,3.0,2.497097,2.663404
19998,119998.0,0.927734,0.677246,0.242920,0.055359,0.102112,0.072266,0.021011,0.038300,0.048553,...,12.651863,-76.415419,27.000844,718.096237,6.228419,105.0,6.0,3.0,2.912829,3.021449


In [40]:
test2['id']

0        100000.0
1        100001.0
2        100002.0
3        100003.0
4        100004.0
           ...   
19995    119995.0
19996    119996.0
19997    119997.0
19998    119998.0
19999    119999.0
Name: id, Length: 20000, dtype: float32

In [41]:
true_test_data.head()

Unnamed: 0,id,s_0,s_1,s_2,s_3,s_4,s_5,s_6,s_7,s_8,...,"heartbeat_signals__fft_coefficient__attr_""angle""__coeff_6","heartbeat_signals__fft_coefficient__attr_""angle""__coeff_8","heartbeat_signals__fft_aggregated__aggtype_""centroid""","heartbeat_signals__fft_aggregated__aggtype_""variance""","heartbeat_signals__fft_aggregated__aggtype_""kurtosis""",heartbeat_signals__value_count__value_0,"heartbeat_signals__augmented_dickey_fuller__attr_""usedlag""__autolag_""AIC""",heartbeat_signals__number_crossing_m__m_0,heartbeat_signals__permutation_entropy__dimension_6__tau_1,heartbeat_signals__permutation_entropy__dimension_7__tau_1
0,100000.0,0.991699,1.0,0.631836,0.13623,0.041412,0.102722,0.12085,0.123413,0.10791,...,103.190638,154.789155,24.495366,631.32453,6.619765,97.0,8.0,3.0,2.356864,2.587925
1,100001.0,0.607422,0.541504,0.340576,0.0,0.090698,0.164917,0.195068,0.168823,0.198853,...,84.137325,173.76688,12.262886,419.784302,8.790172,1.0,5.0,2.0,4.656875,4.882383
2,100002.0,0.975098,0.670898,0.686523,0.708496,0.71875,0.716797,0.720703,0.70166,0.59668,...,-77.817824,-71.976591,21.197021,784.217329,4.706507,75.0,1.0,1.0,3.321028,3.516715
3,100003.0,0.995605,0.916992,0.520996,0.0,0.221802,0.404053,0.490479,0.527344,0.518066,...,-77.84246,12.827032,21.249712,686.291574,5.252847,123.0,11.0,3.0,1.806294,1.979305
4,100004.0,1.0,0.888184,0.745605,0.531738,0.380371,0.224609,0.091125,0.057648,0.003914,...,16.466894,31.16622,22.783216,700.874083,5.597247,91.0,15.0,5.0,2.960568,3.168085


In [42]:
# Counter(true_train_label)

### 过采样 & 欠采样 & SMOTE

In [43]:
# over：过采样；under：欠采样；smote：SMOTE采样
def ods(data, method):
    data1 = data.drop(['id', 'label'], axis=1) 
    label = data['label']
    if method == 'over':
        ros = RandomOverSampler(random_state=2021)
        x_train, y_label = ros.fit_resample(data1 , label)
    elif method == 'under':
        rus = RandomUnderSampler(random_state=2021)
        x_train, y_label = rus.fit_resample(data1 , label)
    elif method == 'smote':
        smote = SMOTE(random_state=2021)
        x_train, y_label = smote.fit_resample(data1 , label)
    else:
        print("输入格式错误！！！")
    
    return x_train, y_label

In [44]:
# true_test_data = true_test_data.drop(['id'],axis=1)

In [45]:
# true_test_data = pd.concat([true_test_data,],axis=1)

In [46]:
x_train, y_label = ods(train2, 'smote')  # train2 是10万r*206columns

In [47]:
x_train

Unnamed: 0,s_0,s_1,s_2,s_3,s_4,s_5,s_6,s_7,s_8,s_9,...,"heartbeat_signals__fft_coefficient__attr_""angle""__coeff_6","heartbeat_signals__fft_coefficient__attr_""angle""__coeff_8","heartbeat_signals__fft_aggregated__aggtype_""centroid""","heartbeat_signals__fft_aggregated__aggtype_""variance""","heartbeat_signals__fft_aggregated__aggtype_""kurtosis""",heartbeat_signals__value_count__value_0,"heartbeat_signals__augmented_dickey_fuller__attr_""usedlag""__autolag_""AIC""",heartbeat_signals__number_crossing_m__m_0,heartbeat_signals__permutation_entropy__dimension_6__tau_1,heartbeat_signals__permutation_entropy__dimension_7__tau_1
0,0.991211,0.943359,0.764648,0.618652,0.379639,0.190796,0.040222,0.026001,0.031708,0.065552,...,70.470093,145.307714,20.048266,640.867764,5.659187,93.000000,15.000000,3.000000,2.500658,2.722686
1,0.971680,0.929199,0.572754,0.178467,0.122986,0.132324,0.094421,0.089600,0.030487,0.040497,...,13.357471,35.674928,24.557446,657.447951,6.283027,84.000000,4.000000,3.000000,3.065802,3.224835
2,1.000000,0.958984,0.701172,0.231812,0.000000,0.080688,0.128418,0.187500,0.280762,0.328369,...,-87.961613,-15.800108,25.010491,723.128806,5.731839,149.000000,0.000000,3.000000,1.406001,1.509478
3,0.975586,0.934082,0.659668,0.249878,0.237061,0.281494,0.249878,0.249878,0.241455,0.230713,...,92.320874,-0.300105,21.766021,672.013235,5.526573,61.000000,4.000000,3.000000,3.534354,3.854177
4,0.000000,0.055817,0.261230,0.359863,0.433105,0.453613,0.499023,0.542969,0.616699,0.676758,...,170.814310,-163.080746,8.833336,276.246018,11.708390,106.000000,15.000000,2.000000,2.165627,2.323993
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257303,0.959961,0.917969,0.858398,0.819824,0.738770,0.635254,0.517578,0.365723,0.227417,0.175293,...,-58.946267,-49.237680,15.188283,580.427020,6.339740,68.358069,1.000000,3.000000,2.401671,2.604898
257304,0.947754,0.910645,0.855469,0.808105,0.736816,0.650879,0.538574,0.400146,0.267334,0.196777,...,-43.383672,-63.527542,14.933410,569.751328,6.506247,69.000000,2.000000,1.000000,2.432703,2.614123
257305,1.000000,0.606934,0.618164,0.618164,0.624023,0.586914,0.597168,0.573730,0.470459,0.322754,...,-83.570006,-81.173967,21.957511,821.987131,4.587809,73.000000,2.014902,3.000000,3.502497,3.693988
257306,0.935547,0.602539,0.637207,0.634766,0.610840,0.595215,0.549805,0.429443,0.290283,0.072449,...,-73.432503,-51.321028,23.516316,843.863401,4.578814,76.203361,0.000000,3.406722,3.049860,3.345728


In [48]:
Counter(y_label)

Counter({0.0: 64327, 2.0: 64327, 3.0: 64327, 1.0: 64327})

### 评测函数

In [49]:
def abs_sum(y_pre,y_tru):
    y_pre=np.array(y_pre)
    y_tru=np.array(y_tru)
    loss=sum(sum(abs(y_pre-y_tru)))
    return loss

### 模型训练

In [50]:
def cv_model(clf, train_x, train_y, test_x, clf_name):
    folds = 10000
    seed = 2021
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    
    #设置测试集，输出矩阵。每一组数据输出：[0,0,0,0]以概率值填入
    test = np.zeros((test_x.shape[0],4))
    
    #交叉验证分数
    cv_scores = []
    onehot_encoder = OneHotEncoder(sparse=False)
    
    #将训练集「K折」操作，i值代表第（i+1）折。每一个K折都进行「数据混乱：随机」操作
    #train_index：用于训练的（K-1）的样本索引值
    #valid_index：剩下1折样本索引值，用于给出「训练误差」
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        if i < 1:
            #打印第（i+1）折个模型结果
            print('************************************ {} ************************************'.format(str(i+1)))
            
            #将训练集分为：真正训练的数据（K-1折），和 训练集中的测试数据（1折）
            trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]
            
            
            #LGB模型
            if clf_name == "lgb":
                
                #训练样本
                train_matrix = clf.Dataset(trn_x, label=trn_y)
                #训练集中测试样本
                valid_matrix = clf.Dataset(val_x, label=val_y)
                
                #参数设置
                params = {
                            'boosting_type': 'gbdt',          #boosting方式
                            'objective': 'multiclass',        #任务类型为「多分类」
                            'num_class': 4,                   #类别个数
                            'num_leaves': 2 ** 6,             #最大的叶子数
                            'feature_fraction': 0.9,          #原来是0.8
                            'bagging_fraction': 0.9,          #原来是0.8
                            'bagging_freq': 5,                #每5次迭代，进行一次bagging
                            'learning_rate': 0.01,            #学习效率：原来是0.1
                            'seed': seed,                     #seed值，保证模型复现
                            'nthread': 28,                    #
                            'n_jobs':24,                      #多线程
                            'verbose': 1,
                            'min_data_in_leaf':10,           #叶子可能具有的最小记录数
                            'verbose': -1,

                            
                }
              
                #模型
                model = clf.train(params, train_set=train_matrix, num_boost_round=10000, valid_sets=valid_matrix, 
                      verbose_eval=100, early_stopping_rounds=500,) #feval=f1_score_vali,

        
                val_pred = model.predict(val_x, num_iteration=model.best_iteration)
                test_pred = model.predict(test_x, num_iteration=model.best_iteration) 

            val_y = np.array(val_y).reshape(-1, 1)
            val_y = onehot_encoder.fit_transform(val_y)
            print('预测的概率矩阵为：')
            print(test_pred)
            
            #将预测结果填入到test里面，这是一个「i个模型结果累加过程」
            test += test_pred

            #评测公式
            score = abs_sum(val_y, val_pred)
            cv_scores.append(score)
            print(cv_scores)
        
    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    
    #下面公式是什么含义呢？为啥要除以「K折数」？：i个模型输出结果的平均值。
    test = test / 1

    return test

In [51]:
def lgb_model(x_train, y_train, x_test):
    lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb")
    return lgb_test

In [52]:
lgb_test = lgb_model(x_train, y_label, true_test_data)

************************************ 1 ************************************
Training until validation scores don't improve for 500 rounds.
[100]	valid_0's multi_logloss: 0.64217
[200]	valid_0's multi_logloss: 0.3704
[300]	valid_0's multi_logloss: 0.249891
[400]	valid_0's multi_logloss: 0.164146
[500]	valid_0's multi_logloss: 0.107895
[600]	valid_0's multi_logloss: 0.0715412
[700]	valid_0's multi_logloss: 0.0474328
[800]	valid_0's multi_logloss: 0.032811
[900]	valid_0's multi_logloss: 0.0237277
[1000]	valid_0's multi_logloss: 0.0178825
[1100]	valid_0's multi_logloss: 0.0142131
[1200]	valid_0's multi_logloss: 0.0114233
[1300]	valid_0's multi_logloss: 0.00911815
[1400]	valid_0's multi_logloss: 0.00754864
[1500]	valid_0's multi_logloss: 0.00635329
[1600]	valid_0's multi_logloss: 0.00542671
[1700]	valid_0's multi_logloss: 0.00457638
[1800]	valid_0's multi_logloss: 0.00377002
[1900]	valid_0's multi_logloss: 0.00310929
[2000]	valid_0's multi_logloss: 0.00261227
[2100]	valid_0's multi_logloss:

In [53]:
y_label2 = lgb_test

### One-Hot编码

In [54]:
onehot_encoder = OneHotEncoder(sparse=False)
y_label2 = np.array(label2).reshape(-1, 1)
train2_test_label = onehot_encoder.fit_transform(y_label2)

NameError: name 'label2' is not defined

### 评测函数值

In [None]:
abs_sum(lgb_test, train2_test_label)

### 学习赛预测结果

### 保存数据

In [57]:
result = pd.DataFrame(lgb_test)
result

Unnamed: 0,0,1,2,3
0,0.998252,2.104354e-07,0.001706,4.183628e-05
1,0.386242,7.297118e-02,0.540786,4.869948e-07
2,0.257968,2.220511e-04,0.739561,2.248778e-03
3,0.999471,1.529575e-10,0.000529,5.719669e-08
4,0.999939,3.343197e-07,0.000061,2.960921e-09
...,...,...,...,...
19995,0.999994,2.375997e-09,0.000006,2.448188e-08
19996,0.641411,2.342257e-07,0.358589,7.092015e-08
19997,0.000065,3.452424e-09,0.999935,3.169947e-11
19998,0.999972,1.974439e-06,0.000026,7.392556e-08


In [58]:
r=pd.read_csv('./sample_submit.csv')
r['label_0']=result[0]
r['label_1']=result[1]
r['label_2']=result[2]
r['label_3']=result[3]
r.to_csv('./submit33.csv',index=False)

In [59]:
for index,row in r.iterrows():
    row_max = max(list(row)[1:])
    if row_max > 0.6:
        for i in range(1,5):
            if row[i] > 0.6:
                r.iloc[index,i]=1
            else:
                r.iloc[index,i]=0

In [60]:
r.to_csv('./submit33_youhua.csv',index=False)