# Task3 特征工程

此部分为零基础入门数据挖掘-心跳信号分类预测的 Task3 特征工程部分，带你来了解时间序列特征工程以及分析方法，欢迎大家后续多多交流。

赛题：零基础入门数据挖掘-心跳信号分类预测

项目地址：
比赛地址：

## 3.1 学习目标

* 学习时间序列数据的特征预处理方法
* 学习时间序列特征处理工具 Tsfresh（TimeSeries Fresh）的使用

## 3.2 内容介绍
* 数据预处理
	* 时间序列数据格式处理
	* 加入时间步特征time
* 特征工程
	* 时间序列特征构造
	* 特征筛选
	* 使用 tsfresh 进行时间序列特征处理

## 3.3 代码示例

### 3.3.1 导入包并读取数据

In [1]:
import pandas as pd
import numpy as np
import tsfresh as tsf
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute

In [2]:
# Load the training set and test set A from CSV files in the working directory.
data_train = pd.read_csv("train.csv")
data_test_A = pd.read_csv("testA.csv")

# Show (rows, columns) of both frames as a quick sanity check, one per line.
print(data_train.shape, data_test_A.shape, sep="\n")

(100000, 3)
(20000, 2)


In [3]:
data_train.head()

Unnamed: 0,id,heartbeat_signals,label
0,0,"0.9912297987616655,0.9435330436439665,0.764677...",0.0
1,1,"0.9714822034884503,0.9289687459588268,0.572932...",0.0
2,2,"1.0,0.9591487564065292,0.7013782792997189,0.23...",2.0
3,3,"0.9757952826275774,0.9340884687738161,0.659636...",0.0
4,4,"0.0,0.055816398940721094,0.26129357194994196,0...",2.0


In [4]:
data_test_A

Unnamed: 0,id,heartbeat_signals
0,100000,"0.9915713654170097,1.0,0.6318163407681274,0.13..."
1,100001,"0.6075533139615096,0.5417083883163654,0.340694..."
2,100002,"0.9752726292239277,0.6710965234906665,0.686758..."
3,100003,"0.9956348033996116,0.9170249621481004,0.521096..."
4,100004,"1.0,0.8879490481178918,0.745564725322326,0.531..."
...,...,...
19995,119995,"1.0,0.8330283177934747,0.6340472606311671,0.63..."
19996,119996,"1.0,0.8259705825857048,0.4521053488322387,0.08..."
19997,119997,"0.951744840752379,0.9162611283848351,0.6675251..."
19998,119998,"0.9276692903808186,0.6771898159607004,0.242906..."


In [5]:
# Reshape the comma-separated signal strings from wide to long format: one row
# per (sample, time step). The position of each value inside its signal becomes
# the `time` column, and the original row number is kept as the index.
train_heartbeat_df = (
    data_train["heartbeat_signals"]
    .str.split(",", expand=True)
    .stack()
    .reset_index()
    .set_index("level_0")
    .rename(columns={"level_1": "time", 0: "heartbeat_signals"})
    .astype({"heartbeat_signals": float})
)
train_heartbeat_df.index.name = None

train_heartbeat_df

Unnamed: 0,time,heartbeat_signals
0,0,0.991230
0,1,0.943533
0,2,0.764677
0,3,0.618571
0,4,0.379632
...,...,...
99999,200,0.000000
99999,201,0.000000
99999,202,0.000000
99999,203,0.000000


In [6]:
# Keep the labels aside, then replace the raw signal string with its long-format
# (time, heartbeat_signals) rows; the join matches rows on the index.
data_train_label = data_train["label"]
data_train = (
    data_train
    .drop(columns=["label", "heartbeat_signals"])
    .join(train_heartbeat_df)
)

data_train

Unnamed: 0,id,time,heartbeat_signals
0,0,0,0.991230
0,0,1,0.943533
0,0,2,0.764677
0,0,3,0.618571
0,0,4,0.379632
...,...,...,...
99999,99999,200,0.000000
99999,99999,201,0.000000
99999,99999,202,0.000000
99999,99999,203,0.000000


In [7]:
# NOTE(review): MinimalFCParameters is imported here but not used in this cell.
from tsfresh.feature_extraction import extract_features, MinimalFCParameters

# Feature extraction: build tsfresh features per `id`, ordering each series by
# its `time` column. No feature settings are passed, so tsfresh's defaults apply;
# the recorded run of this cell took about 41 minutes.
train_features = extract_features(data_train, column_id='id', column_sort='time')
train_features

Feature Extraction: 100%|█████████████████████████████████████████████████████████████| 15/15 [41:20<00:00, 165.34s/it]


Unnamed: 0,heartbeat_signals__variance_larger_than_standard_deviation,heartbeat_signals__has_duplicate_max,heartbeat_signals__has_duplicate_min,heartbeat_signals__has_duplicate,heartbeat_signals__sum_values,heartbeat_signals__abs_energy,heartbeat_signals__mean_abs_change,heartbeat_signals__mean_change,heartbeat_signals__mean_second_derivative_central,heartbeat_signals__median,...,heartbeat_signals__permutation_entropy__dimension_5__tau_1,heartbeat_signals__permutation_entropy__dimension_6__tau_1,heartbeat_signals__permutation_entropy__dimension_7__tau_1,heartbeat_signals__query_similarity_count__query_None__threshold_0.0,"heartbeat_signals__matrix_profile__feature_""min""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""max""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""mean""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""median""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""25""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""75""__threshold_0.98"
0,0.0,0.0,1.0,1.0,38.927945,18.216197,0.019894,-0.004859,0.000117,0.125531,...,2.184420,2.500658,2.722686,,6.445546,12.165525,10.246524,10.746992,8.388625,11.484910
1,0.0,0.0,1.0,1.0,19.445634,7.705092,0.019952,-0.004762,0.000105,0.030481,...,2.710933,3.065802,3.224835,,3.209140,12.649111,9.031069,9.437545,6.723180,12.094899
2,0.0,0.0,1.0,1.0,21.192974,9.140423,0.009863,-0.004902,0.000101,0.000000,...,1.263370,1.406001,1.509478,,3.054539,8.246211,7.370478,8.246211,5.966122,8.246211
3,0.0,0.0,1.0,1.0,42.113066,15.757623,0.018743,-0.004783,0.000103,0.241397,...,2.986728,3.534354,3.854177,,3.010557,9.797959,6.331360,6.406440,5.266743,7.091706
4,0.0,0.0,1.0,1.0,69.756786,51.229616,0.014514,0.000000,-0.000137,0.000000,...,1.914511,2.165627,2.323993,,9.181236,13.429784,9.959913,9.516290,9.286013,10.270925
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0.0,0.0,1.0,1.0,63.323449,28.742238,0.023588,-0.004902,0.000794,0.388402,...,2.873602,3.391830,3.679969,,2.436377,9.591663,5.635231,6.366205,3.596982,7.033638
99996,0.0,0.0,1.0,1.0,69.657534,31.866323,0.017373,-0.004543,0.000051,0.421138,...,3.085504,3.728881,4.095457,,1.415410,7.483315,2.893592,2.684349,2.049241,3.334109
99997,0.0,0.0,1.0,1.0,40.897057,16.412857,0.019470,-0.004538,0.000834,0.213306,...,2.601062,2.996962,3.293562,,5.748652,12.165525,8.524637,7.983410,7.062217,10.081756
99998,0.0,0.0,1.0,1.0,42.333303,14.281281,0.017032,-0.004902,0.000013,0.264974,...,3.236950,3.793512,4.018302,,2.346822,8.246211,4.951374,4.727535,4.069786,5.615282


In [8]:
train_features.shape

(100000, 787)

In [9]:
from tsfresh.utilities.dataframe_functions import impute

# Remove the NaN values from the extracted features. The return value is not
# assigned; later cells keep using `train_features` itself.
# NOTE(review): this relies on impute() modifying the frame in place — confirm
# against the tsfresh documentation.
impute(train_features)



Unnamed: 0,heartbeat_signals__variance_larger_than_standard_deviation,heartbeat_signals__has_duplicate_max,heartbeat_signals__has_duplicate_min,heartbeat_signals__has_duplicate,heartbeat_signals__sum_values,heartbeat_signals__abs_energy,heartbeat_signals__mean_abs_change,heartbeat_signals__mean_change,heartbeat_signals__mean_second_derivative_central,heartbeat_signals__median,...,heartbeat_signals__permutation_entropy__dimension_5__tau_1,heartbeat_signals__permutation_entropy__dimension_6__tau_1,heartbeat_signals__permutation_entropy__dimension_7__tau_1,heartbeat_signals__query_similarity_count__query_None__threshold_0.0,"heartbeat_signals__matrix_profile__feature_""min""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""max""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""mean""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""median""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""25""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""75""__threshold_0.98"
0,0.0,0.0,1.0,1.0,38.927945,18.216197,0.019894,-0.004859,0.000117,0.125531,...,2.184420,2.500658,2.722686,0.0,6.445546,12.165525,10.246524,10.746992,8.388625,11.484910
1,0.0,0.0,1.0,1.0,19.445634,7.705092,0.019952,-0.004762,0.000105,0.030481,...,2.710933,3.065802,3.224835,0.0,3.209140,12.649111,9.031069,9.437545,6.723180,12.094899
2,0.0,0.0,1.0,1.0,21.192974,9.140423,0.009863,-0.004902,0.000101,0.000000,...,1.263370,1.406001,1.509478,0.0,3.054539,8.246211,7.370478,8.246211,5.966122,8.246211
3,0.0,0.0,1.0,1.0,42.113066,15.757623,0.018743,-0.004783,0.000103,0.241397,...,2.986728,3.534354,3.854177,0.0,3.010557,9.797959,6.331360,6.406440,5.266743,7.091706
4,0.0,0.0,1.0,1.0,69.756786,51.229616,0.014514,0.000000,-0.000137,0.000000,...,1.914511,2.165627,2.323993,0.0,9.181236,13.429784,9.959913,9.516290,9.286013,10.270925
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0.0,0.0,1.0,1.0,63.323449,28.742238,0.023588,-0.004902,0.000794,0.388402,...,2.873602,3.391830,3.679969,0.0,2.436377,9.591663,5.635231,6.366205,3.596982,7.033638
99996,0.0,0.0,1.0,1.0,69.657534,31.866323,0.017373,-0.004543,0.000051,0.421138,...,3.085504,3.728881,4.095457,0.0,1.415410,7.483315,2.893592,2.684349,2.049241,3.334109
99997,0.0,0.0,1.0,1.0,40.897057,16.412857,0.019470,-0.004538,0.000834,0.213306,...,2.601062,2.996962,3.293562,0.0,5.748652,12.165525,8.524637,7.983410,7.062217,10.081756
99998,0.0,0.0,1.0,1.0,42.333303,14.281281,0.017032,-0.004902,0.000013,0.264974,...,3.236950,3.793512,4.018302,0.0,2.346822,8.246211,4.951374,4.727535,4.069786,5.615282


In [10]:
from tsfresh import select_features

# Select features according to their relevance to the label.
# NOTE(review): `label` is float-typed (0.0, 2.0, ... in the head() output of
# data_train) although it encodes classes; check how select_features' default
# task inference treats a float target, and consider passing the task
# explicitly — TODO confirm against the tsfresh documentation.
train_features_filtered = select_features(train_features, data_train_label)

train_features_filtered

Unnamed: 0,heartbeat_signals__sum_values,"heartbeat_signals__fft_coefficient__attr_""abs""__coeff_38","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_37","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_36","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_35","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_34","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_33","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_32","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_31","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_30",...,"heartbeat_signals__fft_coefficient__attr_""abs""__coeff_84","heartbeat_signals__fft_coefficient__attr_""imag""__coeff_97","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_90","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_94","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_92","heartbeat_signals__fft_coefficient__attr_""real""__coeff_97","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_75","heartbeat_signals__fft_coefficient__attr_""real""__coeff_88","heartbeat_signals__fft_coefficient__attr_""real""__coeff_92","heartbeat_signals__fft_coefficient__attr_""real""__coeff_83"
0,38.927945,0.660949,1.090709,0.848728,1.168685,0.982133,1.223496,1.236300,1.104172,1.497129,...,0.531883,-0.047438,0.554370,0.307586,0.564596,0.562960,0.591859,0.504124,0.528450,0.473568
1,19.445634,1.718217,1.280923,1.850706,1.460752,1.924501,1.925485,1.715938,2.079957,1.818636,...,0.563590,-0.109579,0.697446,0.398073,0.640969,0.270192,0.224925,0.645082,0.635135,0.297325
2,21.192974,1.814281,1.619051,1.215343,1.787166,2.146987,1.686190,1.540137,2.291031,2.403422,...,0.712487,-0.074042,0.321703,0.390386,0.716929,0.316524,0.422077,0.722742,0.680590,0.383754
3,42.113066,2.109550,0.619634,2.366413,2.071539,1.000340,2.728281,1.391727,2.017176,2.610492,...,0.601499,-0.184248,0.564669,0.623353,0.466980,0.651774,0.308915,0.550097,0.466904,0.494024
4,69.756786,0.194549,0.348882,0.092119,0.653924,0.231422,1.080003,0.711244,1.357904,1.237998,...,0.015292,0.070505,0.065835,0.051780,0.092940,0.103773,0.179405,-0.089611,0.091841,0.056867
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,63.323449,0.840651,1.186210,1.396236,0.417221,2.036034,1.659054,0.500584,1.693545,0.859932,...,0.779955,0.005525,0.486013,0.273372,0.705386,0.602898,0.447929,0.474844,0.564266,0.133969
99996,69.657534,1.557787,1.393960,0.989147,1.611333,1.793044,1.092325,0.507138,1.763940,2.677643,...,0.539489,0.114670,0.579498,0.417226,0.270110,0.556596,0.703258,0.462312,0.269719,0.539236
99997,40.897057,0.469758,1.000355,0.706395,1.190514,0.674603,1.632769,0.229008,2.027802,0.302457,...,0.282597,-0.474629,0.460647,0.478341,0.527891,0.904111,0.728529,0.178410,0.500813,0.773985
99998,42.333303,0.992948,1.354894,2.238589,1.237608,1.325212,2.785515,1.918571,0.814167,2.613950,...,0.594252,-0.162106,0.694276,0.681025,0.357196,0.498088,0.433297,0.406154,0.324771,0.340727


In [12]:
train_features_filtered.to_csv('tsfresh_train.csv', index=False)

In [20]:
features = train_features_filtered.columns.to_list()

In [13]:
# Reshape the comma-separated test signals from wide to long format: one row per
# (sample, time step). The position of each value inside its signal becomes the
# `time` column, and the original row number is kept as the index.
test_heartbeat_df = (
    data_test_A["heartbeat_signals"]
    .str.split(",", expand=True)
    .stack()
    .reset_index()
    .set_index("level_0")
    .rename(columns={"level_1": "time", 0: "heartbeat_signals"})
    .astype({"heartbeat_signals": float})
)
test_heartbeat_df.index.name = None

test_heartbeat_df

Unnamed: 0,time,heartbeat_signals
0,0,0.991571
0,1,1.000000
0,2,0.631816
0,3,0.136230
0,4,0.041420
...,...,...
19999,200,0.000000
19999,201,0.000000
19999,202,0.000000
19999,203,0.000000


In [15]:
# Replace the raw signal string with its long-format (time, heartbeat_signals)
# rows; the join matches rows on the index.
data_test_A = (
    data_test_A
    .drop(columns="heartbeat_signals")
    .join(test_heartbeat_df)
)

data_test_A

Unnamed: 0,id,time,heartbeat_signals
0,100000,0,0.991571
0,100000,1,1.000000
0,100000,2,0.631816
0,100000,3,0.136230
0,100000,4,0.041420
...,...,...,...
19999,119999,200,0.000000
19999,119999,201,0.000000
19999,119999,202,0.000000
19999,119999,203,0.000000


In [16]:
# NOTE(review): MinimalFCParameters is imported here but not used in this cell.
from tsfresh.feature_extraction import extract_features, MinimalFCParameters

# Feature extraction for test set A.
# NOTE(review): the result is stored under the name `train_features`, which
# overwrites the training features computed earlier; the following cells read
# the test features through this name, so renaming it requires updating them too.
train_features = extract_features(data_test_A, column_id='id', column_sort='time')
train_features

Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 15/15 [08:21<00:00, 33.44s/it]


Unnamed: 0,heartbeat_signals__variance_larger_than_standard_deviation,heartbeat_signals__has_duplicate_max,heartbeat_signals__has_duplicate_min,heartbeat_signals__has_duplicate,heartbeat_signals__sum_values,heartbeat_signals__abs_energy,heartbeat_signals__mean_abs_change,heartbeat_signals__mean_change,heartbeat_signals__mean_second_derivative_central,heartbeat_signals__median,...,heartbeat_signals__permutation_entropy__dimension_5__tau_1,heartbeat_signals__permutation_entropy__dimension_6__tau_1,heartbeat_signals__permutation_entropy__dimension_7__tau_1,heartbeat_signals__query_similarity_count__query_None__threshold_0.0,"heartbeat_signals__matrix_profile__feature_""min""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""max""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""mean""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""median""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""25""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""75""__threshold_0.98"
100000,0.0,0.0,1.0,1.0,19.229863,7.907934,0.018374,-0.004861,-0.000021,0.027745,...,2.021451,2.356864,2.587925,,2.692281,11.661904,9.456542,9.648131,8.525878,11.076665
100001,0.0,0.0,0.0,1.0,84.298932,38.292802,0.021483,-0.001195,0.000195,0.367241,...,4.099123,4.656875,4.882383,,0.939893,4.470801,2.584921,2.493456,1.542105,3.698142
100002,0.0,0.0,1.0,1.0,47.789921,21.287039,0.021610,-0.004781,0.000749,0.260611,...,2.900488,3.321028,3.516715,,5.684175,12.512693,9.751129,10.483467,8.221934,11.135462
100003,0.0,0.0,1.0,1.0,47.069011,28.749520,0.023874,-0.004881,0.000194,0.000000,...,1.530558,1.806294,1.979305,,0.909721,4.898979,3.531943,4.898979,1.675526,4.898979
100004,0.0,0.0,1.0,1.0,24.899397,10.177998,0.020548,-0.004902,0.000276,0.034859,...,2.626554,2.960568,3.168085,,5.033722,14.180087,11.450599,11.991037,11.825491,12.597792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119995,0.0,0.0,1.0,1.0,43.175130,18.967833,0.016106,-0.004902,0.000411,0.205399,...,3.150910,3.625398,3.843586,,3.687770,8.700294,5.991330,6.323450,4.155558,7.191577
119996,0.0,0.0,1.0,1.0,31.030782,14.413244,0.021473,-0.004902,0.000429,0.000000,...,1.732287,1.955659,2.081946,,10.456465,12.982197,11.338307,11.244766,10.763332,11.762948
119997,0.0,0.0,1.0,1.0,31.648623,13.083992,0.017566,-0.004665,0.000087,0.010807,...,2.248241,2.497097,2.663404,,6.037870,11.661904,9.312119,8.973721,8.064338,10.409977
119998,0.0,0.0,1.0,1.0,19.305442,6.700835,0.019937,-0.004547,0.000617,0.000000,...,2.538456,2.912829,3.021449,,10.350940,15.065584,12.961223,12.887409,12.118259,13.558463


In [17]:
from tsfresh.utilities.dataframe_functions import impute

# Remove the NaN values from the extracted features (at this point
# `train_features` holds the test-set features). The return value is not assigned.
# NOTE(review): this relies on impute() modifying the frame in place — confirm
# against the tsfresh documentation.
impute(train_features)



Unnamed: 0,heartbeat_signals__variance_larger_than_standard_deviation,heartbeat_signals__has_duplicate_max,heartbeat_signals__has_duplicate_min,heartbeat_signals__has_duplicate,heartbeat_signals__sum_values,heartbeat_signals__abs_energy,heartbeat_signals__mean_abs_change,heartbeat_signals__mean_change,heartbeat_signals__mean_second_derivative_central,heartbeat_signals__median,...,heartbeat_signals__permutation_entropy__dimension_5__tau_1,heartbeat_signals__permutation_entropy__dimension_6__tau_1,heartbeat_signals__permutation_entropy__dimension_7__tau_1,heartbeat_signals__query_similarity_count__query_None__threshold_0.0,"heartbeat_signals__matrix_profile__feature_""min""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""max""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""mean""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""median""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""25""__threshold_0.98","heartbeat_signals__matrix_profile__feature_""75""__threshold_0.98"
100000,0.0,0.0,1.0,1.0,19.229863,7.907934,0.018374,-0.004861,-0.000021,0.027745,...,2.021451,2.356864,2.587925,0.0,2.692281,11.661904,9.456542,9.648131,8.525878,11.076665
100001,0.0,0.0,0.0,1.0,84.298932,38.292802,0.021483,-0.001195,0.000195,0.367241,...,4.099123,4.656875,4.882383,0.0,0.939893,4.470801,2.584921,2.493456,1.542105,3.698142
100002,0.0,0.0,1.0,1.0,47.789921,21.287039,0.021610,-0.004781,0.000749,0.260611,...,2.900488,3.321028,3.516715,0.0,5.684175,12.512693,9.751129,10.483467,8.221934,11.135462
100003,0.0,0.0,1.0,1.0,47.069011,28.749520,0.023874,-0.004881,0.000194,0.000000,...,1.530558,1.806294,1.979305,0.0,0.909721,4.898979,3.531943,4.898979,1.675526,4.898979
100004,0.0,0.0,1.0,1.0,24.899397,10.177998,0.020548,-0.004902,0.000276,0.034859,...,2.626554,2.960568,3.168085,0.0,5.033722,14.180087,11.450599,11.991037,11.825491,12.597792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119995,0.0,0.0,1.0,1.0,43.175130,18.967833,0.016106,-0.004902,0.000411,0.205399,...,3.150910,3.625398,3.843586,0.0,3.687770,8.700294,5.991330,6.323450,4.155558,7.191577
119996,0.0,0.0,1.0,1.0,31.030782,14.413244,0.021473,-0.004902,0.000429,0.000000,...,1.732287,1.955659,2.081946,0.0,10.456465,12.982197,11.338307,11.244766,10.763332,11.762948
119997,0.0,0.0,1.0,1.0,31.648623,13.083992,0.017566,-0.004665,0.000087,0.010807,...,2.248241,2.497097,2.663404,0.0,6.037870,11.661904,9.312119,8.973721,8.064338,10.409977
119998,0.0,0.0,1.0,1.0,19.305442,6.700835,0.019937,-0.004547,0.000617,0.000000,...,2.538456,2.912829,3.021449,0.0,10.350940,15.065584,12.961223,12.887409,12.118259,13.558463


In [22]:
# Keep only the feature columns that were selected on the training set;
# `train_features` holds the test-set features at this point.
tsfresh_test = train_features[features]

In [23]:
tsfresh_test.to_csv('tsfresh_test.csv', index=False)

In [25]:
tsfresh_test

Unnamed: 0,heartbeat_signals__sum_values,"heartbeat_signals__fft_coefficient__attr_""abs""__coeff_38","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_37","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_36","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_35","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_34","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_33","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_32","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_31","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_30",...,"heartbeat_signals__fft_coefficient__attr_""abs""__coeff_84","heartbeat_signals__fft_coefficient__attr_""imag""__coeff_97","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_90","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_94","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_92","heartbeat_signals__fft_coefficient__attr_""real""__coeff_97","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_75","heartbeat_signals__fft_coefficient__attr_""real""__coeff_88","heartbeat_signals__fft_coefficient__attr_""real""__coeff_92","heartbeat_signals__fft_coefficient__attr_""real""__coeff_83"
100000,19.229863,2.381214,0.832151,2.509869,1.082112,2.517858,1.656104,2.257162,2.213421,1.815374,...,0.563470,-0.040576,0.485441,0.472059,0.448018,0.449347,0.479950,0.480448,0.442279,0.355992
100001,84.298932,0.987660,0.856174,0.616261,0.293339,0.191558,0.528684,1.010080,1.478182,1.713876,...,0.037307,0.010074,0.272897,0.247538,0.286948,0.143829,0.189416,0.124293,0.154624,0.077530
100002,47.789921,0.696393,1.165387,1.004378,0.951231,1.542114,0.946219,1.673430,1.445220,1.118439,...,0.738423,-0.159505,0.418298,0.566628,0.849684,0.950851,0.779324,0.439255,0.839315,0.454957
100003,47.069011,3.137668,0.044897,3.392946,3.054217,0.726293,3.582653,2.414946,1.257669,3.188068,...,0.273142,0.366949,0.891690,0.214585,0.927562,0.648872,0.730178,0.606528,0.830105,0.662320
100004,24.899397,0.496010,1.401020,0.536501,1.712592,1.044629,1.533405,1.330258,1.251771,1.441028,...,0.644046,-0.129700,0.578560,0.783258,0.480598,0.485003,0.667111,0.594234,0.447980,0.511133
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119995,43.175130,1.776937,0.211527,1.986940,0.393550,1.693620,1.139395,1.459990,1.734535,1.025180,...,0.546742,-0.060254,0.507950,0.560192,0.541534,0.249750,0.608796,0.455444,0.535306,0.268471
119996,31.030782,1.451045,2.483726,1.105440,1.979721,2.821799,0.475276,2.782573,2.827882,0.520034,...,0.491662,0.016413,0.480380,0.459172,0.363756,0.427028,0.544692,0.754834,0.361866,0.536087
119997,31.648623,2.141301,0.546706,2.340499,1.362651,1.942634,2.043679,0.994065,2.248144,1.007128,...,0.529880,0.001012,0.768960,0.834159,0.672114,0.520215,0.341519,0.713419,0.664354,0.370047
119998,19.305442,0.221708,2.355288,1.051282,1.742370,2.164058,0.435583,2.649994,1.190594,2.328580,...,0.527500,-0.103574,0.521222,0.426435,0.636887,0.446365,0.551442,0.503703,0.635246,0.258394


In [26]:
import os
import gc
import math

import pandas as pd
import numpy as np

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge
from sklearn.preprocessing import MinMaxScaler


from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from tqdm import tqdm
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings('ignore')

In [27]:
def abs_sum(y_pre,y_tru):
    """Return the total absolute error between predictions and targets.

    Args:
        y_pre: Array-like of predicted values.
        y_tru: Array-like of true values, broadcastable against ``y_pre``.

    Returns:
        The sum of ``|y_pre - y_tru|`` over every element, as a NumPy scalar.
    """
    y_pre = np.asarray(y_pre)
    y_tru = np.asarray(y_tru)
    # Reduce over all axes inside NumPy. Nesting the builtin ``sum`` only worked
    # for exactly 2-D input (it fails on 1-D) and iterated in Python.
    return np.abs(y_pre - y_tru).sum()
    

In [28]:
def cv_model(clf, train_x, train_y, test_x, clf_name):
    """Train a 5-fold cross-validated multiclass model and average its test predictions.

    Args:
        clf: Model library module; for ``clf_name == "lgb"`` it must expose
            ``Dataset`` and ``train`` (the notebook passes ``lightgbm``).
        train_x: Training features as a pandas DataFrame (rows are selected
            with ``.iloc``).
        train_y: Training labels, one per row of ``train_x``. Labels are
            expected to be the class ids ``0 .. 3``.
        test_x: Test features with the same columns as ``train_x``.
        clf_name: Name of the model to train; only ``"lgb"`` is implemented.

    Returns:
        Array of shape ``(test_x.shape[0], 4)`` with the class probabilities
        averaged over the folds.

    Raises:
        ValueError: If ``clf_name`` is not a supported model name.
    """
    if clf_name != "lgb":
        # Fail early with a clear message; without this guard the function
        # would hit an undefined `test_pred` inside the first fold.
        raise ValueError("Unsupported clf_name: %r (only 'lgb' is implemented)" % (clf_name,))

    folds = 5
    seed = 2021
    num_class = 4
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    test = np.zeros((test_x.shape[0], num_class))

    # KFold yields positional indices, so index the labels as a plain array.
    # Indexing a Series with `[]` is label-based and only matches positions
    # when the Series carries a default 0..n-1 index.
    labels = np.asarray(train_y).ravel()
    class_ids = np.arange(num_class)

    cv_scores = []
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, labels)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = labels[train_index], labels[valid_index]

        train_matrix = clf.Dataset(trn_x, label=trn_y)
        valid_matrix = clf.Dataset(val_x, label=val_y)

        # NOTE(review): 'nthread' and 'n_jobs' appear to both set the thread
        # count, with different values — confirm which one LightGBM honours
        # and keep only one.
        params = {
            'boosting_type': 'gbdt',
            'objective': 'multiclass',
            'num_class': num_class,
            'num_leaves': 2 ** 5,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'bagging_freq': 4,
            'learning_rate': 0.1,
            'seed': seed,
            'nthread': 28,
            'n_jobs':24,
            'verbose': -1,
        }

        # NOTE(review): `verbose_eval` and `early_stopping_rounds` are
        # version-dependent keyword arguments of lightgbm.train; newer releases
        # configure both through `callbacks` — confirm against the installed
        # LightGBM version before upgrading.
        model = clf.train(params,
                  train_set=train_matrix,
                  valid_sets=valid_matrix,
                  num_boost_round=2000,
                  verbose_eval=100,
                  early_stopping_rounds=200)
        val_pred = model.predict(val_x, num_iteration=model.best_iteration)
        test_pred = model.predict(test_x, num_iteration=model.best_iteration)

        # One-hot encode against the fixed class ids so every fold yields
        # `num_class` columns aligned with the prediction columns, even when a
        # class happens to be absent from the validation fold.
        val_onehot = (val_y.reshape(-1, 1) == class_ids).astype(float)
        print('预测的概率矩阵为：')
        print(test_pred)
        test += test_pred
        score = abs_sum(val_onehot, val_pred)
        cv_scores.append(score)
        print(cv_scores)
    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))

    # Average the accumulated per-fold test probabilities.
    test = test / kf.n_splits

    return test

In [29]:
def lgb_model(x_train, y_train, x_test):
    """Run the cross-validated LightGBM pipeline and return its test predictions.

    Thin wrapper that forwards its arguments to ``cv_model`` together with the
    ``lgb`` module and the model name ``"lgb"``.
    """
    return cv_model(lgb, x_train, y_train, x_test, "lgb")

In [35]:
tsfresh_test

Unnamed: 0,heartbeat_signals__sum_values,"heartbeat_signals__fft_coefficient__attr_""abs""__coeff_38","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_37","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_36","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_35","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_34","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_33","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_32","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_31","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_30",...,"heartbeat_signals__fft_coefficient__attr_""abs""__coeff_84","heartbeat_signals__fft_coefficient__attr_""imag""__coeff_97","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_90","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_94","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_92","heartbeat_signals__fft_coefficient__attr_""real""__coeff_97","heartbeat_signals__fft_coefficient__attr_""abs""__coeff_75","heartbeat_signals__fft_coefficient__attr_""real""__coeff_88","heartbeat_signals__fft_coefficient__attr_""real""__coeff_92","heartbeat_signals__fft_coefficient__attr_""real""__coeff_83"
100000,19.229863,2.381214,0.832151,2.509869,1.082112,2.517858,1.656104,2.257162,2.213421,1.815374,...,0.563470,-0.040576,0.485441,0.472059,0.448018,0.449347,0.479950,0.480448,0.442279,0.355992
100001,84.298932,0.987660,0.856174,0.616261,0.293339,0.191558,0.528684,1.010080,1.478182,1.713876,...,0.037307,0.010074,0.272897,0.247538,0.286948,0.143829,0.189416,0.124293,0.154624,0.077530
100002,47.789921,0.696393,1.165387,1.004378,0.951231,1.542114,0.946219,1.673430,1.445220,1.118439,...,0.738423,-0.159505,0.418298,0.566628,0.849684,0.950851,0.779324,0.439255,0.839315,0.454957
100003,47.069011,3.137668,0.044897,3.392946,3.054217,0.726293,3.582653,2.414946,1.257669,3.188068,...,0.273142,0.366949,0.891690,0.214585,0.927562,0.648872,0.730178,0.606528,0.830105,0.662320
100004,24.899397,0.496010,1.401020,0.536501,1.712592,1.044629,1.533405,1.330258,1.251771,1.441028,...,0.644046,-0.129700,0.578560,0.783258,0.480598,0.485003,0.667111,0.594234,0.447980,0.511133
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119995,43.175130,1.776937,0.211527,1.986940,0.393550,1.693620,1.139395,1.459990,1.734535,1.025180,...,0.546742,-0.060254,0.507950,0.560192,0.541534,0.249750,0.608796,0.455444,0.535306,0.268471
119996,31.030782,1.451045,2.483726,1.105440,1.979721,2.821799,0.475276,2.782573,2.827882,0.520034,...,0.491662,0.016413,0.480380,0.459172,0.363756,0.427028,0.544692,0.754834,0.361866,0.536087
119997,31.648623,2.141301,0.546706,2.340499,1.362651,1.942634,2.043679,0.994065,2.248144,1.007128,...,0.529880,0.001012,0.768960,0.834159,0.672114,0.520215,0.341519,0.713419,0.664354,0.370047
119998,19.305442,0.221708,2.355288,1.051282,1.742370,2.164058,0.435583,2.649994,1.190594,2.328580,...,0.527500,-0.103574,0.521222,0.426435,0.636887,0.446365,0.551442,0.503703,0.635246,0.258394


In [40]:
# Map every selected tsfresh feature name to a short positional alias
# (`s_0`, `s_1`, ...) that is used to rename the train and test columns.
feature_map = {name: f's_{pos}' for pos, name in enumerate(features)}

In [41]:
train_features_filtered = train_features_filtered.rename(columns=feature_map)

In [42]:
tsfresh_test = tsfresh_test.rename(columns=feature_map)

In [45]:
lgb_test = lgb_model(train_features_filtered, data_train_label, tsfresh_test)

************************************ 1 ************************************
Training until validation scores don't improve for 200 rounds
[100]	valid_0's multi_logloss: 0.0402619
[200]	valid_0's multi_logloss: 0.0423585
[300]	valid_0's multi_logloss: 0.0488172
Early stopping, best iteration is:
[123]	valid_0's multi_logloss: 0.03965
预测的概率矩阵为：
[[9.99757722e-01 2.24728793e-04 1.49845800e-05 2.56475162e-06]
 [1.60538375e-05 2.64160513e-05 9.99956634e-01 8.95616959e-07]
 [1.19391846e-06 3.84002243e-06 5.20043261e-06 9.99989766e-01]
 ...
 [9.29767379e-02 6.79148548e-05 9.06936430e-01 1.89172236e-05]
 [9.99854993e-01 1.36810730e-04 5.89649219e-06 2.29939056e-06]
 [9.94060665e-01 8.91385549e-04 1.05378128e-03 3.99416840e-03]]
[623.1365248002943]
************************************ 2 ************************************
Training until validation scores don't improve for 200 rounds
[100]	valid_0's multi_logloss: 0.0407695
[200]	valid_0's multi_logloss: 0.0439811
Early stopping, best iteration 

In [46]:
temp=pd.DataFrame(lgb_test)
temp

Unnamed: 0,0,1,2,3
0,0.999441,0.000528,0.000024,0.000007
1,0.000015,0.000083,0.999899,0.000002
2,0.000002,0.000006,0.000009,0.999984
3,0.999855,0.000112,0.000030,0.000004
4,0.999920,0.000022,0.000044,0.000014
...,...,...,...,...
19995,0.992724,0.005848,0.000278,0.001150
19996,0.999220,0.000590,0.000168,0.000021
19997,0.087302,0.000138,0.912533,0.000027
19998,0.999854,0.000124,0.000019,0.000004


In [47]:
# Fill the submission template with the fold-averaged class probabilities
# (column k of `temp` goes to `label_k`) and write it to disk.
result = pd.read_csv('sample_submit.csv')
result = result.assign(**{f'label_{k}': temp[k] for k in range(4)})
result.to_csv('submit.csv', index=False)

In [49]:
result

Unnamed: 0,id,label_0,label_1,label_2,label_3
0,100000,0.999441,0.000528,0.000024,0.000007
1,100001,0.000015,0.000083,0.999899,0.000002
2,100002,0.000002,0.000006,0.000009,0.999984
3,100003,0.999855,0.000112,0.000030,0.000004
4,100004,0.999920,0.000022,0.000044,0.000014
...,...,...,...,...,...
19995,119995,0.992724,0.005848,0.000278,0.001150
19996,119996,0.999220,0.000590,0.000168,0.000021
19997,119997,0.087302,0.000138,0.912533,0.000027
19998,119998,0.999854,0.000124,0.000019,0.000004


In [73]:
# Turn each row of class probabilities into a 0/1 indicator of its maximum
# (on a tie, every column that attains the maximum is marked). The comparison
# against the row-wise maximum is done in one vectorized call instead of a
# Python-level `apply` per row.
result2 = (
    result.loc[:, ['label_0', 'label_1', 'label_2', 'label_3']]
    .pipe(lambda probs: probs.eq(probs.max(axis=1), axis=0))
    .astype(int)
)

In [74]:
result2

Unnamed: 0,label_0,label_1,label_2,label_3
0,1,0,0,0
1,0,0,1,0
2,0,0,0,1
3,1,0,0,0
4,1,0,0,0
...,...,...,...,...
19995,1,0,0,0
19996,1,0,0,0
19997,0,0,1,0
19998,1,0,0,0


In [75]:
result=pd.read_csv('sample_submit.csv')

In [77]:
result[['label_0', 'label_1', 'label_2', 'label_3']] = result2

In [79]:
result.to_csv('submission.csv', index=False)