In [32]:
from scipy.io import loadmat 
import pandas as pd 
import numpy as np 
np.random.seed(2020)
import matplotlib.pyplot as plt

In [33]:
train = loadmat('train/AECG_Train.mat')
train.keys()

dict_keys(['__header__', '__version__', '__globals__', 'DataTrain'])

## 1. 提取训练数据

In [11]:
train_x = train['DataTrain'][0, 0][0]
train_y = train['DataTrain'][0, 0][1]

data = []
label = []
for i in range(len(train_x)):
    for j in range(len(train_x[0])):
        data.append(list(train_x[i][j].reshape(-1)))
        label.append(list(train_y[i][j].reshape(-1))[0])

In [12]:
train = pd.DataFrame()
train['data'] = data
train['label'] = label

In [13]:
train

Unnamed: 0,data,label
0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 3.22, 4.83...",1
1,"[28.27, 27.76, 22.09, 21.17, 24.05, 27.64, 31....",2
2,"[0.0, 0.0, 0.0, 0.0, 1.2, 0.0, 0.0, 0.51, 2.1,...",3
3,"[99.98, 99.98, 99.68, 96.09, 86.47, 65.58, 48....",4
4,"[22.88, 22.97, 23.14, 24.66, 23.85, 22.02, 21....",1
...,...,...
71,"[25.59, 27.93, 32.67, 39.99, 41.82, 42.19, 41....",4
72,"[27.71, 23.54, 21.44, 21.0, 23.8, 23.73, 22.92...",1
73,"[48.44, 44.6, 39.62, 35.16, 31.2, 30.08, 27.27...",2
74,"[25.78, 25.63, 27.93, 27.56, 26.46, 26.37, 27....",3


## 2. 构建数据集
> 在序列中随机截取长度为 320 的段序列， 每个序列抽取1000次

In [14]:
data = []
label = []
extract_num = 1000
def random_extract(src, length=320):
    start = np.random.randint(0, len(src) - length)
    end = start + length
    return src[start: end]

for v,l in train.values:
    for i in range(extract_num):
        data.append(random_extract(v))
        label.append(l)

In [15]:
data = np.concatenate([np.array(data), np.array(label).reshape(-1,1)], axis=1)
data.shape

(76000, 321)

In [16]:
train_df = pd.DataFrame(data)
train_df.columns = [f'x{i}' for i in range(320)] + ['label']
train_df.to_pickle('train_df.pkl')

## 3. 提取测试数据

In [17]:
data = []
test = loadmat('test/Data.mat')
# test['Data'][0][0][0][1000][0].shape

for v in test['Data'][0][0][0]:
    data.append(list(v[0].reshape(-1)))
data = np.array(data)

In [18]:
test_df = pd.DataFrame(data)
test_df.columns = [f'x{i}' for i in range(320)]
test_df.to_pickle('test_df.pkl')

## 4. 建模

In [34]:
from sklearn.model_selection import train_test_split
from lightgbm.sklearn import LGBMClassifier
feas = train_df.columns[:-1]
lbl = train_df.columns[-1]
X_train, X_test, y_train, y_test = train_test_split(train_df[feas], train_df[lbl], test_size=0.2, random_state=42)

In [35]:
lgb_clf = LGBMClassifier(n_estimators=100)
lgb_clf.fit(X_train, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [36]:
pred = lgb_clf.predict(X_test)

In [37]:
from sklearn.metrics import f1_score
f1_score(y_test, pred, average='micro')

0.39881578947368423

In [38]:
sub = pd.read_csv('../heart.csv')
res = lgb_clf.predict(test_df)
sub.label = res

In [39]:
sub.to_csv('sub_1.csv', index=None)
sub.label.value_counts()

1.0    656
2.0    570
3.0    501
4.0    435
Name: label, dtype: int64