# 赛题简介

多分类预测算法

# 导入数据

In [85]:
import os
import gc
import math

import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb

In [86]:
from tqdm import tqdm
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings('ignore')

In [87]:
train =pd.read_csv(r'C:\Users\kk.luo\Python Code\Heartbeat-signal-classfication-prediction\train.csv')

In [88]:
train.head()

Unnamed: 0,id,heartbeat_signals,label
0,0,"0.9912297987616655,0.9435330436439665,0.764677...",0.0
1,1,"0.9714822034884503,0.9289687459588268,0.572932...",0.0
2,2,"1.0,0.9591487564065292,0.7013782792997189,0.23...",2.0
3,3,"0.9757952826275774,0.9340884687738161,0.659636...",0.0
4,4,"0.0,0.055816398940721094,0.26129357194994196,0...",2.0


In [89]:
test =pd.read_csv(r'C:\Users\kk.luo\Python Code\Heartbeat-signal-classfication-prediction\testA.csv')

In [90]:
test.head()

Unnamed: 0,id,heartbeat_signals
0,100000,"0.9915713654170097,1.0,0.6318163407681274,0.13..."
1,100001,"0.6075533139615096,0.5417083883163654,0.340694..."
2,100002,"0.9752726292239277,0.6710965234906665,0.686758..."
3,100003,"0.9956348033996116,0.9170249621481004,0.521096..."
4,100004,"1.0,0.8879490481178918,0.745564725322326,0.531..."


## 数据预处理 

In [91]:
def reduce_mem_usage(df):
    start_memory=df.memory_usage().sum() / 1024
    print('数据最开始的大小：{:.2f}MB'.format(start_memory))
    for col in df.columns:
        col_type=df[col].dtype
        
        if col_type !=object:
            c_min=df[col].min()
            c_max=df[col].max()
            
            if str(col_type)[:3]=='int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col]=df[col].astype('category')
            
        end_memory=df.memory_usage().sum()/10024
        print('数据最开始的大小：{:.2f}MB'.format(end_memory))
        
        print('内存减少：{:.1f}%'.format(100*(end_memory- start_memory)/start_memory))
        
        return df
                

In [92]:
train_list=[]
for items in train.values:
    train_list.append([items[0]] + [float(i) for i in items[1].split(',')] + [items[2]])
    
    
train=pd.DataFrame(train_list)
train.columns=['id']+['s_'+str(i) for i in range(len(train_list[0])-2)]+['label']
train=reduce_mem_usage(train)

数据最开始的大小：161718.88MB
数据最开始的大小：16480.46MB
内存减少：-89.8%


In [93]:
test_list=[]
for items in test.values:
    test_list.append([items[0]] + [float(i) for i in items[1].split(',')])
    
    
test=pd.DataFrame(test_list)
test.columns=['id']+['s_'+str(i) for i in range(len(test_list[0])-1)]
test=reduce_mem_usage(test)

数据最开始的大小：32187.62MB
数据最开始的大小：3280.14MB
内存减少：-89.8%


In [94]:
train.head()

Unnamed: 0,id,s_0,s_1,s_2,s_3,s_4,s_5,s_6,s_7,s_8,...,s_196,s_197,s_198,s_199,s_200,s_201,s_202,s_203,s_204,label
0,0,0.99123,0.943533,0.764677,0.618571,0.379632,0.190822,0.040237,0.025995,0.031709,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.971482,0.928969,0.572933,0.178457,0.122962,0.13236,0.094392,0.089575,0.030481,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,1.0,0.959149,0.701378,0.231778,0.0,0.080698,0.128376,0.187448,0.280826,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,3,0.975795,0.934088,0.659637,0.249921,0.237116,0.281445,0.249921,0.249921,0.241397,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.0,0.055816,0.261294,0.359847,0.433143,0.453698,0.499004,0.542796,0.616904,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


### 数据集合准备

In [95]:
x_train = train.drop(['id','label'], axis=1)
y_train = train['label']
x_test=test.drop(['id'], axis=1)

### 模型评价指标 

In [96]:
def abs_sum(y_pre,y_tru):
    y_pre=np.array(y_pre)
    y_tru=np.array(y_tru)
    loss=sum(sum(abs(y_pre-y_tru)))
    return loss

### 模型导入 

In [98]:
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [99]:
def cv_model(clf, train_x, train_y, test_x, clf_name):
    folds = 5
    seed = 2021
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    test = np.zeros((test_x.shape[0],4))

    cv_scores = []
    onehot_encoder = OneHotEncoder(sparse=False)
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]
        
        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            params = {
                'boosting_type': 'gbdt',
                'objective': 'multiclass',
                'num_class': 4,
                'num_leaves': 2 ** 5,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': seed,
                'nthread': 28,
                'n_jobs':24,
                'verbose': -1,
            }

            model = clf.train(params, 
                      train_set=train_matrix, 
                      valid_sets=valid_matrix, 
                      num_boost_round=2000, 
                      verbose_eval=100, 
                      early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration) 
            
        val_y=np.array(val_y).reshape(-1, 1)
        val_y = onehot_encoder.fit_transform(val_y)
        print('预测的概率矩阵为：')
        print(test_pred)
        test += test_pred
        score=abs_sum(val_y, val_pred)
        cv_scores.append(score)
        print(cv_scores)
    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    test=test/kf.n_splits

    return test

In [100]:
def lgb_model(x_train, y_train, x_test):
    lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb")
    return lgb_test
lgb_test = lgb_model(x_train, y_train, x_test)

************************************ 1 ************************************
Training until validation scores don't improve for 200 rounds
[100]	valid_0's multi_logloss: 0.0632077
[200]	valid_0's multi_logloss: 0.0449957
[300]	valid_0's multi_logloss: 0.0396666
[400]	valid_0's multi_logloss: 0.0380727
[500]	valid_0's multi_logloss: 0.0379505
[600]	valid_0's multi_logloss: 0.0393102
Early stopping, best iteration is:
[464]	valid_0's multi_logloss: 0.0376704
预测的概率矩阵为：
[[9.99971486e-01 2.67832448e-05 1.06624957e-06 6.64270531e-07]
 [4.79367053e-05 8.32752641e-04 9.99119299e-01 1.11858956e-08]
 [4.63707947e-07 2.91127974e-08 4.79706185e-07 9.99999027e-01]
 ...
 [9.47633967e-02 3.27018357e-04 9.04905626e-01 3.95920347e-06]
 [9.99949978e-01 4.99668495e-05 3.31761133e-08 2.15781483e-08]
 [9.88260978e-01 9.96092107e-04 4.72206174e-03 6.02086772e-03]]
[573.6034641201848]
************************************ 2 ************************************
Training until validation scores don't improve for

## 预测结果

In [101]:
temp=pd.DataFrame(lgb_test)
result=pd.read_csv('sample_submit.csv')
result['label_0']=temp[0]
result['label_1']=temp[1]
result['label_2']=temp[2]
result['label_3']=temp[3]
result.to_csv('submit.csv',index=False)