In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
from dateutil.parser import parse
from sklearn.metrics import classification_report
from sklearn.utils import shuffle 

from imblearn.combine import SMOTETomek, SMOTEENN

from feature_engineering.sum_value import sum_all_columns
from feature_engineering.nan_stastics import nan_statics
from feature_engineering.rank_feature_majority import rank_feature_majority_all, rank_feature_majority_train_valid_test
from feature_engineering.segment_raw_data import segment_raw_data
from feature_engineering.rank_feature import rank_feature, rank_feature_by_max, rank_feature_count
from model_selection.classifier_model_factory import ClassifierModelFactory
from model_selection.regressor_model_factory import RegressorModelFactory
from model_selection.multi_classifier_model_factory import MultiClassifierModelFactory
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from model_selection.cv import k_fold_regressor, k_fold_classifier, create_sample_k_fold_regressor, k_fold_multi_classifier
from sampling.sample import sample_by_test_scale, separate_high_median_normal, separate_high_normal
from utils import create_scale_feature, normalize_data_frame, delete_error_data, filtration, create_sample, logloss_to_class, softmax_to_class

  from ._conv import register_converters as _register_converters


In [2]:
# Read the raw training table and the "A" test table; both files are decoded as gb2312.
train, test = (
    pd.read_csv(csv_path, encoding='gb2312')
    for csv_path in ('input/d_train_20180102.csv', 'input/d_test_A_20180102.csv')
)

In [3]:
# Column 0 is excluded from both tables (presumably a row id — confirm), and the
# last training column is taken as the regression target.
# .copy() makes each piece independent of the raw tables, so the column
# assignments in the following cells never write into a slice/view of `train`/`test`.
train_data = train.iloc[:, 1:-1].copy()
train_target = train.iloc[:, -1].copy()
test_data = test.iloc[:, 1:].copy()

In [4]:
# Binary-encode the sex column: '男' (male) becomes 1, every other value becomes 0.
train_data['性别'] = train_data['性别'].eq('男').astype('int64')
test_data['性别'] = test_data['性别'].eq('男').astype('int64')

In [5]:
# Replace the exam-date column with the whole number of days elapsed since the
# fixed reference date 2016-10-09 (same reference for train and test).
reference_day = parse('2016-10-09')
train_data['体检日期'] = (pd.to_datetime(train_data['体检日期']) - reference_day).dt.days
test_data['体检日期'] = (pd.to_datetime(test_data['体检日期']) - reference_day).dt.days

In [6]:
# Build ASCII replacements for the original headers: the first three columns get
# fixed names, every remaining column becomes f0, f1, ... in positional order.
columns = train_data.columns
str_columns = ['sex', 'age', 'date'] + ['f{}'.format(i) for i in range(len(columns) - 3)]

In [7]:
def value_to_multi_class(x):
    """Bucket a scalar target value into one of three ordinal classes.

    Args:
        x: A scalar number that supports ``<`` comparison with floats.

    Returns:
        int: ``0`` when ``x < 6.1``, ``1`` when ``6.1 <= x < 7`` and ``2``
        otherwise. A NaN fails both comparisons and therefore also maps to ``2``.
    """
    if x < 6.1:
        return 0
    # Getting here already rules out x < 6.1, so only the upper bound needs
    # testing (plain comparison instead of bitwise `&` on two booleans).
    if x < 7:
        return 1
    return 2

In [8]:
# Give both feature frames the same ASCII column names.
train_data.columns = str_columns
test_data.columns = str_columns

# Name the continuous target 'Y' and derive the 3-way label from it.
train_target.name = 'Y'
train_target_class = train_target.apply(value_to_multi_class).rename('class')

In [9]:
# Stack train on top of test so both go through normalize_data_frame together,
# then cut the result back apart at the original train row count.
# NOTE(review): start_index=2 presumably leaves the first two columns (sex, age)
# unscaled — confirm against normalize_data_frame.
n_train = train_data.shape[0]
train_test = pd.concat([train_data, test_data], axis=0)
train_test, factors = normalize_data_frame(train_test, start_index=2)
train_data, test_data = train_test.iloc[:n_train], train_test.iloc[n_train:]

In [10]:
# Run the project's sum_all_columns helper over train first, then test.
# NOTE(review): presumably the source of the `sum_all` column seen in the preview — confirm.
train_data, test_data = map(sum_all_columns, (train_data, test_data))

In [12]:
# Project helper applied to both frames at once; it returns the replacement pair.
# NOTE(review): presumably adds the `nan_count_all` column shown in the preview
# below — confirm against feature_engineering.nan_stastics.
train_data, test_data = nan_statics(train_data, test_data)

执行 Nan Statics


In [13]:
# Preview the first rows to sanity-check the engineered feature frame.
train_data.head()

Unnamed: 0,sex,age,date,f0,f1,f2,f3,f4,f5,f6,...,f29,f30,f31,f32,f33,f34,f35,f36,sum_all,nan_count_all
0,1,41,100.0,3.511332,4.607334,21.805089,1.898362,45.393363,81.744091,34.201624,...,37.647059,54.33526,8.254398,53.576248,38.80814,23.744292,20.888889,17.142857,71.383191,5
1,1,41,42.857143,3.419548,7.24382,12.588945,9.942105,51.311209,74.246129,41.627199,...,29.411765,13.294798,14.749662,50.74224,42.44186,20.547945,20.888889,22.857143,63.977609,5
2,1,46,50.549451,2.537008,3.029452,11.587067,4.353777,67.092133,75.224124,52.723275,...,18.823529,49.710983,10.622463,45.479082,47.674419,29.223744,14.222222,22.857143,64.34208,5
3,0,22,49.450549,1.164953,2.099164,14.544316,1.896993,31.701091,59.005705,33.660352,...,42.352941,16.184971,14.749662,36.842105,56.686047,24.657534,20.444444,14.285714,56.364173,5
4,0,48,50.549451,2.3605,2.939231,15.031024,2.239163,48.10861,50.0815,49.32341,...,51.764706,34.682081,20.838972,56.950067,37.209302,35.616438,2.666667,17.142857,44.573542,8


In [14]:
# Replace every remaining NaN with the sentinel -99.
# Rebinding (instead of inplace=True) avoids mutating a frame that may still be
# a slice of another object and keeps the cell safe to re-run.
# NOTE(review): these sentinel values are later fed to SMOTETomek together with
# the real measurements — confirm that is intended.
train_data = train_data.fillna(-99)
test_data = test_data.fillna(-99)

In [15]:
# Append the continuous target (Series named 'Y') as the last column so that it
# travels with the features through the split and resampling cells below; it is
# sliced off again before the classifier is fitted.
train_data_target = pd.concat([train_data, train_target], axis=1)

In [16]:
# Hold out 20% of the rows for validation with a fixed seed. The X side still
# contains the 'Y' column; the split labels are the 3-way class.
# NOTE(review): no `stratify=` is passed, so class proportions in the two parts
# are left to chance — confirm this is acceptable for the minority classes.
X_train, X_valid, y_train, y_valid = train_test_split(train_data_target, train_target_class, test_size=0.2, random_state=20)

In [17]:
# Rebalance the training split with SMOTE over-sampling + Tomek-link cleaning.
# X_train still carries the continuous target 'Y' as its last column, so that
# column is resampled together with the features.
# Seeded (same value as the train/valid split) so the resampled set is reproducible.
sm = SMOTETomek(random_state=20)
# `fit_sample` was renamed `fit_resample` in later imbalanced-learn releases;
# use whichever one the installed version provides.
_resample = getattr(sm, 'fit_resample', None) or sm.fit_sample
# Pass the labels as a flat 1-D array: the previous `y_train.reshape(-1, 1)`
# relied on Series.reshape (no longer available in current pandas) and handed a
# column vector to the sampler, which triggered the column_or_1d warning.
X_resampled, y_resampled = _resample(X_train, y_train.values)

  
  y = column_or_1d(y, warn=True)


In [18]:
# Wrap the resampler output back into pandas objects, reusing the pre-resampling
# column names for the features and the name 'class' for the labels.
feature_names = X_train.columns
X_train = pd.DataFrame(data=X_resampled, columns=feature_names)
y_train = pd.Series(data=y_resampled, name='class')

In [19]:
# Glue features and labels together so one shuffle keeps the rows aligned, then
# split them apart again (labels are the last column).
X_y_train = pd.concat([X_train, y_train], axis=1)
# Seeded (same value as the train/valid split) so the row order — and with it
# any order-dependent fold assignment downstream — is reproducible.
X_y_train = shuffle(X_y_train, random_state=20)
X_train = X_y_train.iloc[:, :-1]
y_train = X_y_train.iloc[:, -1]

In [20]:
# Drop the trailing column from both splits so only model inputs remain
# (the last column is the continuous target 'Y' appended before the split).
X_train_data, X_valid_data = (part.iloc[:, :-1] for part in (X_train, X_valid))

In [21]:
# 5-fold training of the LightGBM multi-class model via the project helper; the
# result is kept as the model output for the validation rows.
# NOTE(review): the folds are presumably drawn from the resampled training set,
# so the per-fold log-loss printed below is not comparable to hold-out
# performance — confirm inside k_fold_multi_classifier.
lgb_y_valid = k_fold_multi_classifier(
    X_train_data,
    y_train,
    X_valid_data,
    MultiClassifierModelFactory.MODEL_LIGHET_GBM,
    cv=5,
)

开始CV5折训练...
第0次训练...
Training until validation scores don't improve for 300 rounds.
[300]	valid_0's multi_logloss: 0.510898
[600]	valid_0's multi_logloss: 0.305843
[900]	valid_0's multi_logloss: 0.215262
[1200]	valid_0's multi_logloss: 0.167321
[1500]	valid_0's multi_logloss: 0.13842
[1800]	valid_0's multi_logloss: 0.119813
[2100]	valid_0's multi_logloss: 0.107349
[2400]	valid_0's multi_logloss: 0.0990827
[2700]	valid_0's multi_logloss: 0.0946949
[3000]	valid_0's multi_logloss: 0.0928216
[3300]	valid_0's multi_logloss: 0.092307
[3600]	valid_0's multi_logloss: 0.0918898
Early stopping, best iteration is:
[3577]	valid_0's multi_logloss: 0.0918745
[[9.56992243e-01 4.29726940e-02 3.50632960e-05]
 [8.50684193e-01 3.17700565e-02 1.17545751e-01]
 [2.28096818e-03 1.79967877e-02 9.79722244e-01]
 ...
 [1.83322673e-03 1.31691682e-03 9.96849856e-01]
 [5.84429264e-04 9.93543338e-01 5.87223316e-03]
 [8.24689772e-04 3.13618047e-04 9.98861692e-01]]
[0, 0, 2, 2, 0, 0, 2, 0, 1, 0, 2, 2, 0, 1, 0, 0, 1, 2

[1200]	valid_0's multi_logloss: 0.174275
[1500]	valid_0's multi_logloss: 0.144969
[1800]	valid_0's multi_logloss: 0.126081
[2100]	valid_0's multi_logloss: 0.113388
[2400]	valid_0's multi_logloss: 0.105622
[2700]	valid_0's multi_logloss: 0.101573
[3000]	valid_0's multi_logloss: 0.0997767
[3300]	valid_0's multi_logloss: 0.0986472
[3600]	valid_0's multi_logloss: 0.0981356
[3900]	valid_0's multi_logloss: 0.0979752
[4200]	valid_0's multi_logloss: 0.0978069
[4500]	valid_0's multi_logloss: 0.0978935
Early stopping, best iteration is:
[4205]	valid_0's multi_logloss: 0.0978002
[[9.99917916e-01 8.15841111e-05 4.99805488e-07]
 [1.05758197e-03 3.30831198e-04 9.98611587e-01]
 [9.99983803e-01 1.31235661e-05 3.07356605e-06]
 ...
 [2.54492285e-01 6.81880420e-01 6.36272955e-02]
 [2.58847833e-04 9.99615002e-01 1.26150045e-04]
 [3.20227442e-05 9.72532989e-01 2.74349886e-02]]
[0, 2, 0, 0, 0, 0, 1, 0, 2, 0, 2, 1, 2, 1, 0, 1, 0, 0, 1, 2, 2, 1, 1, 1, 0, 2, 2, 1, 1, 1, 2, 1, 0, 2, 0, 2, 2, 2, 2, 1, 1, 2, 1, 1

[1800]	valid_0's multi_logloss: 0.122932
[2100]	valid_0's multi_logloss: 0.109594
[2400]	valid_0's multi_logloss: 0.101431
[2700]	valid_0's multi_logloss: 0.0968123
[3000]	valid_0's multi_logloss: 0.0944539
[3300]	valid_0's multi_logloss: 0.0930715
[3600]	valid_0's multi_logloss: 0.0924356
[3900]	valid_0's multi_logloss: 0.0923088
[4200]	valid_0's multi_logloss: 0.0921127
[4500]	valid_0's multi_logloss: 0.0920984
Early stopping, best iteration is:
[4372]	valid_0's multi_logloss: 0.0920363
[[1.10814045e-01 8.63193102e-05 8.89099635e-01]
 [2.80403965e-04 9.98729786e-01 9.89809637e-04]
 [1.65894218e-03 2.36344588e-03 9.95977612e-01]
 ...
 [9.99408825e-01 1.72515988e-05 5.73923393e-04]
 [4.24608754e-01 5.74832548e-01 5.58697769e-04]
 [2.13225950e-04 1.91738781e-04 9.99595035e-01]]
[2, 1, 2, 0, 1, 1, 0, 2, 2, 2, 2, 0, 1, 0, 1, 2, 2, 1, 1, 0, 1, 2, 2, 1, 1, 1, 0, 2, 2, 1, 2, 0, 0, 2, 2, 0, 2, 1, 1, 0, 1, 0, 1, 2, 1, 0, 0, 1, 2, 2, 1, 2, 2, 0, 0, 1, 1, 2, 1, 0, 2, 1, 2, 0, 0, 0, 1, 0, 2, 2, 0

[2400]	valid_0's multi_logloss: 0.0935888
[2700]	valid_0's multi_logloss: 0.0896595
[3000]	valid_0's multi_logloss: 0.0876039
[3300]	valid_0's multi_logloss: 0.086611
[3600]	valid_0's multi_logloss: 0.0861408
[3900]	valid_0's multi_logloss: 0.0857075
[4200]	valid_0's multi_logloss: 0.0857129
Early stopping, best iteration is:
[3988]	valid_0's multi_logloss: 0.0855702
[[2.48848583e-04 4.07739051e-03 9.95673761e-01]
 [1.97341392e-02 6.40653794e-04 9.79625207e-01]
 [1.46997524e-05 5.02129668e-03 9.94964004e-01]
 ...
 [9.96697804e-01 3.27911541e-03 2.30810413e-05]
 [2.74931654e-04 9.93797982e-01 5.92708617e-03]
 [1.81231034e-03 9.96121385e-01 2.06630441e-03]]
[2, 2, 2, 1, 0, 2, 1, 1, 2, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 2, 1, 2, 0, 0, 2, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 1, 2, 0, 2, 1, 0, 2, 2, 0, 2, 2, 0, 1, 0, 0, 0, 2, 2, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 2, 2, 1, 2, 0, 2, 0, 0, 0, 2, 0, 2, 2, 0, 1, 2, 2, 2, 2, 1, 1, 2, 0, 0, 1, 2, 1,

[3300]	valid_0's multi_logloss: 0.0767562
[3600]	valid_0's multi_logloss: 0.0757725
[3900]	valid_0's multi_logloss: 0.075091
[4200]	valid_0's multi_logloss: 0.0747299
[4500]	valid_0's multi_logloss: 0.0745588
[4800]	valid_0's multi_logloss: 0.0743822
[5100]	valid_0's multi_logloss: 0.0742575
Early stopping, best iteration is:
[5017]	valid_0's multi_logloss: 0.0742371
[[7.55868719e-02 9.20081711e-01 4.33141681e-03]
 [4.76893561e-03 2.08182940e-04 9.95022881e-01]
 [5.36623112e-06 4.87757311e-04 9.99506876e-01]
 ...
 [9.89801188e-01 1.02158743e-04 1.00966532e-02]
 [1.26519441e-03 9.94288046e-01 4.44675909e-03]
 [9.05079275e-01 1.20389029e-03 9.37168345e-02]]
[1, 2, 2, 0, 1, 1, 1, 2, 1, 0, 0, 0, 0, 2, 1, 0, 2, 1, 2, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 2, 1, 0, 0, 2, 1, 1, 0, 1, 1, 2, 1, 1, 0, 1, 1, 2, 2, 0, 1, 1, 0, 1, 0, 2, 1, 2, 2, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 0, 1, 0, 2, 2, 2, 1, 1, 0, 2, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 2, 0, 1, 1, 0, 1, 2, 1, 1, 0, 0, 0, 0, 2, 1, 0, 0,

In [27]:
# Turn the model output into hard labels and score them against the untouched
# validation labels.
# NOTE(review): level=0.95 is presumably a confidence threshold inside
# softmax_to_class — confirm in utils.
valid_pred_labels = softmax_to_class(lgb_y_valid, level=0.95)
valid_report = classification_report(y_valid, valid_pred_labels)
print(valid_report)

             precision    recall  f1-score   support

          0       0.84      1.00      0.91       938
          1       0.00      0.00      0.00        99
          2       0.67      0.04      0.08        92

avg / total       0.75      0.83      0.76      1129



In [40]:
# NOTE(review): no remaining cell defines `y_pred`, so this cell failed on a
# fresh kernel. It is rebuilt here with the same call the classification report
# uses — confirm this matches what the deleted cell computed.
y_pred = softmax_to_class(lgb_y_valid, level=0.95)

# Positional (0..n-1) index on both series so they line up row by row.
valid = pd.Series(y_valid, name='valid').reset_index(drop=True)
pred = pd.Series(y_pred, name='pred').reset_index(drop=True)

In [41]:
# Start a comparison table from the true validation labels (single column 'valid').
df = pd.DataFrame(valid)

In [42]:
# Add the predicted labels next to the true ones; assignment aligns on the index.
df['pred'] = pred

In [47]:
# Row positions where the true class is 0 but the model predicted class 1.
true0_pred1_mask = df['valid'].eq(0) & df['pred'].eq(1)
indexs = df.loc[true0_pred1_mask.values].index