In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from dateutil.parser import parse

from model_selection.classifier_model_factory import ClassifierModelFactory
from model_selection.multi_classifier_model_factory import MultiClassifierModelFactory
from model_selection.cv import k_fold_classifier
from model_selection.cv import logloss_2_class
from sampling.sample import separate_high_normal

from imblearn.combine import SMOTETomek

from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from utils import create_scale_feature, normalize_data_frame

  from ._conv import register_converters as _register_converters


In [2]:
train = pd.read_csv('input/d_train_20180102.csv', encoding='gb2312')
test = pd.read_csv('input/d_test_A_20180102.csv', encoding='gb2312')

In [3]:
train = train.iloc[:, 1:]
test = test.iloc[:, 1:]

In [4]:
train['性别'] = train['性别'].apply(lambda x: 1 if (x == '男') else 0)
test['性别'] = test['性别'].apply(lambda x: 1 if (x == '男') else 0)

In [5]:
train['体检日期'] = (pd.to_datetime(train['体检日期']) - parse('2016-10-09')).dt.days
test['体检日期'] = (pd.to_datetime(test['体检日期']) - parse('2016-10-09')).dt.days

In [6]:
columns = train.columns
str_columns = ['sex', 'age', 'date'] + ['f' + str(p) for p in range(len(columns)-4)]

In [7]:
train.columns = str_columns + ['Y']
test.columns = str_columns

In [9]:
train_test = pd.concat([train.iloc[:, :-1], test], axis=0)

train_test, factors = normalize_data_frame(train_test, start_index=2)
train_data = train_test.iloc[:train.shape[0]]
test_data = train_test.iloc[train.shape[0]:]

In [14]:
def class_y(x):
    if x < 6.1:
        return 0
    elif (x >= 6.1) & (x < 7):
        return 1
    else:
        return 2

In [15]:
train_target = train['Y']
train_target_class = train['Y'].apply(lambda x : class_y(x))

In [9]:
# train_data.fillna(-99, inplace=True)
# test_data.fillna(-99, inplace=True)

In [16]:
X_train, X_valid, y_train, y_valid = train_test_split(train_data, train_target, test_size=0.2, random_state=20)

In [11]:
sm = SMOTETomek()
X_resampled, y_resampled = sm.fit_sample(X_train, y_train.reshape(-1, 1))

  
  y = column_or_1d(y, warn=True)


In [12]:
X_train = pd.DataFrame(X_resampled, columns=X_train.columns)

In [13]:
y_train = pd.Series(y_resampled, name='Y')

In [11]:
lgb_y_valid, kf_lgb_as = \
    k_fold_classifier(X_train, y_train, X_valid, ClassifierModelFactory.MODEL_LIGHET_GBM, cv=5)

开始CV 5折训练...
第0次训练...
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.444768
[200]	valid_0's binary_logloss: 0.388747
[300]	valid_0's binary_logloss: 0.372598
[400]	valid_0's binary_logloss: 0.369138
[500]	valid_0's binary_logloss: 0.367736
[600]	valid_0's binary_logloss: 0.368973
Early stopping, best iteration is:
[536]	valid_0's binary_logloss: 0.367092
第1次训练...
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.439819
[200]	valid_0's binary_logloss: 0.380813
[300]	valid_0's binary_logloss: 0.36449
[400]	valid_0's binary_logloss: 0.359719
[500]	valid_0's binary_logloss: 0.357582
[600]	valid_0's binary_logloss: 0.357897
Early stopping, best iteration is:
[523]	valid_0's binary_logloss: 0.357385
第2次训练...
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.433535
[200]	valid_0's binary_logloss: 0.371166
[300]	valid_0's binary_logloss: 0.352595
[400]

In [12]:
y_pred = logloss_2_class(lgb_y_valid)

In [13]:
y_valid

3796    1
3105    0
674     0
2237    0
3629    1
3800    0
5326    0
66      0
1678    1
5152    0
3455    0
5139    0
1708    0
2555    0
5109    0
5618    0
5229    0
5000    0
3727    1
1198    0
106     0
731     0
1849    1
4675    1
3659    0
5574    0
4723    1
4298    0
4005    0
797     0
       ..
5341    0
2397    1
5544    0
3643    0
398     0
3356    0
963     1
5095    0
2064    0
3474    0
3883    0
977     0
421     0
987     0
1080    0
2828    0
105     0
3478    1
36      0
5196    0
1704    1
3674    0
3907    0
598     0
440     0
1885    0
463     0
4715    0
443     1
1328    0
Name: Y, Length: 1129, dtype: int64

In [14]:
print(classification_report(y_valid, y_pred , target_names=['0','1']))

             precision    recall  f1-score   support

          0       0.85      0.99      0.91       941
          1       0.68      0.14      0.23       188

avg / total       0.82      0.85      0.80      1129



In [15]:
ss = pd.DataFrame(pd.Series(y_pred, name='pred'))

In [16]:
ss['valid'] = y_valid.reset_index(drop=True)

In [22]:
len(ss[(ss['pred'] == 1).values & (ss['valid'] == 0).values])

12

In [33]:
np.where(a1 == np.max(a1))[0][0]

1

In [46]:
a1 = [1, 2,1,3]
a2 = [2, 1, 0,3]
a3 = [2, 1,1,0]
ss = [a1, a2, a3]

In [47]:
def softmax_2_class(data):
    classes = []
    for index in range(len(data)):
        max_index = np.where(data[index] == np.max(data[index]))[0][0]
        classes.append(max_index)
    return classes

In [48]:
softmax_2_class(ss)

[3, 3, 0]