In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
from dateutil.parser import parse
from sklearn.metrics import classification_report

from imblearn.combine import SMOTETomek, SMOTEENN

from feature_engineering.nan_stastics import nan_statics
from feature_engineering.rank_feature_majority import rank_feature_majority_all, rank_feature_majority_train_valid_test
from feature_engineering.segment_raw_data import segment_raw_data
from feature_engineering.rank_feature import rank_feature, rank_feature_by_max, rank_feature_count
from model_selection.classifier_model_factory import ClassifierModelFactory
from model_selection.regressor_model_factory import RegressorModelFactory
from model_selection.multi_classifier_model_factory import MultiClassifierModelFactory
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from model_selection.cv import k_fold_regressor, k_fold_classifier, create_sample_k_fold_regressor
from sampling.sample import sample_by_test_scale, separate_high_median_normal, separate_high_normal
from utils import create_scale_feature, normalize_data_frame, delete_error_data, filtration, create_sample, logloss_to_class, softmax_to_class

  from ._conv import register_converters as _register_converters


In [2]:
train = pd.read_csv('input/d_train_20180102.csv', encoding='gb2312')
test = pd.read_csv('input/d_test_A_20180102.csv', encoding='gb2312')

In [3]:
train = train[(train['血糖'] >= 7).values | (train['血糖'] <= 5.8).values]

In [4]:
train.shape

(4763, 42)

In [5]:
train_data = train.iloc[:, 1:-1]
train_target = train.iloc[:, -1]
test_data = test.iloc[:, 1:]

In [6]:
train_data['性别'] = train_data['性别'].apply(lambda x:1 if x == '男' else 0)
test_data['性别'] = test_data['性别'].apply(lambda x:1 if x == '男' else 0)

In [7]:
train_data['体检日期'] = (pd.to_datetime(train_data['体检日期']) - parse('2016-10-09')).dt.days
test_data['体检日期'] = (pd.to_datetime(test_data['体检日期']) - parse('2016-10-09')).dt.days

In [8]:
columns = train_data.columns
str_columns = ['sex', 'age', 'date'] + ['f' + str(p) for p in range(len(columns)-3)]

In [9]:
train_data.columns = str_columns
test_data.columns = str_columns
train_target.name = 'Y'
train_target_class = train_target.apply(lambda x: 1 if x >= 7 else 0)

In [10]:
def create_sum_feature(data):
    new_data = data
    columns = data.columns
    for index in range(3, len(columns)-3):
        for j in range(index + 1, len(columns)):
            new_data.insert(new_data.shape[1], 'sum_' + columns[j] + '_' + columns[index], data.iloc[:, j] + data.iloc[:, index])
    return new_data

In [11]:
train_test = pd.concat([train_data, test_data], axis=0)
train_test, factors = normalize_data_frame(train_test, start_index=2)
train_data = train_test.iloc[:train_data.shape[0]]
test_data = train_test.iloc[train_data.shape[0]:]

In [12]:
train_data_create = create_sum_feature(train_data)
test_data_create = create_sum_feature(test_data)

In [13]:
train_data_create.fillna(-99, inplace=True)
test_data_create.fillna(-99, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [14]:
train_data_target = pd.concat([train_data_create, train_target], axis=1)

In [15]:
X_train, X_valid, y_train, y_valid = train_test_split(train_data_target, train_target_class, test_size=0.1, random_state=20)

In [16]:
len(X_train)

4286

In [17]:
sm = SMOTETomek()
X_resampled, y_resampled = sm.fit_sample(X_train, y_train.reshape(-1,1))

  
  y = column_or_1d(y, warn=True)


In [18]:
X_train = pd.DataFrame(X_resampled, columns= X_train.columns)
y_train = pd.Series(y_resampled, name='Y')

In [19]:
x_y = pd.concat([X_train, y_train], axis=1)

In [20]:
high, normal = separate_high_normal(x_y)

In [21]:
X_train_data = X_train.iloc[:, :-1]
X_valid_data = X_valid.iloc[:, :-1]

In [22]:
lgb_y_valid, kf_lgb_mse = \
    k_fold_classifier(X_train_data, y_train, X_valid_data, ClassifierModelFactory.MODEL_LIGHET_GBM, cv=5)

开始CV5折训练...
第0次训练...
Training until validation scores don't improve for 300 rounds.
[300]	valid_0's binary_logloss: 0.265893
[600]	valid_0's binary_logloss: 0.165439
[900]	valid_0's binary_logloss: 0.127267
[1200]	valid_0's binary_logloss: 0.109913
[1500]	valid_0's binary_logloss: 0.103475
[1800]	valid_0's binary_logloss: 0.102806
[2100]	valid_0's binary_logloss: 0.102533
Early stopping, best iteration is:
[1879]	valid_0's binary_logloss: 0.102416
第1次训练...
Training until validation scores don't improve for 300 rounds.
[300]	valid_0's binary_logloss: 0.25169
[600]	valid_0's binary_logloss: 0.151938
[900]	valid_0's binary_logloss: 0.112359
[1200]	valid_0's binary_logloss: 0.0934022
[1500]	valid_0's binary_logloss: 0.0851355
[1800]	valid_0's binary_logloss: 0.081891
[2100]	valid_0's binary_logloss: 0.0815079
[2400]	valid_0's binary_logloss: 0.0812476
[2700]	valid_0's binary_logloss: 0.0809976
[3000]	valid_0's binary_logloss: 0.0807586
[3300]	valid_0's binary_logloss: 0.0804118
[3600]	vali

In [31]:
y_pred = logloss_to_class(lgb_y_valid, class_level=0.65)

In [32]:
print(classification_report(y_valid, y_pred))

             precision    recall  f1-score   support

          0       0.93      0.98      0.95       431
          1       0.57      0.28      0.38        46

avg / total       0.89      0.91      0.90       477



In [25]:
valid = pd.Series(y_valid, name='valid').reset_index(drop=True)
pred = pd.Series(y_pred, name='pred').reset_index(drop=True)

In [26]:
df = pd.DataFrame(valid)

In [27]:
df['pred'] = pred

In [28]:
indexs = df[(df['valid']==0).values & (df['pred']==1).values].index