In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
from dateutil.parser import parse
from sklearn.metrics import classification_report

from imblearn.combine import SMOTETomek, SMOTEENN

from feature_engineering.nan_stastics import nan_statics
from feature_engineering.rank_feature_majority import rank_feature_majority_all, rank_feature_majority_train_valid_test
from feature_engineering.segment_raw_data import segment_raw_data
from feature_engineering.rank_feature import rank_feature, rank_feature_by_max, rank_feature_count
from model_selection.classifier_model_factory import ClassifierModelFactory
from model_selection.regressor_model_factory import RegressorModelFactory
from model_selection.multi_classifier_model_factory import MultiClassifierModelFactory
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from model_selection.cv import k_fold_regressor, k_fold_classifier, create_sample_k_fold_regressor
from sampling.sample import sample_by_test_scale, separate_high_median_normal, separate_high_normal
from utils import create_scale_feature, normalize_data_frame, delete_error_data, filtration, create_sample, logloss_to_class, softmax_to_class

  from ._conv import register_converters as _register_converters


In [2]:
# Load the raw train and test tables; both CSV files are read as GB2312 text.
train, test = (
    pd.read_csv(csv_path, encoding='gb2312')
    for csv_path in ('input/d_train_20180102.csv', 'input/d_test_A_20180102.csv')
)

In [3]:
# Split the raw tables by column position: the first column is dropped from
# both tables, and the last training column is kept separately as the target.
# .copy() makes the feature frames independent of `train` / `test`, so the
# column assignments in later cells write to real frames instead of slices
# (which is what triggers pandas' SettingWithCopyWarning).
train_data = train.iloc[:, 1:-1].copy()
train_target = train.iloc[:, -1]
test_data = test.iloc[:, 1:].copy()

In [4]:
# Encode the sex column as a binary flag: 1 where the value equals '男',
# 0 for every other value (including missing ones).
for frame in (train_data, test_data):
    frame['性别'] = (frame['性别'] == '男').astype('int64')

In [5]:
# Replace the examination date by its signed distance, in whole days, from a
# fixed reference date.
reference_day = parse('2016-10-09')
for frame in (train_data, test_data):
    exam_dates = pd.to_datetime(frame['体检日期'])
    frame['体检日期'] = (exam_dates - reference_day).dt.days

In [6]:
# ASCII replacements for the original headers: the first three columns get
# fixed names, every remaining column becomes f0, f1, ...
columns = train_data.columns
generic_names = ['f{}'.format(p) for p in range(len(columns) - 3)]
str_columns = ['sex', 'age', 'date'] + generic_names

In [7]:
# Apply the ASCII column names to both feature frames.
for frame in (train_data, test_data):
    frame.columns = str_columns

# Keep the continuous target as 'Y' and derive a binary label from it:
# 1 when the target is strictly greater than 7, otherwise 0.
train_target.name = 'Y'
train_target_class = (train_target > 7).astype('int64').rename('class')

In [8]:
def create_sum_feature(data):
    """Return a copy of ``data`` extended with pairwise column-sum features.

    With ``n`` the number of columns in ``data``: for every column position
    ``index`` in ``range(3, n - 3)`` and every later position ``j`` in
    ``range(index + 1, n)``, a column named
    ``'sum_' + columns[j] + '_' + columns[index]`` holding
    ``data.iloc[:, j] + data.iloc[:, index]`` is appended on the right.

    The input frame is not modified: the original version aliased it
    (``new_data = data``), so every call silently added the sum columns to the
    caller's frame as well.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame; column labels must be strings because they are
        concatenated into the new column names.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by the sum columns.

    Raises
    ------
    ValueError
        Propagated from ``DataFrame.insert`` if a generated name already
        exists as a column.
    """
    # Work on a copy so the caller's frame keeps its original columns.
    new_data = data.copy()
    columns = data.columns
    # NOTE(review): the outer bound `len(columns) - 3` means the last three
    # columns never act as the first member of a pair -- confirm this is intended.
    for index in range(3, len(columns) - 3):
        for j in range(index + 1, len(columns)):
            new_data.insert(
                new_data.shape[1],
                'sum_' + columns[j] + '_' + columns[index],
                data.iloc[:, j] + data.iloc[:, index],
            )
    return new_data

In [9]:
# Put features, the continuous target 'Y' and the binary 'class' label side by
# side so that rows stay together through the split below; the two targets end
# up as the last two columns.
train_data_target = pd.concat(
    [train_data, train_target, train_target_class],
    axis=1,
)

In [10]:
# Hold out 20% of the rows for validation (fixed seed for repeatability).
# NOTE(review): no `stratify=` is passed, so the share of positive labels in
# the hold-out set is left to chance.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_data_target,
    train_target_class,
    test_size=0.2,
    random_state=20,
)

In [11]:
# Validation features are everything except the two trailing target columns;
# the validation label is the last column (the binary 'class').
X_valid_data, y_valid_data = X_valid.iloc[:, :-2], X_valid.iloc[:, -1]

In [11]:
# Split the training partition into two frames with the project's
# separate_high_normal helper.
# NOTE(review): presumably `high` holds the rows of the positive
# (high-target) group and `normal` the rest -- confirm against sampling.sample.
high, normal = separate_high_normal(X_train)

In [12]:
# High group: features are all but the last two columns, label is the last
# column (the binary 'class').
X_high_train_data, y_high_train_data = high.iloc[:, :-2], high.iloc[:, -1]

In [13]:
# Normal group: features are all but the last two columns, label is the last
# column (the binary 'class').
X_normal_train_data, y_normal_train_data = normal.iloc[:, :-2], normal.iloc[:, -1]

In [14]:
# Oversample the high group: keep the original frame and append ten frames
# produced by create_sample from it.
# NOTE(review): create_sample is a project helper; if it is stochastic, no seed
# is set here -- confirm reproducibility.
X_train_datas = [X_high_train_data]
X_train_datas.extend(create_sample(X_high_train_data) for _ in range(10))

In [15]:
# One copy of the high-group labels per feature frame in X_train_datas.
# The count is derived from the feature list instead of a hard-coded 11, so
# the two lists cannot drift apart if the number of samples is changed.
# NOTE(review): this assumes every create_sample frame has the same row count
# and order as X_high_train_data -- confirm against utils.create_sample.
y_train_datas = [y_high_train_data] * len(X_train_datas)

In [16]:
# Stack the high-group parts (original plus samples) on top of the normal
# group, for features and labels alike, and renumber the rows from zero.
X_train_data, y_train_data = (
    pd.concat(high_parts + [normal_part], axis=0).reset_index(drop=True)
    for high_parts, normal_part in (
        (X_train_datas, X_normal_train_data),
        (y_train_datas, y_normal_train_data),
    )
)

In [17]:
# Normalise train, validation and test in one pass, then cut the combined
# frame back into its three parts by row position.
# NOTE(review): normalize_data_frame is a project helper; start_index=2
# presumably skips the leading columns -- confirm against utils.
n_train_rows = X_train_data.shape[0]
n_valid_rows = X_valid_data.shape[0]
valid_end = n_train_rows + n_valid_rows

train_valid_test = pd.concat([X_train_data, X_valid_data, test_data], axis=0)
train_valid_test, factors = normalize_data_frame(train_valid_test, start_index=2)

# .copy() turns each part into an independent frame rather than a slice of
# train_valid_test, so later in-place operations on them (e.g. fillna) do not
# raise SettingWithCopyWarning.
X_train_data = train_valid_test.iloc[:n_train_rows].copy()
X_valid_data = train_valid_test.iloc[n_train_rows:valid_end].copy()
test_data = train_valid_test.iloc[valid_end:].copy()

In [18]:
# Replace missing values with the sentinel -99 in all three feature frames.
# The assignment form is used instead of `inplace=True`: the frames may be
# slices of a larger frame, and an in-place fill on a slice emits pandas'
# "A value is trying to be set on a copy of a slice" warning.
X_train_data = X_train_data.fillna(-99)
X_valid_data = X_valid_data.fillna(-99)
test_data = test_data.fillna(-99)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [19]:
# 5-fold cross-validated LightGBM classifier; also returns predictions for the
# hold-out validation features.
# NOTE(review): X_train_data already contains the create_sample copies of the
# high group, so if k_fold_classifier splits these rows at random, a fold's
# validation part can contain samples derived from its own training rows and
# the per-fold scores may be optimistic -- confirm against model_selection.cv.
lgb_y_valid, kf_lgb_mse = k_fold_classifier(
    X_train_data,
    y_train_data,
    X_valid_data,
    ClassifierModelFactory.MODEL_LIGHET_GBM,
    cv=5,
)

开始CV 5折训练...
第0次训练...
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.42062
[200]	valid_0's binary_logloss: 0.272338
[300]	valid_0's binary_logloss: 0.195273
[400]	valid_0's binary_logloss: 0.150385
[500]	valid_0's binary_logloss: 0.12416
[600]	valid_0's binary_logloss: 0.107119
[700]	valid_0's binary_logloss: 0.0959359
[800]	valid_0's binary_logloss: 0.0885029
[900]	valid_0's binary_logloss: 0.0825481
[1000]	valid_0's binary_logloss: 0.0779164
[1100]	valid_0's binary_logloss: 0.0741258
[1200]	valid_0's binary_logloss: 0.0714769
[1300]	valid_0's binary_logloss: 0.0695329
[1400]	valid_0's binary_logloss: 0.0679216
[1500]	valid_0's binary_logloss: 0.0669975
[1600]	valid_0's binary_logloss: 0.0660671
[1700]	valid_0's binary_logloss: 0.0655338
[1800]	valid_0's binary_logloss: 0.065476
[1900]	valid_0's binary_logloss: 0.0653766
[2000]	valid_0's binary_logloss: 0.0649009
[2100]	valid_0's binary_logloss: 0.0648768
[2200]	valid_0's binary_loglos

In [20]:
# Convert the classifier's validation outputs into class labels with the
# project's logloss_to_class helper.
# NOTE(review): class_level=0.5 is presumably the probability cut-off for the
# positive class -- confirm against utils.logloss_to_class.
y_pred = logloss_to_class(lgb_y_valid, class_level=0.5)

In [21]:
# Per-class precision / recall / F1 of the hold-out predictions.
validation_report = classification_report(y_valid_data, y_pred)
print(validation_report)

             precision    recall  f1-score   support

          0       0.93      0.99      0.96      1039
          1       0.46      0.12      0.19        90

avg / total       0.89      0.92      0.90      1129



In [22]:
# Wrap true labels and predictions as named Series with a fresh 0..n-1 index
# so they can be compared position by position.
valid, pred = (
    pd.Series(values, name=label).reset_index(drop=True)
    for values, label in ((y_valid, 'valid'), (y_pred, 'pred'))
)

In [23]:
# Start the comparison table from the true labels (single column 'valid').
df = valid.to_frame()

In [24]:
# Add the predicted labels next to the true ones.
df = df.assign(pred=pred)

In [30]:
# Number of rows where both the true label and the prediction are 1.
both_positive = (df['valid'] == 1).values & (df['pred'] == 1).values
len(df[both_positive])

11

In [26]:
# Row positions of the positives the model predicted as 0.
missed_positive = (df['valid'] == 1).values & (df['pred'] == 0).values
indexs = df.index[missed_positive]

In [27]:
# Continuous target values ('Y', the second-to-last column of X_valid) for the
# missed positives, looked up on a 0..n-1 index that matches `indexs`.
valid_target_values = X_valid.iloc[:, -2].reset_index(drop=True)
valid_target_values[indexs]

0        9.19
8        9.85
22      13.59
45       8.76
55       7.02
76       9.38
85      10.55
89       7.61
122      8.45
125     13.13
131      8.66
167      7.02
179      9.11
184      9.43
187      7.22
227      8.41
231      9.63
238      7.91
240      8.56
251      7.54
291      7.18
310      7.24
374     15.62
380      7.03
387      7.36
392      8.31
398     11.91
412      7.54
426      7.14
428      8.75
        ...  
722      8.74
723      7.10
749      7.60
751      9.37
765     10.88
774     17.41
795      9.52
818      8.83
825      8.22
843     11.05
845      7.09
851      8.06
908      7.67
918     10.02
945     12.90
948      7.28
966      7.05
976      8.28
992      7.19
997     11.82
1004    11.09
1017    10.21
1056     9.24
1063    10.55
1075    10.01
1080     7.89
1097     7.06
1098     7.01
1116    13.81
1127    13.98
Name: Y, Length: 79, dtype: float64