In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
from dateutil.parser import parse
from sklearn.metrics import classification_report

from imblearn.combine import SMOTETomek, SMOTEENN

from feature_engineering.nan_stastics import nan_statics
from feature_engineering.rank_feature_majority import rank_feature_majority_all, rank_feature_majority_train_valid_test
from feature_engineering.segment_raw_data import segment_raw_data
from feature_engineering.rank_feature import rank_feature, rank_feature_by_max, rank_feature_count
from model_selection.classifier_model_factory import ClassifierModelFactory
from model_selection.regressor_model_factory import RegressorModelFactory
from model_selection.multi_classifier_model_factory import MultiClassifierModelFactory
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from model_selection.cv import k_fold_regressor, k_fold_classifier, create_sample_k_fold_regressor
from sampling.sample import sample_by_test_scale, separate_high_median_normal, separate_high_normal
from utils import create_scale_feature, normalize_data_frame, delete_error_data, filtration, create_sample, logloss_to_class, softmax_to_class



In [2]:
train = pd.read_csv('input/d_train_20180102.csv', encoding='gb2312')
test = pd.read_csv('input/d_test_A_20180102.csv', encoding='gb2312')

In [3]:
train_data = train.iloc[:, 1:-1]
train_target = train.iloc[:, -1]
test_data = test.iloc[:, 1:]

In [4]:
train_data['性别'] = train_data['性别'].apply(lambda x:1 if x == '男' else 0)
test_data['性别'] = test_data['性别'].apply(lambda x:1 if x == '男' else 0)

In [5]:
train_data['体检日期'] = (pd.to_datetime(train_data['体检日期']) - parse('2016-10-09')).dt.days
test_data['体检日期'] = (pd.to_datetime(test_data['体检日期']) - parse('2016-10-09')).dt.days

In [6]:
columns = train_data.columns
str_columns = ['sex', 'age', 'date'] + ['f' + str(p) for p in range(len(columns)-3)]

In [7]:
train_data.columns = str_columns
test_data.columns = str_columns
train_target.name = 'Y'
train_target_class = train_target.apply(lambda x: 1 if x > 7 else 0)

In [8]:
train_test = pd.concat([train_data, test_data], axis=0)
train_test, factors = normalize_data_frame(train_test, start_index=2)
train_data = train_test.iloc[:train_data.shape[0]]
test_data = train_test.iloc[train_data.shape[0]:]

In [9]:
train_data.fillna(-99, inplace=True)
test_data.fillna(-99, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [48]:
train_data_target = pd.concat([train_data, train_target], axis=1)

In [10]:
X_train, X_valid, y_train, y_valid = train_test_split(train_data_target, train_target_class, test_size=0.2, random_state=20)

In [11]:
len(X_train)

4513

In [17]:
sm = SMOTETomek()
X_resampled, y_resampled = sm.fit_sample(X_train, y_train.reshape(-1,1))

  
  y = column_or_1d(y, warn=True)


In [18]:
X_train = pd.DataFrame(X_resampled, columns= X_train.columns)
y_train = pd.Series(y_resampled, name='Y')

In [19]:
x_y = pd.concat([X_train, y_train], axis=1)

In [20]:
high, normal = separate_high_normal(x_y)

In [24]:
len(normal)

4148

In [25]:
lgb_y_valid, kf_lgb_mse = \
    k_fold_classifier(X_train, y_train, X_valid, ClassifierModelFactory.MODEL_LIGHET_GBM, cv=5)

开始CV 5折训练...
第0次训练...
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.45623
[200]	valid_0's binary_logloss: 0.348721
[300]	valid_0's binary_logloss: 0.255673
[400]	valid_0's binary_logloss: 0.203835
[500]	valid_0's binary_logloss: 0.171151
[600]	valid_0's binary_logloss: 0.150553
[700]	valid_0's binary_logloss: 0.134498
[800]	valid_0's binary_logloss: 0.123126
[900]	valid_0's binary_logloss: 0.112939
[1000]	valid_0's binary_logloss: 0.105394
[1100]	valid_0's binary_logloss: 0.0992504
[1200]	valid_0's binary_logloss: 0.0934813
[1300]	valid_0's binary_logloss: 0.0885561
[1400]	valid_0's binary_logloss: 0.0845685
[1500]	valid_0's binary_logloss: 0.0812488
[1600]	valid_0's binary_logloss: 0.0784726
[1700]	valid_0's binary_logloss: 0.0763924
[1800]	valid_0's binary_logloss: 0.0749584
[1900]	valid_0's binary_logloss: 0.0736601
[2000]	valid_0's binary_logloss: 0.0725968
[2100]	valid_0's binary_logloss: 0.0715414
[2200]	valid_0's binary_logloss:

In [38]:
y_pred = logloss_to_class(lgb_y_valid, class_level=0.7)

In [39]:
print(classification_report(y_valid, y_pred))

             precision    recall  f1-score   support

          0       0.93      0.99      0.96      1039
          1       0.50      0.14      0.22        90

avg / total       0.90      0.92      0.90      1129



In [40]:
valid = pd.Series(y_valid, name='valid').reset_index(drop=True)
pred = pd.Series(y_pred, name='pred').reset_index(drop=True)

In [41]:
df = pd.DataFrame(valid)

In [42]:
df['pred'] = pred

In [47]:
indexs = df[(df['valid']==0).values & (df['pred']==1).values].index