In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import math

from feature_engineering.segment_raw_data import segment_raw_data
from feature_engineering.rank_feature import rank_feature, rank_feature_by_max, rank_feature_count
from model_selection.classifier_model_factory import ClassifierModelFactory
from model_selection.regressor_model_factory import RegressorModelFactory
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from model_selection.cv import k_fold_regressor, balance_k_fold_regressor
from utils import create_scale_feature
from utils import normalize_data_frame
from utils import normalize_feature
from utils import get_euclidean_metric
from utils import get_cosine_angle
from utils import combine_all
from dateutil.parser import parse
from sampling.sample import separate_high_median_normal
# from sampling.sample import sample_by_test_scale



In [2]:
train = pd.read_csv('input/d_train_20180102.csv', encoding='gb2312')
test = pd.read_csv('input/d_test_A_20180102.csv', encoding='gb2312')

In [3]:
train['体检日期'] = (pd.to_datetime(train['体检日期']) - parse('2016-10-09')).dt.days

In [4]:
test['体检日期'] = (pd.to_datetime(test['体检日期']) - parse('2016-10-09')).dt.days

In [5]:
train_data = train.iloc[:, 1:-1]
train_target = train.iloc[:, -1]
# train_target_class = train_target.apply()
test_data = test.iloc[:, 1:]

In [6]:
train_data['性别'] = train_data['性别'].map({'男': 1, '女': 0})
test_data['性别'] = test_data['性别'].map({'男': 1, '女': 0})

In [7]:
train_data.columns

Index(['性别', '年龄', '体检日期', '*天门冬氨酸氨基转换酶', '*丙氨酸氨基转换酶', '*碱性磷酸酶', '*r-谷氨酰基转换酶',
       '*总蛋白', '白蛋白', '*球蛋白', '白球比例', '甘油三酯', '总胆固醇', '高密度脂蛋白胆固醇', '低密度脂蛋白胆固醇',
       '尿素', '肌酐', '尿酸', '乙肝表面抗原', '乙肝表面抗体', '乙肝e抗原', '乙肝e抗体', '乙肝核心抗体',
       '白细胞计数', '红细胞计数', '血红蛋白', '红细胞压积', '红细胞平均体积', '红细胞平均血红蛋白量',
       '红细胞平均血红蛋白浓度', '红细胞体积分布宽度', '血小板计数', '血小板平均体积', '血小板体积分布宽度', '血小板比积',
       '中性粒细胞%', '淋巴细胞%', '单核细胞%', '嗜酸细胞%', '嗜碱细胞%'],
      dtype='object')

In [8]:
important_feature = ['*天门冬氨酸氨基转换酶', '*丙氨酸氨基转换酶', '*碱性磷酸酶', '*r-谷氨酰基转换酶',
       '*总蛋白', '白蛋白',   '甘油三酯', '总胆固醇', '高密度脂蛋白胆固醇', '低密度脂蛋白胆固醇',
       '尿素', '肌酐', '尿酸', '白细胞计数',  '红细胞平均体积', '红细胞体积分布宽度']

In [9]:
important_feature_10 = ['*天门冬氨酸氨基转换酶', '*丙氨酸氨基转换酶', '*碱性磷酸酶', '*r-谷氨酰基转换酶',
       '*总蛋白', '白蛋白',  '甘油三酯', '尿素', '尿酸', '白细胞计数',  '红细胞平均体积', '红细胞体积分布宽度']

In [10]:
# train_data.fillna(train_data.median(axis=0), inplace=True)
# test_data.fillna(test_data.median(axis=0), inplace=True)

In [11]:
columns = train_data.columns
str_columns = ['sex', 'age', 'date'] + ['f' + str(p) for p in range(len(columns)-3)]

In [12]:
train_data.columns = str_columns
test_data.columns = str_columns
train_target.name = 'Y'

In [13]:
train_data_target = pd.concat([train_data, train_target], axis=1)

In [41]:
test_date = test_data['date'].unique()

In [43]:
test_date

array([366, 369, 372, 381, 379, 373, 397, 427, 375, 376, 380, 387, 386,
       382, 383, 374])

ValueError: Lengths must match to compare

In [15]:
test_data.head()

Unnamed: 0,sex,age,date,f0,f1,f2,f3,f4,f5,f6,...,f27,f28,f29,f30,f31,f32,f33,f34,f35,f36
0,1,54,366,23.85,26.69,116.08,34.36,82.75,46.03,36.72,...,12.3,241.0,10.8,12.8,0.26,58.4,33.2,7.5,0.6,0.3
1,1,50,366,29.75,34.98,90.07,111.43,71.9,44.09,27.81,...,12.0,242.0,11.5,14.6,0.28,59.3,29.3,7.7,3.2,0.5
2,1,27,366,,,,,,,,...,12.1,398.0,8.9,9.9,0.35,50.2,40.1,7.9,1.2,0.6
3,0,53,366,17.98,16.63,95.95,23.41,78.16,45.44,32.72,...,12.6,247.0,11.6,13.9,0.29,53.7,38.0,7.2,0.7,0.4
4,0,43,366,19.12,19.8,76.97,15.7,80.76,46.9,33.86,...,12.2,335.0,10.4,11.9,0.35,52.0,39.4,8.0,0.3,0.3


In [38]:
def sample_by_test_scale(train, test):
    train_date = train['date'].unique()
    test_date = test['date'].unique()
    valid_set = []
    for date in test_date:
        test_date_item = test[test['date'] == date]
        train_date_item = train[train['date'] == date]
        scale = test_date_item.shape[0]/train_date_item.shape[0]
        print('date:' + str(date) + '  scale:' + str(scale))
        X_train, X_valid, y_train, y_valid = \
            train_test_split(train_date_item.iloc[:, :-1], train_date_item.iloc[:, -1], test_size=scale, random_state=33)
        valid_item = pd.concat([X_valid, y_valid], axis=1)
        valid_set.append(valid_item)
    valid = pd.concat(valid_set, axis=0)
    print(valid.shape)
    return valid

In [39]:
sample_by_test_scale(train_data_target, test_data)

date:366  scale:0.21176470588235294
date:369  scale:0.7704918032786885
date:372  scale:0.652542372881356
date:381  scale:0.2571428571428571
date:379  scale:0.21367521367521367
date:373  scale:0.580952380952381
date:397  scale:0.6352201257861635
date:427  scale:0.5527950310559007
date:375  scale:0.6416184971098265
date:376  scale:0.2857142857142857
date:380  scale:0.1595744680851064
date:387  scale:0.0660377358490566
date:386  scale:0.11570247933884298
date:382  scale:0.18128654970760233
date:383  scale:0.09202453987730061
date:374  scale:0.9380530973451328
(1001, 41)


Unnamed: 0,sex,age,date,f0,f1,f2,f3,f4,f5,f6,...,f28,f29,f30,f31,f32,f33,f34,f35,f36,Y
5598,1.0,49,366,27.42,42.94,68.18,20.29,73.40,43.89,29.51,...,278.0,10.4,12.4,0.290,58.8,33.8,5.7,1.2,0.5,5.05
5604,1.0,45,366,,,,,,,,...,203.0,11.8,15.7,0.240,57.3,34.3,6.0,1.5,0.9,5.20
767,1.0,27,366,,,,,,,,...,249.0,10.2,12.5,0.250,54.7,35.8,7.8,1.1,0.6,5.39
5615,1.0,38,366,40.69,58.04,89.89,41.94,76.02,46.37,29.65,...,293.0,11.3,15.2,0.330,50.4,37.1,8.9,2.5,1.1,5.44
5639,0.0,80,366,22.95,21.51,106.37,32.88,75.29,40.43,34.86,...,323.0,10.4,12.6,0.340,61.1,30.1,6.0,2.2,0.6,5.24
3174,1.0,55,366,29.93,23.89,62.09,30.80,74.57,45.47,29.10,...,175.0,11.4,14.3,0.200,49.7,37.4,10.0,2.1,0.8,5.43
5630,0.0,42,366,18.59,19.43,68.40,16.63,70.17,42.34,27.83,...,240.0,9.4,9.4,0.220,73.0,20.3,5.7,0.9,0.1,5.43
5596,0.0,32,366,17.00,16.01,71.24,16.44,76.32,47.07,29.25,...,287.0,11.1,13.5,0.320,58.0,34.5,5.7,1.1,0.7,5.08
5628,1.0,40,366,45.84,38.42,80.96,123.39,84.03,49.40,34.63,...,162.0,11.7,16.0,0.190,57.1,28.5,6.6,7.2,0.6,5.52
5627,0.0,50,366,20.36,25.00,118.56,38.07,79.70,48.24,31.46,...,289.0,11.4,14.5,0.330,59.3,29.1,9.0,2.2,0.4,17.43


In [49]:
l1 = [1,2,3]
l2 = [1,2,3]
l3 = [1,2,3]
l4 = [1,2,3]

In [50]:
ls = [l1, l2, l3, l4]

In [52]:
sum(l1)

6

In [41]:
train_data['f5'] = train_data['f5'].apply(lambda x: x if (x > 35) | np.isnan(x) else 35)

In [40]:
ss.head()

0    49.60
1    47.76
2    48.00
3    44.02
4    41.83
Name: f5, dtype: float64

In [19]:
train_data['f17']

0        NaN
1        NaN
2       0.01
3        NaN
4        NaN
5       0.01
6        NaN
7        NaN
8       0.03
9       0.02
10       NaN
11       NaN
12       NaN
13       NaN
14       NaN
15       NaN
16       NaN
17       NaN
18       NaN
19       NaN
20       NaN
21       NaN
22       NaN
23       NaN
24       NaN
25       NaN
26       NaN
27       NaN
28       NaN
29       NaN
        ... 
5612    0.03
5613    0.01
5614     NaN
5615     NaN
5616    0.04
5617     NaN
5618     NaN
5619     NaN
5620     NaN
5621     NaN
5622     NaN
5623     NaN
5624     NaN
5625     NaN
5626    0.03
5627     NaN
5628     NaN
5629     NaN
5630     NaN
5631     NaN
5632     NaN
5633    0.04
5634    0.03
5635     NaN
5636     NaN
5637     NaN
5638     NaN
5639    0.01
5640     NaN
5641     NaN
Name: f17, Length: 5642, dtype: float64

In [12]:
# train_data = train_data.drop(['f15', 'f16', 'f17', 'f18', 'f19'], axis=1)

In [13]:
train_data, factors = normalize_data_frame(train_data, start_index=2)

In [14]:
train_data.head()

Unnamed: 0,性别,年龄,*天门冬氨酸氨基转换酶,*丙氨酸氨基转换酶,*碱性磷酸酶,*r-谷氨酰基转换酶,*总蛋白,白蛋白,*球蛋白,白球比例,...,红细胞体积分布宽度,血小板计数,血小板平均体积,血小板体积分布宽度,血小板比积,中性粒细胞%,淋巴细胞%,单核细胞%,嗜酸细胞%,嗜碱细胞%
0,1.0,41,3.511332,4.607334,21.805089,1.898362,45.393363,81.744091,34.201624,19.69697,...,14.728682,18.220339,34.567901,54.33526,18.263473,53.576248,38.80814,16.915423,20.888889,17.142857
1,1.0,41,3.419548,7.24382,12.588945,9.942105,51.311209,74.246129,41.627199,15.0,...,19.379845,33.898305,25.925926,13.294798,32.634731,50.74224,42.44186,13.432836,20.888889,22.857143
2,1.0,46,2.537008,3.029452,11.587067,4.353777,67.092133,75.224124,52.723275,11.212121,...,16.27907,28.813559,14.814815,49.710983,23.502994,45.479082,47.674419,22.885572,14.222222,22.857143
3,0.0,22,1.164953,2.099164,14.544316,1.896993,31.701091,59.005705,33.660352,16.818182,...,13.178295,30.367232,39.506173,16.184971,32.634731,36.842105,56.686047,17.910448,20.444444,14.285714
4,0.0,48,2.3605,2.939231,15.031024,2.239163,48.10861,50.0815,49.32341,9.545455,...,44.186047,39.40678,49.382716,34.682081,46.107784,56.950067,37.209302,29.850746,2.666667,17.142857


In [15]:
train_data.fillna(-99, inplace=True)

In [16]:
# group_array = []
# index = len(important_feature)-1
# ct = 0
# result = [item for item in range(index)]
# combine_all(important_feature,ct,result,index, group_array)

In [17]:
train_data_target = pd.concat([train_data, train_target], axis=1)

In [18]:
train_data_target_1_25_45_high = train_data_target[(train_data_target.Y > 11).values & (train_data_target['性别'] == 1).values & (train_data_target['年龄'] >= 25).values & (train_data_target['年龄'] <= 45).values]

In [19]:
train_data_target_1_25_45 = train_data_target[(train_data_target['性别'] == 1).values & (train_data_target['年龄'] >= 25).values & (train_data_target['年龄'] <= 45).values]

In [20]:
train_data_target_1_25_45_high = train_data_target_1_25_45_high.loc[:, list(important_feature_10) + list('Y')]

In [21]:
train_data_target_1_25_45 = train_data_target_1_25_45.loc[:, list(important_feature_10) + list('Y')]

In [32]:
train_data_target_1_25_45_high

Unnamed: 0,*天门冬氨酸氨基转换酶,*丙氨酸氨基转换酶,*碱性磷酸酶,*r-谷氨酰基转换酶,*总蛋白,白蛋白,甘油三酯,尿素,尿酸,白细胞计数,红细胞平均体积,红细胞体积分布宽度,Y
127,1.195547,2.889107,18.275744,3.446341,53.91042,79.91035,4.09201,-99.0,-99.0,33.023001,41.851852,9.302326,16.3
282,1.701537,3.488582,20.979678,3.238301,47.412393,84.229829,5.35109,29.268293,28.851524,16.100767,47.777778,10.852713,13.19
332,2.553482,4.725625,9.663004,5.24342,59.572987,77.587612,14.939467,29.268293,32.605014,42.223439,54.074074,10.852713,14.13
443,1.880398,3.420414,30.611374,3.501088,36.853098,61.328443,2.058111,-99.0,-99.0,13.307777,45.0,8.527132,13.98
611,2.520534,4.625378,21.062219,3.807673,38.291947,59.290954,7.506053,28.931876,29.574757,20.372399,50.925926,6.976744,11.46
736,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,13.624895,35.732922,12.102957,57.592593,15.503876,11.81
941,12.579134,58.110953,36.642568,15.442837,27.222093,64.262429,6.8523,50.294365,30.017242,31.708653,57.222222,3.875969,14.87
1216,19.907745,30.094031,21.00814,14.331467,30.540729,63.080685,15.27845,11.438183,43.593891,28.696605,43.518519,14.728682,18.51
1534,8.227625,12.949857,17.475949,13.142083,54.304943,78.117359,5.98063,34.146341,28.752346,40.525739,54.62963,7.751938,12.01
1536,2.546422,7.358101,18.736836,7.883607,41.053609,73.431133,4.72155,36.164844,55.338043,38.006572,47.962963,9.302326,11.09


In [23]:
train_data_target_1_25_45.head()

Unnamed: 0,*天门冬氨酸氨基转换酶,*丙氨酸氨基转换酶,*碱性磷酸酶,*r-谷氨酰基转换酶,*总蛋白,白蛋白,甘油三酯,尿素,尿酸,白细胞计数,红细胞平均体积,红细胞体积分布宽度,Y
0,3.511332,4.607334,21.805089,1.898362,45.393363,81.744091,2.51816,36.753574,35.203467,13.910186,60.925926,14.728682,6.06
1,3.419548,7.24382,12.588945,9.942105,51.311209,74.246129,6.150121,31.623213,56.166557,26.560789,52.777778,19.379845,5.39
6,3.513685,5.094533,24.491945,8.083435,54.537016,68.704156,3.002421,39.02439,49.959566,21.96057,45.555556,8.527132,5.11
8,2.744111,7.82926,22.750043,8.312005,43.629612,80.562347,4.358354,29.268293,35.801584,22.782037,71.481481,6.976744,5.66
17,3.452496,6.042865,20.697899,4.326403,56.973776,58.312958,8.716707,44.32296,55.920902,29.189485,4.259259,59.689922,4.66


In [52]:
def get_cosine(vec1, vec2):
    np_vec1, np_vec2 = np.array(vec1), np.array(vec2)
    return np_vec1.dot(np_vec2)/(math.sqrt((np_vec1 ** 2).sum()) * math.sqrt((np_vec2 ** 2).sum()))

In [87]:
def get_cosine_angle(vec1, vec2):
    cosine = get_cosine(vec1, vec2)
    if cosine > 1:
        cosine = 1
    elif cosine < -1:
        cosine = -1
    return math.acos(cosine) / math.pi * 180

In [84]:
np.math.acos(1)/ math.pi * 180

0.0

In [92]:
n = 736

In [93]:
dists = []
for index in train_data_target_1_25_45.index:
    dist = get_euclidean_metric(train_data_target_1_25_45.ix[index, :-1].values, train_data_target_1_25_45_high.ix[n, :-1].values)
    dists.append(dist)

In [94]:
angles = []
for index in train_data_target_1_25_45.index:
    angle = get_cosine_angle(train_data_target_1_25_45.ix[index, :-1].values, train_data_target_1_25_45.ix[n, :-1].values)
    angles.append(angle)

In [95]:
dists = pd.Series(dists,name='dist', index = train_data_target_1_25_45.index)
angels = pd.Series(angles,name='angle', index = train_data_target_1_25_45.index)

In [96]:
dists.sort_values()

736       0.000000
707      11.463078
706      11.711103
762      11.798867
592      12.383307
715      14.378653
754      14.695588
710      16.165645
694      16.303174
674      16.378212
708      17.148358
726      17.633424
745      18.159742
741      19.176592
676      19.766011
709      20.090669
701      20.443730
683      20.933946
677      21.320556
735      21.344109
744      21.426989
760      21.691094
771      21.874090
742      21.908970
691      21.924420
761      22.172475
702      22.586462
718      22.718637
690      23.216952
759      23.484070
           ...    
466     370.301958
133     371.692521
1581    372.223683
1575    372.471162
434     372.741336
442     372.897543
5432    373.580350
2186    373.615208
436     373.667098
1582    373.817024
125     373.863872
128     373.978545
429     374.074204
1568    374.397865
1579    374.495147
438     374.971082
428     376.330700
439     377.133269
127     377.159209
1577    377.436143
1570    377.514657
1583    378.

In [97]:
angels.sort_values()

736       0.000000
707       2.418539
706       2.468459
762       2.488806
592       2.600366
715       3.034399
754       3.085836
710       3.378666
674       3.397281
694       3.440264
708       3.595235
726       3.672516
745       3.798809
741       4.035961
676       4.169961
709       4.202171
701       4.290637
683       4.301852
744       4.466920
677       4.501817
735       4.506937
760       4.579122
691       4.592777
761       4.596523
771       4.598879
742       4.626653
702       4.744943
718       4.757378
759       4.872552
690       4.893616
           ...    
2323    115.605872
5501    115.630468
5432    115.650074
1233    115.711327
251     115.823543
2654    115.941287
937     116.049045
4405    116.107289
5437    116.327345
2386    116.576293
1190    116.652740
4142    116.746456
941     116.929137
3689    117.026191
2033    117.074772
4816    117.103711
5531    117.479389
274     117.689498
4106    117.918164
2186    118.158877
2172    118.301121
435     118.

In [98]:
train_data_target_1_25_45.ix[angels[angels < 7].index].Y

558     5.04
592     5.83
674     5.44
675     5.08
676     6.47
677     5.58
679     5.52
680     5.80
681     4.84
682     5.43
683     5.47
684     5.04
686     5.65
687     5.90
688     5.70
690     5.63
691     5.73
694     5.26
695     5.28
696     5.03
697     5.01
700     4.77
701     5.44
702     4.84
704     5.71
706     5.04
707     5.17
708     5.59
709     5.19
710     4.80
       ...  
730     5.17
734     4.90
735     5.35
736    11.81
739     5.29
740     5.31
741     6.01
742     5.64
743     5.20
744     4.99
745     4.97
750     4.92
751     5.24
753     7.68
754     4.78
756     4.66
757     8.25
758     4.74
759     4.89
760     5.10
761     4.95
762     5.62
763     5.62
764     5.60
765     4.86
766     4.74
767     5.39
769     5.50
771     5.41
772     6.47
Name: Y, Length: 69, dtype: float64