In [17]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

from feature_engineering.segment_raw_data import segment_raw_data
from feature_engineering.rank_feature import rank_feature, rank_feature_by_max, rank_feature_count
from model_selection.classifier_model_factory import ClassifierModelFactory
from model_selection.regressor_model_factory import RegressorModelFactory
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from model_selection.cv import k_fold_regressor, balance_k_fold_regressor
from utils import create_scale_feature
from utils import normalize_data_frame
from utils import normalize_feature
from utils import get_euclidean_metric
from utils import get_cosine_angle
from sampling.sample import separate_high_median_normal

In [2]:
train = pd.read_csv('input/d_train_20180102.csv', encoding='gb2312')
test = pd.read_csv('input/d_test_A_20180102.csv', encoding='gb2312')

In [3]:
train_data = train.iloc[:, 1:-1]
train_target = train.iloc[:, -1]
# train_target_class = train_target.apply()
test_data = test.iloc[:, 1:]

In [4]:
train_data['性别'] = train_data['性别'].map({'男': 1, '女': 0})
test_data['性别'] = test_data['性别'].map({'男': 1, '女': 0})

In [5]:
train_data.columns

Index(['性别', '年龄', '体检日期', '*天门冬氨酸氨基转换酶', '*丙氨酸氨基转换酶', '*碱性磷酸酶', '*r-谷氨酰基转换酶',
       '*总蛋白', '白蛋白', '*球蛋白', '白球比例', '甘油三酯', '总胆固醇', '高密度脂蛋白胆固醇', '低密度脂蛋白胆固醇',
       '尿素', '肌酐', '尿酸', '乙肝表面抗原', '乙肝表面抗体', '乙肝e抗原', '乙肝e抗体', '乙肝核心抗体',
       '白细胞计数', '红细胞计数', '血红蛋白', '红细胞压积', '红细胞平均体积', '红细胞平均血红蛋白量',
       '红细胞平均血红蛋白浓度', '红细胞体积分布宽度', '血小板计数', '血小板平均体积', '血小板体积分布宽度', '血小板比积',
       '中性粒细胞%', '淋巴细胞%', '单核细胞%', '嗜酸细胞%', '嗜碱细胞%'],
      dtype='object')

In [6]:
train_data = train_data.drop(['体检日期'], axis=1)
test_data = test_data.drop(['体检日期'], axis=1)

In [7]:
# train_data.fillna(train_data.median(axis=0), inplace=True)
# test_data.fillna(test_data.median(axis=0), inplace=True)

In [8]:
columns = train_data.columns
str_columns = ['sex', 'age'] + ['f' + str(p) for p in range(len(columns)-2)]

In [9]:
train_data.columns = str_columns
test_data.columns = str_columns
train_target.name = 'Y'

In [10]:
train_data = train_data.drop(['f15', 'f16', 'f17', 'f18', 'f19'], axis=1)

In [11]:
train_data, factors = normalize_data_frame(train_data, start_index=2)

In [12]:
train_data.head()

Unnamed: 0,sex,age,f0,f1,f2,f3,f4,f5,f6,f7,...,f27,f28,f29,f30,f31,f32,f33,f34,f35,f36
0,1.0,41,3.511332,4.607334,21.805089,1.898362,45.393363,81.744091,34.201624,19.69697,...,14.728682,18.220339,34.567901,54.33526,18.263473,53.576248,38.80814,16.915423,20.888889,17.142857
1,1.0,41,3.419548,7.24382,12.588945,9.942105,51.311209,74.246129,41.627199,15.0,...,19.379845,33.898305,25.925926,13.294798,32.634731,50.74224,42.44186,13.432836,20.888889,22.857143
2,1.0,46,2.537008,3.029452,11.587067,4.353777,67.092133,75.224124,52.723275,11.212121,...,16.27907,28.813559,14.814815,49.710983,23.502994,45.479082,47.674419,22.885572,14.222222,22.857143
3,0.0,22,1.164953,2.099164,14.544316,1.896993,31.701091,59.005705,33.660352,16.818182,...,13.178295,30.367232,39.506173,16.184971,32.634731,36.842105,56.686047,17.910448,20.444444,14.285714
4,0.0,48,2.3605,2.939231,15.031024,2.239163,48.10861,50.0815,49.32341,9.545455,...,44.186047,39.40678,49.382716,34.682081,46.107784,56.950067,37.209302,29.850746,2.666667,17.142857


In [13]:
train_data.fillna(-99, inplace=True)

In [65]:
dists = []
for index in range(train_data.shape[0]):
    if index == 137:
        continue
    dist = get_euclidean_metric(train_data.iloc[index].values, train_data.iloc[137].values)
    print(dist)
    dists.append(dist)

60.4933903828
61.8955607585
75.0097411101
68.6813010272
235.561441234
79.3826917843
53.0619245681
54.580445893
50.679823766
54.3050356209
55.9803022611
52.8248997619
70.1474559954
78.206623284
54.1105593212
58.0646747168
104.422081547
97.1220147344
52.0517066363
52.8582790598
61.7505201108
70.8727497445
113.572653682
60.6946144686
60.8564617832
42.4077452803
51.9104019679
42.1587164632
70.1696287731
50.3615784359
56.8301595107
64.9873380446
82.5921513247
32.9296201462
65.4195224192
50.4273278337
64.2381940682
63.6097678584
69.5688501018
64.4289928797
63.390648268
72.1927889098
53.2936238411
52.2153076673
74.3890401272
60.3230244319
62.9838027728
51.9493495971
49.8607813262
62.9963518991
59.9150795763
56.7127045492
48.1626338367
78.7892447524
70.7317494873
70.883210001
50.0622537291
45.743559774
91.4808450956
79.3514445906
371.115011205
78.5464538563
47.9744279451
93.6602647341
48.3001197988
70.4936209201
494.26917104
495.343141365
495.593986335
494.475941504
78.9595183187
493.656961727

47.5737350458
77.0432524672
40.1461175427
60.3713728148
42.4321304419
80.1986832627
54.0485111757
54.5515385723
50.0095574383
50.3080184934
55.0998084456
440.48936617
443.595114185
440.06566016
440.842997049
443.712382449
444.539576278
442.461219271
441.14351422
440.901409983
442.2922478
442.370580489
440.521352201
441.145574425
444.392755228
440.880506318
443.115697208
441.256064109
440.55472693
442.58554878
442.793027467
439.880835439
441.145313733
441.097333386
441.788060213
444.533239745
441.850881687
440.854161376
441.61020833
439.749746256
442.330877124
441.374563875
441.3525756
440.835413313
441.20070781
442.304585974
443.055287176
440.954977554
449.123497049
442.835943324
440.992819095
442.577638374
442.332098539
441.476297077
441.699694841
442.217311672
444.261521028
441.913404178
441.751950029
444.452626745
443.278265444
442.011181403
443.008988964
441.120264824
448.254443763
443.196507264
441.712492481
441.85765568
442.020596536
442.188971232
441.284001392
443.98931451
443.1

84.8806868878
73.5886837367
63.9355873447
67.951799966
66.1836118973
64.4400580068
61.0506861808
66.0108335173
68.0932191337
232.652109388
238.425365452
226.353551995
226.534423334
226.500254981
229.724792856
225.443761408
231.716870777
62.4474846691
66.8399741009
493.335030671
495.970535452
494.962021369
493.546273016
68.3654612519
54.015273906
101.369618467
69.9507149623
67.0260228264
54.0947705451
74.6553991009
61.041477022
60.9974588999
54.8898214019
43.8494231702
56.424477884
53.2164823624
51.7207552262
74.0909674601
71.1638519697
231.717753785
235.342624148
227.649298624
227.968244115
228.537430885
234.267270412
228.331025101
230.580199216
232.987960948
244.470109081
228.275066775
228.674859698
228.203340261
229.123943143
227.467836877
227.934684032
226.99870108
231.844383596
227.524661243
55.8501842565
61.0474357669
44.3700817784
55.3842449056
79.6091665906
65.2086236986
58.6076247416
68.5590385939
54.6992710051
47.8525091602
72.5288391519
65.3369977568
68.7678185164
58.83221099

49.5300922353
66.0864533716
62.1529028873
77.4457699948
70.2558771655
51.351096624
51.8652049389
40.4946553038
116.601937953
53.8415396781
75.9533140218
96.0289482299
50.8852695418
64.5886106057
68.6077403447
43.0672400464
79.6421813885
65.6396403021
42.5155527187
225.651797746
228.787433051
229.740252048
229.975411699
227.74637364
231.130496481
230.15134
228.153201811
231.546990568
69.171135085
60.4700805248
33.7432415221
63.1728671362
54.4407203892
76.6333110896
58.6014043744
53.4810171513
77.7340989281
58.0042024093
83.5408347523
39.2430146021
58.2467621571
43.7698992567
61.8575225293
64.2272959817
58.1413817865
58.9875782494
53.6364774389
74.0097220157
54.1329064862
62.0679199286
53.2337692632
55.042427532
51.6478299729
112.167436296
61.3787373062
61.2184736829
61.9187121663
55.8884851109
69.0605755301
55.5323832472
70.3781850438
74.68743015
51.7686204563
63.2175932204
51.3336733325
74.7027093512
41.6586059787
50.0315642209
40.6504226065
47.4245112437
57.2262445963
57.7923183342
50

75.3736159074
70.4088938375
60.9929983391
50.6774373593
58.9478409688
71.4166749555
495.058159639
495.543778277
84.3361477817
80.2248119226
75.7067567855
67.2610153239
84.785646435
71.4149347076
494.759685352
495.502530747
493.571405712
496.228808772
494.319731556
494.654840345
494.185171432
495.076667165
494.297913339
494.971101137
496.37400785
495.999561879
493.950000537
493.672941544
496.045129412
495.238980931
494.594603124
493.162963074
493.801497924
494.482681065
494.706383427
493.889331041
493.494060485
495.462972013
498.361012597
493.668634
495.057160869
495.813092152
493.006533385
495.3090572
495.811680091
495.395738894
494.565731437
493.543736315
495.817668462
494.198202378
493.849536467
493.52601933
495.097683173
493.977738514
493.778718725
493.1400026
492.265117911
493.852623375
493.776917403
493.91454577
494.127453357
493.137861374
493.80421763
494.567107606
496.730440779
493.34453885
493.838769607
496.175763401
494.724244742
493.5938131
492.714322613
494.880282581
492.474

494.267376778
493.294309621
493.802273629
494.865674798
493.994862127
495.987805131
493.478179043
495.014400467
495.284929118
495.564987127
496.163420805
495.045459448
495.077082003
494.728048029
496.993994908
496.134682865
493.247402436
494.446721436
494.214650307
493.669713318
493.323568346
494.248377128
493.477299568
492.934501323
493.659084311
493.817546173
495.295064468
494.327798045
493.518585506
495.687575412
493.606445983
499.236431323
492.681470726
495.634517647
493.89356663
497.485837578
493.848115782
496.745843943
493.244674689
493.983330437
493.695876387
495.296500624
494.796282246
494.857748009
494.34030728
500.587573501
496.445427513
496.575277602
494.060872406
493.706646476
495.010373885
494.612112593
493.336921399
494.164940471
494.757364159
493.728381242
495.703744659
492.743468931
492.920518256
493.697607859
494.830426522
496.833271085
494.599307961
493.749873907
493.940510254
495.362178547
494.587370153
493.91659891
493.981999103
493.325315317
494.170007138
494.13069

63.488071664
58.0677891575
69.5757370435
73.5009905112
63.890160314
45.0880476848
46.7893401819
67.8394913983
67.8204071277
62.9718296543
49.1435673301
67.359548706
70.4393637909
56.7523920735
51.8360004524
72.4769355474
83.7608317449
103.620977076
70.7438654619
60.3862263743
62.3474332842
56.0594399243
72.7692105587
39.065958878
70.6293467432
70.2253925849
47.704193213
70.1535428519
71.8560772405
62.1483380246
46.1097381612
54.0998857489
59.0980739948
51.299422172
69.8044503499
76.2413008568
57.5780373505
65.2626522556
82.0396525227
68.2573565921
39.655430038
54.0894563433
63.161976906
54.0925013779
56.4015570338
50.277270113
55.0537826342
58.7713928359
60.0598448694
54.4154030699
61.5866706958
54.3607911308
58.8096256711
55.6105125493
54.4884436065
60.1989092834
65.7709358678
60.6846932052
59.4900882753
70.4794356876
53.6369268342
52.5782568599
66.3641490864
45.1024519091
49.2552040273
60.9675592854
68.137644355
62.2836231988
47.7152467411
81.7682706187
63.0968226628
43.2264178218
43

65.4094660932
55.5890823882
495.011241736
70.8708830426
53.7355023156
493.291439501
67.9520361934
496.579451466
71.3961204599
499.351951822
77.7300290273
82.0078408386
56.213671989
55.5576731855
55.6974837129
42.979562511
69.7265432772
44.3517379485
60.5976931704
68.5294439342
493.367322516
63.2325543442
53.7678456927
55.0349121966
59.0857444902
64.0110618501
69.5636744021
52.6748336223
51.1988929235
57.9738753918
56.4674551968
51.7162055948
59.206961133
54.4865450224
53.5252944138
66.0798758438
71.3097509533
253.127999442
67.0443638009
52.3454935953
52.0481976506
74.8384362752
432.49109373
66.1667883585
49.7786781944
104.66126379
68.9918568767
230.352358083
58.5864409003
72.1689264206
67.2492780184
59.8325329486
66.4049232586
252.294370643
49.0097707414
251.669213206
72.8457832456
66.7234348265
71.468123904
75.9988770666
65.2879839297
372.163019433
496.668193767
495.14997469
74.3449912967
65.7666898836
81.0005718297
59.0278300464
51.888035193
68.9085147941
46.44037248
63.0054287711
57

In [66]:
dists = pd.Series(dists,name='dist')

In [67]:
dists.sort_values()

4258     28.189848
5413     29.825588
257      30.031086
5394     30.975364
2766     32.072005
3553     32.413385
3562     32.507075
33       32.929620
3668     33.568343
449      33.646787
2136     33.743242
5626     34.107719
330      34.210794
1685     34.419916
4346     35.328829
3509     35.484072
4438     35.546520
1994     35.561361
1024     35.779172
4860     35.999419
965      36.045626
328      36.126620
186      36.186430
4851     36.247634
2923     36.494532
3984     36.539396
4939     36.552316
4501     36.564105
5527     36.679296
258      36.738678
           ...    
2519    498.754197
3812    499.236431
1959    499.272443
3948    499.324700
5460    499.351952
4044    499.466422
3074    499.539212
3153    499.554131
3826    500.587574
3977    500.831439
3112    501.215190
5191    501.848192
3110    502.254836
3125    502.966059
5269    568.631074
1144    568.768806
4564    568.793356
3479    568.944870
2480    568.949200
201     569.030775
2312    569.166124
5239    569.

In [56]:
angles = []
for index in range(train_data.shape[0]):
    if index == 137:
        continue
    dist = get_cosine_angle(train_data.iloc[index].values, train_data.iloc[137].values)
    print(dist)
    angles.append(dist)

15.831970968852191
16.550416956696935
19.943876514008856
18.388807342349864
60.226464198018455
21.1604289076199
12.814067301850761
14.506716082788863
13.25657548658646
14.51108002669596
15.027902345159662
14.142004137528271
18.95699192048279
21.225693854694896
12.948983380846776
15.583411111865304
25.066245279527532
25.483096855555583
13.004839045152524
12.364150970507621
12.707494765683322
17.75345764051421
31.704542781575284
16.25440180618999
16.062260347755103
11.223332625172251
13.741069309155954
9.983324778974076
18.91051858113473
12.078469422873027
15.233020722334008
17.237672541187493
21.710222374226262
8.727088679244433
17.580432252758243
13.499368540144859
17.20681096328924
16.641828113441562
18.27396782895813
17.364390644851976
16.34399784065352
16.815623658622073
13.002227748762001
13.351948384889818
20.154670757196868
15.821992128635467
16.773837768056403
13.695512437467997
13.200534531376142
15.888678359777092
15.061455447995831
14.900519778394894
12.38046311414341
21.4064

14.337839392829165
13.026804749321073
11.851877334988112
18.334213262626726
16.733851744221536
17.534993073550265
12.141916628850659
13.154863672574987
13.438806698577423
14.165006742355915
15.41211942262303
16.127022580502462
13.148993405090417
11.6202037420902
15.046613564128693
11.328702064526368
15.711324852705514
15.191943464731885
13.667340144684609
22.640439726482093
16.265803361475324
13.402724941296363
15.987597168360304
14.383808664903116
13.483169905980644
17.396391370907658
13.41810425657917
15.744531241270218
12.903264351910884
18.114269601762327
16.594331661398094
17.35599846636238
15.539307546514657
15.40789199700982
12.636039068074348
12.947699777600807
13.570527740433212
12.643353652379599
13.582180752513409
12.573307197717586
15.06410603329253
13.806477266347787
15.490554432545128
13.830624207956024
16.190918263950493
11.63543212842694
21.72164480132571
11.609123738216391
13.135126195931896
14.62589516685793
15.284559110274458
10.75823072456946
13.501315986892987
17.7

16.402174553007832
15.851314854615788
57.64745089732432
15.116632079370476
15.29385518918702
20.574809585770947
12.405508770954764
17.857475893600704
15.167444768235011
21.21409291184651
13.270174781318175
19.015479312202906
13.838482669057518
95.97569318418063
93.91632342344937
95.23182573445607
93.39232171001265
86.31437939490263
95.79957516070206
70.85502409954887
9.813325017612298
96.14278625338119
96.39293124666771
95.2917276972408
97.18798629412713
16.27260307735983
119.94565437500721
95.80935727467829
15.23179425239574
17.95677874428809
13.972069709815992
94.41932444083294
70.03620703688823
96.86738095392363
11.070086194441929
97.03965124820743
15.786944672865003
94.50187131190094
53.87556057270087
93.76559271287337
95.18445594411897
88.51554870526624
55.30143083370854
94.06370627288746
88.28901987277807
94.80163779030131
95.91612221322765
14.80722090319828
58.78430675657762
17.665518231403
94.07722851079853
88.56765460017725
21.161721882373648
59.54886728756157
90.3859616108371

10.796039716677877
12.971082421837409
14.009354765989658
17.048848262787416
14.562121224858373
14.387851834894045
16.347400499182278
11.576984828637718
12.860105362067127
13.383623472355426
15.059735665481853
19.196514858949346
9.373679768475474
14.279233939573972
9.770369056142576
17.272001915191975
17.824507613331452
20.109823559450057
10.079126777600095
12.117167926450957
13.645041721490498
10.154164052826713
20.57260540166956
17.68208180887086
56.15874487649224
52.91087908923609
55.52856183210805
56.97301183999113
54.56491200208121
56.63318095368284
57.06725122457964
55.400403560059125
61.150566122326694
55.379133904149256
55.91553529530712
52.12517590972637
53.934458617994466
55.61756880801456
57.63504816987515
57.75767877747234
55.94857914735897
57.7577936048212
54.680430869651545
57.36025117462123
59.03754236964127
55.71349769058972
58.2801306458417
49.995786145993506
55.19822498541607
55.735248146937224
59.06053885500785
61.9423090245079
57.64384932586167
22.8838769151005
15.31

14.613307279178589
15.345176923908653
30.266512854207555
14.46675832509923
11.37562845327213
10.589764405144102
18.962372517502764
19.213732425097547
12.291972820674363
15.528472698600616
16.832534948257646
13.52894887020023
12.626379640330049
11.774326818942395
18.474146806815057
21.882944744901145
13.99348229244838
14.736901851954975
9.93482905949186
13.686331368759003
17.126859586771094
13.046126772118633
15.372250563937607
14.199729542097188
14.020592851301785
118.72592518410927
11.83728976370858
22.24311344049718
14.002638906758103
58.15816913039293
15.408556899350566
16.552650370196663
11.439985576017095
15.338998717009666
10.944466449282988
16.74748581399943
16.57545169307857
13.107010775947373
15.21936085630374
17.483642973767694
10.242436100056489
21.829737050908438
13.27956976650542
18.95849103176135
15.687471362977087
12.756347937146057
15.950008085715595
13.864745928064718
10.623742139854784
13.536411386166176
15.431286056320113
13.576498984662617
15.624152507807699
14.1138

16.42843075134633
15.734964634162582
12.996593029218596
18.964692955220826
14.536300181764123
18.9445083173874
18.91287035522223
13.728383222199925
20.77939375220423
15.85643489938891
10.574580489587776
11.672652154751338
14.804958755947855
14.224171488173019
31.787952712797082
10.241169063907394
13.515466633502182
13.910425537101931
13.060438691651107
12.473422574163013
15.071399807685205
20.143484283844973
15.585598116064311
10.03587681424832
94.39228235097168
16.59971848739012
18.784018833181523
19.065261575466415
24.852231115476762
14.097403881733248
18.685323528677994
18.857032819306152
13.005671850347655
11.698535847377844
12.987758492724
13.085407808600385
13.223616321217293
15.740458635586444
16.01693346126545
14.5216170801667
15.193208015062002
17.677120249000065
18.971634535154678
13.91977596276365
25.794890669572
13.723438216586647
17.31160646840944
16.699560762879216
15.831268201990099
18.59194119647286
10.495937029419895
14.799806687254376
16.80266171516648
29.455914080704

11.971805345683716
11.284117564099725
14.155465754622433
13.167901260917702
12.64098152121939
13.129012584507441
16.39327911657219
14.626768572397268
12.70547725590664
13.478420239889527
13.731361371707479
85.40442526056273
20.30386979203295
13.351278403469882
14.103867459197284
13.845859793409282
59.472445681669804
14.552327475970865
26.40819504254856
13.750600168917975
14.142608748228419
16.218891349885016
12.748501878408335
18.67887791208716
14.64582710316289
14.853626037295951
89.22456286959292
11.386974419740575
16.88637120972619
24.425302331206122
15.733027687508988
117.90714979120364
15.770528560783347
16.03323513186903
13.167696294812512
21.912020743048785
12.103120614694332
16.35400938266817
121.56502730221081
18.91930889665617
13.137509635257661
16.849569354508162
85.47418340990292
15.188645058698302
13.663290777499606
15.024050916085814
14.388454386863804
13.8248040306467
9.837156227199591
15.976459868112022
14.414753149959672
13.70715648686648
14.449075501093166
17.86430466

96.33175951040356
98.7156731564821
96.42202728883784
97.94476178636641
96.74901079568805
98.89749916152152
97.3466187582375
96.18798013743144
97.45447317948339
96.79863396518365
98.85587259512151
97.45461182570627
97.74981168511387
100.61339260586708
98.66121126832718
102.55975196196371
96.46152229632555
97.04673883382142
98.52390619091179
100.97436780141116
99.10520529233582
95.9480252672463
9.744971902201756
98.02902915811615
98.00011016023474
98.96870532899557
99.35576720794555
97.898994006224
96.5378376257794
97.32309683039169
96.828337111345
98.44751019518344
95.79780003038447
96.641914686116
98.19754342124932
98.30312122879502
97.55651257151523
97.14459341015093
96.92407031794463
96.52958469636609
96.58704320089916
97.81781986199417
97.67032754620062
96.63661744538734
98.86620694204103
99.02189684882849
98.34997908211025
98.0838967411702
99.21291269443829
96.9272956948835
99.4908790402648
97.1394098336661
98.49534195295733
97.67791880386689
97.63338322272796
97.17282329148168
96.

12.776769097219866
56.69795118052483
95.5973306344907
16.67010232098237
16.176180923953364
13.944617611890255
16.91459564578119
21.094408830934967
13.646618295033793
16.846016611509803
14.86205822002701
17.70365551310569
55.246598879490016
13.617734194590058
21.957753559298894
11.27694943431166
13.192765529221477
19.317893436752268
14.906643229597666
9.68849255707764
12.256890624641734
14.361811538994193
15.510831440357668
13.079636645187342
96.77349592032327
13.291211643267175
18.213989352141418
16.955375030163804
12.040906694698785
55.703429362513376
14.564319506275227
14.47686058668434
14.610845320046076
13.657690746757824
10.193954946655037
13.30517231413042
13.781602915215426
12.312642175382825
14.541233721988455
14.07687123943344
19.092457701153766
11.046514522670492
9.396230910257666
12.530677117738074
15.394224871634663
29.29224918927578
95.6578278556401
12.951534605755594
16.745347008133006
16.521998852006966
10.888787107737588
19.189653172362632
19.329665664165276
12.22274444

19.490268702872196
21.61682180728974
15.77105355074676
12.956627127886952
21.10469605411276
18.727173248084984
13.18997611072603
21.132170614584957
26.72527052960372
15.490611221584055
17.365633741406974
15.423457168699015
14.878788116108561
14.297208866360755
11.701080858911107
14.890917908283807
16.083577209526705
14.728343029340788
94.8075188370797
14.278565977283952
10.714134818558088
39.890162446806606
13.526012978131542
13.596831743185389
21.11685475730133
11.459890225001429
15.611744661075372
14.746691156272593
11.873411683102356
98.8054166859214
98.60601046243107
18.04698801878665
53.55317905579291
10.35618275933788
57.691171325949156
15.073686312222414
78.30885059706526
15.62495933089326
78.6827194358763
10.708768347221563
12.575549535328987
16.17790693335205
16.90817696954607
16.7566138525766
10.750929920559383
75.59894505680742
22.577852874778404
71.48104898033982
11.081866673826916
13.70849959404698
14.459859689315795
13.97766635145804
16.49958143480037
13.906668030408582
1

56.342936030721035
57.177013154341594
53.433318579058266
56.21327727830149
56.79522391058245
57.160276629350804
56.67758070976792
56.636117851338796
57.6690590628535
57.84727292852731
55.80843975171728
57.92713784601041
17.4808501336653
15.293172351377947
20.294904580391023
16.93380810958879
97.14887991255908
11.985531948367065
18.57543094948798
16.184416132396972
16.64982863930749
17.760477155105086
15.530679555024474
18.844796426529598
13.262845479568718
21.962940274041596
16.849345166956493
16.791356423508752
12.702467333615004
18.84628851169178
54.65688672067158
13.727016689159543
19.080864681195155
11.522802447831326
18.70954775424331
15.29870033521451
13.422091854818268
14.589378783765115
67.98941360185903
14.43379647107536
18.01104854345197
94.95725476791922
12.253127211383246
12.150904568036555
19.359372388490396
20.963618675073473
16.261146727520696
20.9934668875983
18.424507217111113
9.89010305803693
80.11600311037202
98.20374279920051
15.146889362769988
14.647202184571679
13

In [57]:
angel = pd.Series(angles,name='angle')

In [58]:
angel[angel < 9]

33      8.727089
257     7.824015
449     8.861808
947     8.777485
1024    8.703112
1685    8.980521
2136    8.708702
2145    8.890216
2694    8.956932
2766    8.544056
2923    8.942236
3553    8.430383
3562    8.660725
3668    8.757919
4258    7.248077
4851    8.885915
5394    8.211030
5413    7.913827
5626    8.997533
Name: angle, dtype: float64

In [61]:
angel_sim = angel[angel < 9]

In [63]:
angel_sim.index

Int64Index([  33,  257,  449,  947, 1024, 1685, 2136, 2145, 2694, 2766, 2923,
            3553, 3562, 3668, 4258, 4851, 5394, 5413, 5626],
           dtype='int64')

In [68]:
dists[angel_sim.index]

33      32.929620
257     30.031086
449     33.646787
947     48.161197
1024    35.779172
1685    34.419916
2136    33.743242
2145    39.243015
2694    37.294519
2766    32.072005
2923    36.494532
3553    32.413385
3562    32.507075
3668    33.568343
4258    28.189848
4851    36.247634
5394    30.975364
5413    29.825588
5626    34.107719
Name: dist, dtype: float64

In [69]:
train_target.iloc[angel_sim.index].values

array([ 5.31,  5.22,  4.5 ,  4.83,  5.81,  5.29,  4.85,  4.38,  5.63,
        5.34,  4.87,  5.95,  4.35,  8.16,  5.15,  5.91,  4.95,  5.1 ,  5.39])

In [40]:
np.mean(train_target.iloc[angel.index].values)

6.1849999999999996

In [70]:
train_target[137]

5.3200000000000003

In [35]:
train_target[train_target > 10]

120     11.71
127     16.30
139     11.65
158     11.32
248     10.79
249     13.51
250     11.80
252     10.52
282     13.19
332     14.13
392     13.11
416     10.21
443     13.98
448     11.41
511     11.92
611     11.46
736     11.81
818     16.12
941     14.87
994     10.84
999     12.31
1124    10.79
1136    10.52
1197    10.65
1216    18.51
1457    10.56
1516    10.01
1534    12.01
1536    11.09
1567    11.38
        ...  
4591    12.90
4605    14.26
4622    10.18
4627    11.15
4635    13.57
4681    15.70
4689    10.43
4791    11.91
4813    10.34
4833    11.82
4913    13.02
4916    10.16
4952    10.70
5067    12.75
5077    13.02
5131    11.40
5147    13.21
5155    12.67
5172    12.43
5176    11.98
5221    18.95
5237    16.00
5253    10.55
5292    11.01
5360    12.14
5450    10.04
5501    14.28
5593    22.59
5610    11.04
5627    17.43
Name: Y, Length: 139, dtype: float64

In [15]:
dists = np.array(dists)

In [16]:
min(dists)

23.729393079448315

In [None]:
dists = []
for index in range(train_data.shape[0]):
    if index == 7:
        continue
    dist = get_euclidean_metric(train_data.iloc[index].values, train_data.iloc[7].values)
    print(dist)
    dists.append(dist)

In [45]:
np.where(dists == 23.729393079448315)

(array([2090]),)

In [46]:
train_target[7]

5.9400000000000004

In [49]:
train_target[2092]

5.71

In [18]:
def get_cos(vec1, vec2):
    num = vec1.T * vec2
    denom = np.linalg.norm(vec1) * np.linalg.norm(vec2)  
    cos = num / denom  
    sim = 0.5 + 0.5 * cos # 
    return cos

In [30]:
vec1 =np.array([1,0])
vec2 = np.array([0, 1])

In [24]:
get_cos(vec1, vec2)

array([ 0.5,  0.5])

In [27]:
import math

In [28]:
def Cosine(vec1, vec2):
    npvec1, npvec2 = np.array(vec1), np.array(vec2)
    return npvec1.dot(npvec2)/(math.sqrt((npvec1**2).sum()) * math.sqrt((npvec2**2).sum()))

In [31]:
Cosine(vec1, vec2)

0.0

In [38]:
math.acos(0) / math.pi * 180

90.0