-
Notifications
You must be signed in to change notification settings - Fork 718
/
_clean_x.py
1612 lines (1459 loc) · 80.8 KB
/
_clean_x.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# Copyright (c) 2023 The InterpretML Contributors
# Distributed under the MIT software license
from collections import Counter
from itertools import count
import numpy as np
import numpy.ma as ma
import logging
_log = logging.getLogger(__name__)
try:
import pandas as pd
_pandas_installed = True
except ImportError:
_pandas_installed = False
try:
import scipy as sp
_scipy_installed = True
except ImportError:
_scipy_installed = False
# BIG TODO LIST:
# - review this entire bin.py file
# - write a cython single instance prediction pathway
# - consider re-writing most of this bin.py functionality in cython for anything that gets used during prediction for speed
# - test: clean_dimensions with ma.masked_array... and other stuff in there
# - test: preclean_X with pd.Series with missing values and maybe a categorical -> gets converted as N features and 1 sample
# - test: preclean_X with list that CONTAINS a ma.masked_array sample entry with missing data and without missing data
# - add better processing for ignored columns where we return the existing data if we can, and we return all None
# values if not which our caller can detect. Then unify_data can convert that to int(0) values which should work for
# all feature types
# - disable 'ignore' columns temporarily. We need to update C++ to make a distinction because you can have 3 real columns and 5 referencable columsn and our datastructures need to be updated to handle this in C++ first
# - handle the thorny questions of converting float to int for categorical strings
# - in the object converter, convert all int64/uint64 and all floats objects to float64, then use the floor check
# and compare with +-9007199254740991 to decide if they should be expressed as integers or floats
# - after np.unique for categoricals, convert int64 and uint64 types to float64 and then re-run np.unique on those
# values to figure out if there are collisions in the float64 space for integers. We actually have more
# work to do in this case since we'll also get bad reverse indexes with more categories than we have unique values
# Perhaps we can just detect this scenario in the integer space by checking for 9007199254740991 < abs(x) with
# integers and if it's true then convert to float64 before calling np.unique again? It'll be infrequent to have
# such large integers, and we only need to check with int64 and np.uint64 since they are the only ones that can make non-unique floats
# - leave bools as "False"/"True", BUT we have a corner case in _densify_object_ndarray if we have mixed types
# we convert to unicode, and bools become "False"/"True" and then subequently fail the test of being able to
# be converted to floats, so we need to record the bool types and convert them to 0/1 for the conversion to float
# test. First, we can detect if there are any bools via "types = set(map(type, X_col))", then we can
# find all the bools with np.logical_or(X_col == np.array(False), X_col == np.array(True)) or something like that
# - strip leading and trailing spaces when attempting to convert to float BUT NOT FOR STRING CATEGORICALS!
# - def convert_float_category_str(vals):
# vals = vals.astype(np.float64, copy=False)
# integerizable = np.logical_and(vals == np.floor(vals), vals.abs() <= THE_MAX_FLOAT)
# integers = vals[integerizable]
# floats = vals[~integerizable]
# integers = integers.astype(np.int64).astype(np.unicode_)
# floats = integers.astype(np.unicode_) # or perhaps shuttle it to C++
# objs = np.empty(len(vals), dtype=np.object)
# np.place(objs, integerizable, integers)
# np.place(objs, ~integerizable, floats)
# vals = objs.astype(np.unicode_)
# return vals
# - add support for a "ordinal_fast" and "nominal_fast". We would accept these in feature_types as
# a dict of (int/float -> string) for 'ordinal_fast', and (string -> int/float) for 'nominal_fast'
# the we'd write our feature_types_in values as "ordinal_fast" and "nominal_fast" and we'd exepct
# integers in whatever evaluation format we got. This would allow us to accept a float64 numpy array
# and have inside that nominal/ordinal/continuous/missing values that would be highly compressed. Both of these
# would have restriction in that the numbers would have to be contiguous (maybe allowing for compression??) and
# would start from 1, with 0 as reserved for missing values. A big issues is that with this encoding, the
# system on which we do predict needs to also encode them as integers and they have no flexibility to change
# that, except perhaps they could edit the model to change from 'nominal_fast' to 'nominal'
# { "Canada" : 1, "Japan" : 2, "Seychelles" : 3} => string to int mapping -> nominals
# { 1: "low", 2: "medium", 3: "high" } => int to object(string) mapping -> ordinals
# We still record these as ["low", "medium", "high"] and ["Canada", "Japan", "Seychelles"] and we use the
# feature type value to know that these are "ordinal_fast" and "nominal_fast"
# FUTURE TODOS in our callers and in JSON:
# - look into ISO 6093:1985 -> https://www.titanwolf.org/Network/q/4d680399-6711-4742-9900-74a42ad9f5d7/y
# - support "category compression" where we take a number like 10 and compress any categories together that
# have less than that number of samples. Internally, this works well for the prior_categories parameter since
# we can have multiple strings map to identical numbers, so "low" and "medium" can be groups and separate from high
# with {"low": 1, "medium": 1, "high":2} and in JSON we can record these as [["low", "medium"], "high"]
# We support different category compressions for pairs or even individual features since we allow
# separate category definitios per pair axis. Our unify_columns generator can support these by extracting the
# raw data once and then applying different category dictionaries to the raw data and then yielding those
# the caller to the generator can quickly determine which categories we're responding to using the pointer id(..)
# comparisons without examining all the internal dictionary definitions, and we can minimize
# work done by having a single object with a single id(..) pointer that is shared between prior_categories objects
# if they are identical at model load time.
# - if we receive an unknown float64 value in a 'nominal' or 'ordinal', then check if all the categorical
# value strings are convertible to float64. If that's the case then find the mid-point between the categories
# after they are converted to strings and create a pseudo-continuous value of the feature and figure out where
# the previously unseen float64 should go. WE do need to sort the category strings by float64, but we don't
# to to compute the split points because we can just do a binary search against the categories after they are
# converted to floats and then look at the distance between the upper and lower category and choose the one
# that is closest, and choose the upper one if the distance is equal since then the cut would be on the value
# and we use lower bound semantics (where the value gets into the upper bin if it's exactly the cut value)
# - eventually, we'll want to have an EBMData data frame that'll store just
# floats and integers and convert strings to integers on the fly as data is added
# AND more importantly, you could create this EBMData with a reference to a model
# and then you could populate it with the correct integer mapping, so "low", "medium", "high"
# get populated internally as 1, 2, 3 IDENTICALLY to the model from which the
# EBMData frame was created from. If we get a dataframe from anywhere else then
# we can't be confident the mapping is identical, and we need to use a dictionary
# of some kind, either from string to integer or integer to integer to do the mapping
# so having our own dataframe makes it possible to have faster prediction scenarios
# Unfortunately, taking a Pandas dataframe as input doesn't allow us to escape the hashtable
# step, so whehter we get strings or integers is kind of similar in terms of processing speed
# although hashing strings is slower.
# - the EBMData frame should be constructable by itself without a model reference if it's going to
# be used to train a model, so we sort of have 2 states:
# - 1: no model reference, convert strings to integers using hashes on the fly
# - 2: model reference. Use the model's dictionary mapping initially, but allow new strings or integers
# to be added as necessary, but anything below what the model knows about we map diretly to the right integers
# - we should create post-model modification routines so someone could construct an integer based
# ordinal/categorical and build their model and evaluate it efficiently, BUT when they want
# to view the model they can replace the "1", "2", "3" values with "low", "medium", "high" for graphing
# NOTES:
# - IMPORTANT INFO FOR BELOW: All newer hardware (including all Intel processors) use the IEEE-754 floating point
# standard when encoding floating point numbers. In IEEE-754, smaller whole integers have perfect representations
# in float64 representation. Float64 looses the ability to distinquish between integers though above the number
# 9007199254740991. 9007199254740992 and 9007199254740993 both become 9007199254740992 when converted to float64
# and back to ints. All int32 and uint32 values have perfect float64 representation, but there are collisions
# for int64 and uint64 values above these high numbers.
# - a desirable property for EBM models is that we can serialize them and evaluate them in different
# programming languages like C++, R, JavaScript, etc
# - ideally, we'd have just 1 serialization format, and JSON is a good choice as that format since we can then
# load models into JavaScript easily, and it's also well supported accross other languages as well.
# - JSON also has the benefit that it's human readable, which is important for an intelligible model.
# - JSON and JavaScript have fairly limited support for data types. Only strings and float64 numbers are recognized.
# There are no integer datatypes in JavaScript or JSON. This works for us though since we can use strings to
# encode nominals/ordinals, and float64 values to define 'continuous' cut points.
# - 'continuous' features should always be converted to float64 before discretization because:
# - float64 is more universal accross programming languages. Python's float type is a float64. R only supports
# float64. JavaScript is only float64, etc. GPUs are the excpetion where only float32 are sometimes supported
# but we only do discretization at the injestion point before any GPUs get used, so that isn't a concern.
# - our model definition in JSON is exclusively float64, and we don't to add complexity to indicate if a number
# is a float64 or float32, and even then what would we do with a float32 in JavaScript?
# - float64 continuous values gives us perfect separation and conversion of float32 values, which isn't true
# for the inverse
# - The long double (float80) equivalent is pretty much dead and new hardware doesn't support it. In the off
# chance someone has data with this type then we loose some precision and some values which might have been
# separable will be lumped together, but for continuous values the cut points are somewhat arbitary anyways, so
# this is acceptable.
# - Some big int64 or uint64 values collide when converting to float64 for numbers above 9007199254740991,
# so we loose the ability to distinquish them, but like for float80 values
# this loss in precision is acceptable since continuous features by nature group similar values together.
# The problem is worse for float32, so float64 is better in this regard.
# - 'nominal' and 'ordinal' features are pretty compatible between languages when presented to us as strings
# but the caller can specify that integer/boolean/float values should be treated as 'nominal'/'ordinal' and
# then things become tricky for a number of reasons:
# - it's pretty easy in python and in other languages to silently convert integers to floats. Let's say
# we have a categorical where the possible values are 1, 2, 3, and 4.1, but 4.1 is very unlikely and might
# occur zero times in any particular dataset. If during training our unique values are np.array([1, 2, 3]),
# but during predict time let's say we observe np.array([1, 2, 3, 4.1]). Python will silently convert these to
# floats resulting in np.array([1.0, 2.0, 3.0, 4.1]), and then when we convert to strings we get
# ["1.0", "2.0", "3.0", "4.1"] instead of our original categories of ["1", "2", "3"], so now none of our
# categories match. This would be a very easy mistake to make and would result in a hard to diagnose bug.
# A solution to this problem of silently converting integers to floats would be to change our text conversion
# such that floats which are whole numbers are converted to integers in text. So then we'd get
# ["1", "2", "3", "4.1"] as our categories. We can do this efficiently in python and in many other languages
# by checking if floor(x) == x for float64 values. I think it's also nicer visually in graphs of categoricals
# that any numbers are shown as integers when possible
# - another benefit of making whole number floats as integers is that integer to string conversions are relatively
# easy to do cross-language, but floats are almost never converted to identical strings the same way across
# languages since there are many legal conversions.
# "33.3", "33.299999999999997", "3.3e1", "3.3e+01" are all legal text representations for the float value of 33.3
# - we have an issue in that all numbers above 9007199254740991 (and in fact some numbers below that) will
# be equal to their floor, so will appear to be whole numbers. We don't want 1.0e300 to be converted
# to an integer, so we need some kind of maximum value above which we change to floating point representation
# Since integers don't exist in JavaScript, we can't really represent all numbers above 9007199254740991
# with unique categoricals, so we can't have truely cross-platform integers above that value, so it makes
# sense for us to make all whole numbers equal to or less than 9007199254740991 integers, and any number
# above that point as a floating point. This has the disadvantage that some integers above 9007199254740991
# will have the same categorical strings and be non-separable, but having some collisions in extreme values
# is probably better than the alternative of getting different categorical strings in different programming
# languages where integers do not exist. By making all numbers larger than 9007199254740991 as floating
# point values, the caller will at least see that we're using exponential float representations instead of
# integers, so although they may not understand why we switch to float representation above 9007199254740991
# it will at least be apparent what is happening so they can correct the issues by converting to strings themselves.
# - The only way we could guarantee that identical float64 values in different programming languages generate
# the same text would be if we implemented a float to text converter in C++ (the standard library provides no
# cross platform guarantees), and if we sent our floating point values into C++ for conversion. This is possible
# to do because we only care about performance during predict time for this converstion to strings, and at predict
# time we already know if a feature is nominal/ordinal/continuous, and presumably there aren't too many
# categories because otherwise the feature wouldn't be very useful, so we can pass the relatively few floating
# point values into C++ and get back a single string separated by spaces of the text conversions.
# - if we're presented with an array of np.object_, we can't give a guarantee that unique inputs will generate unique
# categories since the caller could present us with int(0) and "0", or some object type who's __str__ function
# generates a "0". We can't obviously support generalized object types when we serialize to JSON, or any
# other cross-language model serialization format.
# - here's an interesting conundrum. np.float64(np.float32("1.1")) != np.float64("1.1"). Also,
# np.float64(np.float32(1.1)) gives "1.100000023841858". The problem here is that the float32 converter finds
# the float32 value that is closest to 1.1. That value is a float though so if you convert that to a float64
# value all the lower mantissa bits are zeros in the float64 value. If you take the string "1.1" and convert
# it to float64 though the converter will find the closest float64 value where the text after the 1.1... isn't
# required for roundtripping. That float64 will have non-zero bits in the lower mantissa where the float32
# value for "1.1" does not, so they are not equal. This is a problem because if we build an EBM model in one
# language with a float32 and in annother language with a float64 that is the same value we expect them to have
# the same nominal or ordinal string, but they don't. In the language with the float32 value we get "1.1"
# and in the language with the float64 we get "1.100000023841858" and they don't match. The solution is to
# convert all float32 values to float64 in all languages so that we get "1.100000023841858" in both. This feels
# odd since str(my_float32) might give "1.1" so it'll be confusing to the caller, but at least we'll get
# consistent results. I think we need to make the assumption that the caller has the same binary float
# represetation in both langauges. If that's true then any errors are caused by the caller really since
# they are presenting slightly different data in both languages. They should be able to resolve it by using
# float64 everywhere which should be available in all mainstream languages, unlike float32.
# - other double to text and text to double:
# https://github.com/google/double-conversion/blob/master/LICENSE -> BSD-3
# https://stackoverflow.com/questions/28494758/how-does-javascript-print-0-1-with-such-accuracy -> https://www.cs.tufts.edu/~nr/cs257/archive/florian-loitsch/printf.pdf
# https://github.com/juj/MathGeoLib/blob/master/src/Math/grisu3.c -> ?
# https://github.com/dvidelabs/flatcc/blob/master/external/grisu3/grisu3_print.h -> Apache 2.0
# https://github.com/dvidelabs/flatcc/tree/master/external/grisu3
# https://www.ryanjuckett.com/printing-floating-point-numbers/
# https://github.com/catboost/catboost/blob/ff34a3aadeb2e31e573519b4371a252ff5e5f209/contrib/python/numpy/py3/numpy/core/src/multiarray/dragon4.h
# Apparently Numpy has a copy of Ryan Juckett's code liceded in MIT instead of Zlib license
# YES! -> float to string in MIT license:
# https://github.com/numpy/numpy/blob/3de252be1215c0f9bc0a2f5c3aebdd7ffc86e410/numpy/core/src/multiarray/dragon4.h
# https://github.com/numpy/numpy/blob/3de252be1215c0f9bc0a2f5c3aebdd7ffc86e410/numpy/core/src/multiarray/dragon4.c
# - Python uses the gold standard for float/string conversion: https://www.netlib.org/fp/dtoa.c
# https://github.com/python/cpython/blob/main/Python/dtoa.c
# This code outputs the shortest possible string that uses IEEE 754 "exact rounding" using bankers' rounding
# which also guarantees rountrips precicely. This is great for interpretability. Unfortunatetly this means
# that we'll need code in the other languages that generates the same strings and for converting back to floats.
# Fortunately the python C++ code is available and we can use that to get the exact same conversions and make
# that available in other languages to call into the C++ to harmonize floating point formats.
# Python is our premier language and has poor performance if you try to do operations in loops, so we'll
# force all the other platforms to conform to python specifications.
# - when we receive bool values in python we can probably keep the python string representations of "False" and "True".
# Unlike float64 values, there are just 2 possible bool values and we express them as JavaScript bool items,
# and with just 2 possible values there are no issues with different
# hard to standardize string formats. I like giving the user a little more context of the underlying value in
# the graphs, and "True", "False" are a bit nicer than "false" and "true" or "FALSE" and "TRUE"
# - If our caller gives us strings [" a ", "a"] we will consider those to be two separate categories since the caller
# could have some requirement to keep these as separate categories. Eliminating the whitespace makes it impossible
# for our caller to differentiate these. If the caller wants these to be the same string then they can preprocess this
# aspect themselves.
# - np.unique has some issues. It doesn't like None values. It considers int(4) and float(4.0) to be identical
# it sucks in performance with np.object_ arrays since it uses python comparers. It doesn't call
# __str__ on objects, so we get collisions if the object later converts to a string that is already a category.
# If there are many np.nan values, then the uniques array has many np.nan entries! We've fixed all of these
# by filtering out None and np.nan values, and we've converted objects to a strong types
# - If we aren't given a feature type and we get data that is just [0, 1], should we treat this as
# 'nominal' or a 'continuous' value with a split at 0.5? We'd rather our graphs be bar graphs showing
# a bar for 0 and annother bar for 1, which implies nominal, but this has a problem if the
# feature can rarely be something like 1.1. Maybe we just never saw a 1.1 in our data even though
# it can occur. If this happens then a string label of 1.1 doesn't match '1' and we fail. If
# we treated data this way then it wouldn't really be legal for production systems to not
# specify one of the feature types since an unlikely occurence could produce a nominal type
# from a continuous type and then fail at predict time. Our solution is if we see new categories at predict time
# to check if the new categories are convertible to float64 and if that's true and if all the other prior categories
# that we saw during fit time are also convertible to float64, then we are allowed to switch to treating them as continuous
# during predict time. This way we get to have nice bar graphs of '0' and '1', but we won't generate an error
# if we see 1.1 at predict time since it gets put into the [0.5 +inf) bin. We treat
# [0, 1, 2] and [0, 1, 9] and [1.1, 2.2] the same way and have a threshold of categories below which we treat these
# as cateogoricals during training.
# - If we recieve pure floats from the caller we'll either generate a continuous feature_type and any differences
# in the floating point cut points should be fairly minor. Alternatively, we'll get a 'nominal' which is
# also ok since our floating point strings won't match the ones at fit time and then they'll be converted to
# continuous values and very likely end up in the same bin as the original floats as they'll be very close in value
# since we soft-convert nominals with all float64 values into continuous values when necessary/possible
# - Let's say we get the strings ['0', '00', '0.0', '0.0e10']. If the caller forced this as a nominal we'd have
# 4 values, but if we decided that this should be a 'continuous_auto' value then we'd be converting this to only
# one floating point value, which makes it useless. What this is highlighting is that our unique cutoff point
# where we choose whether a feature should be 'nominal_auto' or 'continuous_auto' should be decided by the number
# of unique float64 values that the strings convert into. Hopefully different platforms get the same floating point
# values based on string inputs, which is annother reason why we should have a consistent C++ implementation.
# - we use the terms ordinal and nominal to indicate different types of categoricals
# (https://en.wikipedia.org/wiki/Ordinal_data). A lot of ML pacakges use categorical instead of the more
# specific term nominal since they don't support ordinals (requiring ordinal data to be handled as
# continuous/numerical). We however, being an interpretable package, want to have a built in oridinal
# feature type so that we can display "low", "medium", "high" instead of 1, 2, 3 on graphs, so
# it makes sense for us to make the distinction of having nominal and ordinal features which are both categoricals
# This also aligns nicely with the pandas.CategoricalDtype which is used to specify both ordinals and nominals.
_disallowed_types = frozenset(
[
complex,
list,
tuple,
range,
bytes,
bytearray,
memoryview,
set,
frozenset,
dict,
Ellipsis,
np.csingle,
np.complex128,
np.clongdouble,
np.void,
]
)
_none_list = [None]
_none_ndarray = np.array(None)
def _densify_object_ndarray(X_col):
    """Convert a 1D np.object_ array into the most specific native dtype possible.

    Based on the set of element types actually present, the resolution order is:
    homogeneous str/bool -> np.str_/np.bool_; all integers -> the narrowest
    integer dtype covering the element types present; all floats -> the
    narrowest float dtype; otherwise fall back to a unicode (np.str_) array,
    after rejecting element types that cannot produce meaningful categorical
    strings (raises TypeError for those).
    """
    # called under: fit or predict

    # numpy hierarchy of types
    # https://numpy.org/doc/stable/reference/arrays.scalars.html

    # TODO: add special case handling if there is only 1 sample to make that faster

    types = set(map(type, X_col))
    if len(types) == 1:
        # fast paths when every element is exactly the same python type
        if str in types:
            return X_col.astype(np.str_)
        elif bool in types:
            return X_col.astype(np.bool_)

    if all(one_type is int or issubclass(one_type, np.integer) for one_type in types):
        # every element is some integer type; pick the narrowest dtype that
        # covers all of the element types present
        if all(issubclass(one_type, np.unsignedinteger) for one_type in types):
            # all unsigned: widen uint8 -> uint16 -> uint32 -> uint64 as needed.
            # each discard removes a type already proven insufficient on its own
            # so the next all(...) check only has to examine the wider types.
            if all(one_type is np.uint8 for one_type in types):
                return X_col.astype(np.uint8)
            types.discard(np.uint8)
            if all(one_type is np.uint16 for one_type in types):
                return X_col.astype(np.uint16)
            types.discard(np.uint16)
            if all(one_type is np.uint32 for one_type in types):
                return X_col.astype(np.uint32)
            return X_col.astype(np.uint64)

        # signed (or mixed signed/unsigned) integers: uint8 fits in int16 and
        # uint16 fits in int32, so those combinations can still use a signed dtype
        if all(one_type is np.int8 for one_type in types):
            return X_col.astype(np.int8)
        types.discard(np.int8)
        if all(one_type is np.uint8 or one_type is np.int16 for one_type in types):
            return X_col.astype(np.int16)
        types.discard(np.uint8)
        types.discard(np.int16)
        if all(one_type is np.uint16 or one_type is np.int32 for one_type in types):
            return X_col.astype(np.int32)

        try:
            return X_col.astype(np.int64)
        except OverflowError:
            # we must have a big number that can only be represented by np.uint64
            # AND also signed integers mixed together if we do X_col.astype(np.uint64),
            # it will silently convert negative integers to unsigned!

            # TODO : should this be np.float64 with a check for big integers
            return X_col.astype(np.str_)

    if all(
        one_type is float or issubclass(one_type, np.floating) for one_type in types
    ):
        # every element is some float type; pick the narrowest dtype that
        # covers all of the element types present
        if all(one_type is np.float16 for one_type in types):
            return X_col.astype(np.float16)
        types.discard(np.float16)
        if all(one_type is np.float32 for one_type in types):
            return X_col.astype(np.float32)
        return X_col.astype(np.float64)

    # mixed types: validate that every element type can be converted to a
    # meaningful string, and note whether any floats need to be normalized to
    # np.float64 first so that their string formatting is consistent
    is_float_conversion = False
    for one_type in types:
        if one_type is str:
            pass  # str objects have __iter__, so special case this to allow
        elif one_type is int:
            pass  # int objects use the default __str__ function, so special case this to allow
        elif one_type is bool:
            pass  # bool objects use the default __str__ function, so special case this to allow
        elif one_type is float:
            is_float_conversion = (
                True  # force to np.float64 to guarantee consistent string formatting
            )
        elif issubclass(one_type, np.generic):
            # numpy objects have __getitem__, so special case this to allow
            if one_type is np.float64:
                pass  # np.float64 is what we convert to for floats, so no need to convert this
            elif issubclass(one_type, np.floating):
                is_float_conversion = True  # force to np.float64 to ensure consistent string formatting of floats
        elif one_type in _disallowed_types:
            # list of python types primarily from: https://docs.python.org/3/library/stdtypes.html
            msg = f"X contains the disallowed type {one_type}"
            _log.error(msg)
            raise TypeError(msg)
        elif hasattr(one_type, "__iter__") or hasattr(one_type, "__getitem__"):
            # check for __iter__ and __getitem__ to filter out iterables
            # https://stackoverflow.com/questions/1952464/in-python-how-do-i-determine-if-an-object-is-iterable
            msg = f"X contains the disallowed iterable type {one_type}"
            _log.error(msg)
            raise TypeError(msg)
        elif hasattr(one_type, "__contains__"):
            msg = f"X contains the disallowed set type {one_type}"
            _log.error(msg)
            raise TypeError(msg)
        elif one_type.__str__ is object.__str__:
            # if any object in our list uses the default object __str__ function then it'll
            # include the id(val) pointer in the string text, which isn't going to be useful as a categorical

            # use type(val) instead of val.__str__ to detect inherited __str__ functions per:
            # https://stackoverflow.com/questions/19628421/how-to-check-if-str-is-implemented-by-an-object
            msg = f"X contains the type {one_type} which does not define a __str__ function"
            _log.error(msg)
            raise TypeError(msg)

    if is_float_conversion:
        # TODO: handle ints here too which need to be checked if they are larger than the safe int max value

        # copy before mutating in place so the caller's array is untouched,
        # then replace every float-typed element with its np.float64 equivalent
        X_col = X_col.copy()
        places = np.fromiter(
            (
                val_type is float or issubclass(val_type, np.floating)
                for val_type in map(type, X_col)
            ),
            np.bool_,
            count=len(X_col),
        )
        np.place(X_col, places, X_col[places].astype(np.float64))

    # TODO: converting object types first to pd.CatigoricalDType is somewhat faster than our code here which converts
    # to unicode. We should consider either using a CatigoricalDTypes conversion first if pandas is installed, or
    # writing our own cython code that can be more efficient at walking through items in an array. If we write
    # our own cython there is the added advantage that we can check types in the same loop and therefore eliminate
    # the costly "set(map(type, X_col))" calls above
    return X_col.astype(np.str_)
def _process_column_initial(X_col, nonmissings, processing, min_unique_continuous):
    # called under: fit
    #
    # Convert a raw 1D ndarray of non-missing values into either a continuous
    # float64 column, or an int64-encoded categorical column plus its
    # category-string -> 1-based-index dictionary.
    #
    # X_col: 1D ndarray holding the non-missing values of the feature
    # nonmissings: None, or a bool ndarray over the full sample count marking
    #     which positions of the original column were non-missing
    # processing: None/"auto", "nominal_prevalence", or "nominal_alphabetical"
    #     (controls the ordering of the resulting categories)
    # min_unique_continuous: None, or the minimum number of unique float values
    #     required to treat the column as continuous instead of categorical
    #
    # Returns (encoded, categories) where categories is None for continuous.
    if issubclass(X_col.dtype.type, np.floating):
        missings = np.isnan(X_col)
        if missings.any():
            # strip NaNs and remember their positions so we can re-expand later
            nonmissings = ~missings
            X_col = X_col[nonmissings]
    elif X_col.dtype.type is np.object_:
        X_col = _densify_object_ndarray(X_col)
    uniques, indexes, counts = np.unique(X_col, return_inverse=True, return_counts=True)
    if issubclass(uniques.dtype.type, np.floating):
        # normalize to float64 before stringifying so the text form is canonical
        floats = uniques.astype(np.float64, copy=False)
        uniques = floats.astype(np.str_)
    else:
        uniques = uniques.astype(np.str_, copy=False)
    try:
        # we rely here on there being a round trip format within this language from float64 to text to float64
        # TODO: does this work if there are spaces or bools?
        floats = uniques.astype(dtype=np.float64)
    except ValueError:
        # at least one unique value does not parse as a float, so the column
        # cannot be treated as continuous
        floats = None
    if min_unique_continuous is not None and floats is not None:
        # floats can have more than one string representation, so run unique again to check if we have
        # min_unique_continuous unique float64s in binary representation
        if min_unique_continuous <= len(np.unique(floats)):
            floats = floats[indexes]  # expand from the unique floats to expanded floats
            if nonmissings is not None:
                # re-expand to the full sample count, NaN in the missing slots
                floats_tmp = np.full(len(nonmissings), np.nan, dtype=np.float64)
                np.place(floats_tmp, nonmissings, floats)
                floats = floats_tmp
            return floats, None
    # TODO: we need to move this re-ordering functionality to EBMPreprocessor.fit(...) and return a
    # np.unicode_ array here.  There are two issues with keeping it here
    # 1) If the user wants 'nominal_prevalence' in a DP model, then we need to order the prevalence
    #    by the publically visible noisy weights rather than the private non-noisy prevalences,
    #    but we don't have access to the noisy weights here.  We haven't documented 'nominal_prevalence'
    #    yet, so nobody should be using it yet, but before we make it public we need to solve this issue
    # 2) If we someday want to have an 'eval_set' that has a separate X_eval, then we'll need
    #    two iterators that operate on different X's.  If that happens then the categories dictionary
    #    needs to be synchronized, so we need access to all the possible categories which is not available
    #    here
    # Since we only really care about speed during predict time, and at predict time we already have a
    # categories dictionary, moving this to EBMPreprocessor.fit(...) won't cause any performance issues
    # but it's a bit more complicated.  Also, we need to think through how we handle categoricals from
    # pandas.  We can't return an np.unicode_ array there since then we'd loose the ordering that pandas
    # gives us, which at a minimum is required for ordinals, and is nice to preserve for nominals because
    # it gives the user an easy way to order the nominals on the graph and in the models (for model editing).
    #
    # Alternatively, if we decide to expose the integer bag definitions instead of having an eval_set then
    # we could probably just keep the ordering here and then re-order them again in
    # EBMPreprocessor.fit(...) for DP models. If we destroy the information about prevalence and resort
    # by noisy prevalence then that would be ok.
    # TODO: add a callback function option here that allows the caller to sort, remove, combine
    if processing == "nominal_prevalence":
        # most frequent categories first: negating the counts makes the
        # ascending tuple sort produce a descending-by-count order
        if floats is None:
            categories = [(-item[0], item[1]) for item in zip(counts, uniques)]
        else:
            categories = [
                (-item[0], item[1], item[2]) for item in zip(counts, floats, uniques)
            ]
        categories.sort()
        categories = [x[-1] for x in categories]
    elif processing != "nominal_alphabetical" and floats is not None:
        # every category parses as a float, so order by numeric value
        categories = [(item[0], item[1]) for item in zip(floats, uniques)]
        categories.sort()
        categories = [x[1] for x in categories]
    else:
        # alphabetical ordering of the category strings
        categories = uniques.tolist()
        categories.sort()
    # assign 1-based indexes; index 0 is reserved for missing values
    categories = dict(zip(categories, count(1)))
    mapping = np.fromiter(
        (categories[val] for val in uniques), np.int64, count=len(uniques)
    )
    encoded = mapping[indexes]
    if nonmissings is not None:
        # re-expand to the full sample count, 0 (missing) in the masked slots
        encoded_tmp = np.zeros(len(nonmissings), dtype=np.int64)
        np.place(encoded_tmp, nonmissings, encoded)
        encoded = encoded_tmp
    return encoded, categories
def _encode_categorical_existing(X_col, nonmissings, categories):
# called under: predict
# TODO: add special case handling if there is only 1 sample to make that faster
# if we have just 1 sample, we can avoid making the mapping below
if issubclass(X_col.dtype.type, np.floating):
missings = np.isnan(X_col)
if missings.any():
nonmissings = ~missings
X_col = X_col[nonmissings]
elif X_col.dtype.type is np.object_:
X_col = _densify_object_ndarray(X_col)
uniques, indexes = np.unique(X_col, return_inverse=True)
if issubclass(X_col.dtype.type, np.floating):
uniques = uniques.astype(np.float64, copy=False)
uniques = uniques.astype(np.str_, copy=False)
mapping = np.fromiter(
(categories.get(val, -1) for val in uniques), np.int64, count=len(uniques)
)
encoded = mapping[indexes]
if (mapping < 0).any():
if nonmissings is not None:
encoded_tmp = np.zeros(len(nonmissings), dtype=np.int64)
np.place(encoded_tmp, nonmissings, encoded)
bad = np.full(len(nonmissings), None, dtype=np.object_)
np.place(bad, encoded_tmp < 0, uniques[indexes[encoded < 0]])
encoded = encoded_tmp
else:
bad = np.full(len(encoded), None, dtype=np.object_)
unknowns = encoded < 0
np.place(bad, unknowns, uniques[indexes[unknowns]])
else:
bad = None
if nonmissings is not None:
encoded_tmp = np.zeros(len(nonmissings), dtype=np.int64)
np.place(encoded_tmp, nonmissings, encoded)
encoded = encoded_tmp
return encoded, bad
def _encode_pandas_categorical_initial(X_col, pd_categories, is_ordered, processing):
# called under: fit
if processing == "nominal":
if is_ordered:
msg = "nominal type invalid for ordered pandas.CategoricalDtype"
_log.error(msg)
raise ValueError(msg)
elif processing == "ordinal":
if not is_ordered:
msg = "ordinal type invalid for unordered pandas.CategoricalDtype"
_log.error(msg)
raise ValueError(msg)
elif processing is None or processing == "auto":
pass
elif processing == "nominal_prevalence" or processing == "nominal_alphabetical":
# TODO: we could instead handle this by re-ordering the pandas pd_categories.
# Someone might want to construct it quickly but then override the pd_categories
msg = f"{processing} type invalid for pandas.CategoricalDtype"
_log.error(msg)
raise ValueError(msg)
else:
if isinstance(processing, str):
# don't allow strings to get to the for loop below
msg = f"{processing} type invalid for pandas.CategoricalDtype"
_log.error(msg)
raise ValueError(msg)
n_items = 0
n_ordinals = 0
n_continuous = 0
try:
for item in processing:
n_items += 1
if isinstance(item, str):
n_ordinals += 1
elif (
isinstance(item, float)
or isinstance(item, int)
or isinstance(item, np.floating)
or isinstance(item, np.integer)
):
n_continuous += 1
except TypeError:
msg = f"{processing} type invalid for pandas.CategoricalDtype"
_log.error(msg)
raise ValueError(msg)
if n_continuous == n_items:
msg = "continuous type invalid for pandas.CategoricalDtype"
_log.error(msg)
raise ValueError(msg)
elif n_ordinals == n_items:
if not is_ordered:
msg = "ordinal type invalid for unordered pandas.CategoricalDtype"
_log.error(msg)
raise ValueError(msg)
# TODO: instead of throwing, we could match the ordinal values with the pandas pd_categories and
# report the rest as bad items. For now though, just assume it's bad to specify this
msg = "cannot specify ordinal categories for a pandas.CategoricalDtype which already has categories"
_log.error(msg)
raise ValueError(msg)
else:
msg = f"{processing} type invalid for pandas.CategoricalDtype"
_log.error(msg)
raise ValueError(msg)
categories = dict(zip(pd_categories, count(1)))
# we'll need int64 for calling C++ anyways
X_col = X_col.astype(dtype=np.int64, copy=False)
X_col = X_col + 1
return X_col, categories
def _encode_pandas_categorical_existing(X_col, pd_categories, categories):
# called under: predict
# TODO: add special case handling if there is only 1 sample to make that faster
# if we have just 1 sample, we can avoid making the mapping below
mapping = np.fromiter(
(categories.get(val, -1) for val in pd_categories),
np.int64,
count=len(pd_categories),
)
if len(mapping) <= len(categories):
mapping_cmp = np.arange(1, len(mapping) + 1, dtype=np.int64)
if np.array_equal(mapping, mapping_cmp):
# avoid overflows for np.int8
X_col = X_col.astype(dtype=np.int64, copy=False)
X_col = X_col + 1
return X_col, None
else:
mapping_cmp = np.arange(1, len(categories) + 1, dtype=np.int64)
if np.array_equal(mapping[0 : len(mapping_cmp)], mapping_cmp):
unknowns = len(categories) <= X_col
bad = np.full(len(X_col), None, dtype=np.object_)
bad[unknowns] = pd_categories[X_col[unknowns]]
# avoid overflows for np.int8
X_col = X_col.astype(dtype=np.int64, copy=False)
X_col = X_col + 1
X_col[unknowns] = -1
return X_col, bad
mapping = np.insert(mapping, 0, 0)
encoded = mapping[X_col + 1]
bad = None
unknowns = encoded < 0
if unknowns.any():
bad = np.full(len(X_col), None, dtype=np.object_)
bad[unknowns] = pd_categories[X_col[unknowns]]
return encoded, bad
def _process_continuous(X_col, nonmissings):
# called under: fit or predict
if issubclass(X_col.dtype.type, np.floating):
X_col = X_col.astype(dtype=np.float64, copy=False)
return X_col, None
elif issubclass(X_col.dtype.type, np.integer) or X_col.dtype.type is np.bool_:
X_col = X_col.astype(dtype=np.float64)
if nonmissings is not None:
X_col_tmp = np.full(len(nonmissings), np.nan, dtype=np.float64)
np.place(X_col_tmp, nonmissings, X_col)
X_col = X_col_tmp
return X_col, None
else:
# we either have an np.object_ or np.unicode_/np.str_
try:
floats = X_col.astype(dtype=np.float64)
bad = None
except (TypeError, ValueError):
# we get a TypeError whenever we have an np.object_ array and numpy attempts to call float(), but the
# object doesn't have a __float__ function. We get a ValueError when either a str object inside an
# np.object_ array or when an np.unicode_ array attempts to convert a string to a float and fails
n_samples = len(X_col)
bad = np.full(n_samples, None, dtype=np.object_)
floats = np.zeros(n_samples, dtype=np.float64)
for idx in range(n_samples):
# slice one item at a time keeping as an np.ndarray
one_item_array = X_col[idx : idx + 1]
try:
# use .astype(..) instead of float(..) to ensure identical conversion results
floats[idx] = one_item_array.astype(dtype=np.float64)[0]
except TypeError:
# use .astype instead of str(one_item_array) here to ensure identical string categories
one_str_array = one_item_array.astype(dtype=np.str_)
try:
# use .astype(..) instead of float(..) to ensure identical conversion results
floats[idx] = one_str_array.astype(dtype=np.float64)[0]
except ValueError:
bad.itemset(idx, one_str_array.item())
except ValueError:
bad.itemset(idx, one_item_array.item())
# bad.any() would fail to work if bad was allowed to be either None or False, but None
# values in X_col should always be identified as missing by our caller, and False should be successfully
# converted to 0.0 above, so neither should end up in the bad array other than non-bad indicators
bad = bad if bad.any() else None
if nonmissings is not None:
floats_tmp = np.full(len(nonmissings), np.nan, dtype=np.float64)
np.place(floats_tmp, nonmissings, floats)
floats = floats_tmp
if bad is not None:
bad_tmp = np.full(len(nonmissings), None, dtype=np.object_)
np.place(bad_tmp, nonmissings, bad)
bad = bad_tmp
return floats, bad
def _process_ndarray(X_col, nonmissings, categories, processing, min_unique_continuous):
    # Dispatch a cleaned 1D ndarray to the continuous or categorical handlers
    # based on the requested feature type in "processing".
    #
    # X_col: 1D ndarray of the non-missing values of the feature
    # nonmissings: None, or a bool mask over the full sample count
    # categories: None during fit, or the fitted category dict during predict
    # processing: None/"auto", a feature_type string, an int (treated as
    #     min_unique_continuous), or a list of ordinal categories / cut points
    # min_unique_continuous: threshold used when processing is None/"auto"
    #
    # Returns a (feature_type, X_col, categories, bad) tuple.
    #
    # Fix vs. the previous revision: the "ignore" branch used ndarray.itemset,
    # which was removed in NumPy 2.0; plain index assignment is equivalent.
    if processing == "continuous":
        # called under: fit or predict
        X_col, bad = _process_continuous(X_col, nonmissings)
        return "continuous", X_col, None, bad
    elif processing == "nominal":
        if categories is None:
            # called under: fit
            X_col, categories = _process_column_initial(X_col, nonmissings, None, None)
            return "nominal", X_col, categories, None
        else:
            # called under: predict
            X_col, bad = _encode_categorical_existing(X_col, nonmissings, categories)
            return "nominal", X_col, categories, bad
    elif processing == "ordinal":
        if categories is None:
            # called under: fit
            # if the caller passes "ordinal" during fit, the only order that makes sense is either
            # alphabetical or based on float values.  Frequency doesn't make sense
            # if the caller would prefer an error, they can check feature_types themselves
            X_col, categories = _process_column_initial(X_col, nonmissings, None, None)
            return "ordinal", X_col, categories, None
        else:
            # called under: predict
            X_col, bad = _encode_categorical_existing(X_col, nonmissings, categories)
            return "ordinal", X_col, categories, bad
    elif processing is None or processing == "auto":
        # called under: fit
        X_col, categories = _process_column_initial(
            X_col, nonmissings, None, min_unique_continuous
        )
        return (
            "continuous" if categories is None else "nominal",
            X_col,
            categories,
            None,
        )
    elif processing == "nominal_prevalence" or processing == "nominal_alphabetical":
        # called under: fit
        X_col, categories = _process_column_initial(
            X_col, nonmissings, processing, None
        )
        return "nominal", X_col, categories, None
    elif (
        processing == "quantile"
        or processing == "rounded_quantile"
        or processing == "uniform"
        or processing == "winsorized"
    ):
        # called under: fit
        # the binning strategy is applied later; all of these are continuous here
        X_col, bad = _process_continuous(X_col, nonmissings)
        return "continuous", X_col, None, bad
    elif isinstance(processing, int):
        # called under: fit
        # an int means: use this value as min_unique_continuous
        X_col, categories = _process_column_initial(
            X_col, nonmissings, None, processing
        )
        return (
            "continuous" if categories is None else "nominal",
            X_col,
            categories,
            None,
        )
    elif processing == "ignore":
        # called under: fit or predict
        # encode anyways so we can report all of the column's values as "bad"
        X_col, categories = _process_column_initial(X_col, nonmissings, None, None)
        mapping = np.empty(len(categories) + 1, np.object_)
        mapping[0] = None  # slot 0 represents missing values
        for category, idx in categories.items():
            mapping[idx] = category
        bad = mapping[X_col]
        return "ignore", None, None, bad
    elif isinstance(processing, str):
        # called under: fit
        # don't allow strings to get to the np.array conversion below
        msg = f"{processing} type invalid"
        _log.error(msg)
        raise ValueError(msg)
    else:
        # called under: fit
        # processing should be an iterable of ordinal categories or cut points
        n_items = 0
        n_ordinals = 0
        n_continuous = 0
        try:
            for item in processing:
                n_items += 1
                if isinstance(item, str):
                    n_ordinals += 1
                elif (
                    isinstance(item, float)
                    or isinstance(item, int)
                    or isinstance(item, np.floating)
                    or isinstance(item, np.integer)
                ):
                    n_continuous += 1
        except TypeError:
            msg = f"{processing} type invalid"
            _log.error(msg)
            raise TypeError(msg)
        if n_continuous == n_items:
            # if n_items == 0 then it must be continuous since we
            # can have zero cut points, but not zero ordinal categories
            X_col, bad = _process_continuous(X_col, nonmissings)
            return "continuous", X_col, None, bad
        elif n_ordinals == n_items:
            # a list of strings defines the ordinal categories in order
            categories = dict(zip(processing, count(1)))
            X_col, bad = _encode_categorical_existing(X_col, nonmissings, categories)
            return "ordinal", X_col, categories, bad
        else:
            # mixing strings with numbers is ambiguous
            msg = f"{processing} type invalid"
            _log.error(msg)
            raise TypeError(msg)
def _reshape_1D_if_possible(col):
if col.ndim != 1:
if col.ndim == 0:
# 0 dimensional items exist, but are weird/unexpected. len fails, shape is length 0.
return np.empty(0, col.dtype)
# ignore dimensions that have just 1 item and assume the intent was to give us 1D
is_found = False
for n_items in col.shape:
if 1 < n_items:
if is_found:
msg = f"Cannot reshape to 1D. Original shape was {col.shape}"
_log.error(msg)
raise ValueError(msg)
is_found = True
col = col.reshape(-1)
return col
def _process_numpy_column(X_col, categories, feature_type, min_unique_continuous):
    # Extract the valid values and a nonmissings mask from a numpy column
    # (possibly an ma.masked_array and/or np.object_ array with None/NaN/pd.NA
    # entries), then hand off to _process_ndarray.
    nonmissings = None
    if isinstance(X_col, ma.masked_array):
        mask = X_col.mask
        if mask is ma.nomask:
            X_col = X_col.data
        else:
            X_col = X_col.compressed()
            # it's legal for a mask to exist and yet have all valid entries in the mask, so check for this
            if len(X_col) != len(mask):
                nonmissings = ~mask
    if X_col.dtype.type is np.object_:
        if _pandas_installed:
            # pandas also has the pd.NA value that indicates missing.  If Pandas is available though
            # we can use it's function that checks for pd.NA, np.nan, and None
            nonmissings2 = pd.notna(X_col)
        else:
            # X_col == X_col is a check for nan that works even with mixed types, since nan != nan
            nonmissings2 = np.logical_and(X_col != _none_ndarray, X_col == X_col)
        if not nonmissings2.all():
            X_col = X_col[nonmissings2]
            if nonmissings is None:
                nonmissings = nonmissings2
            else:
                # combine the mask-derived missings with the object-derived
                # missings: write nonmissings2 into the True slots of nonmissings.
                # it's a little weird and possibly dangerous to place inside the array being read,
                # but algorithmically this is the fastest thing to do, and it seems to work..
                np.place(nonmissings, nonmissings, nonmissings2)
    return _process_ndarray(
        X_col, nonmissings, categories, feature_type, min_unique_continuous
    )
def _process_pandas_column(X_col, categories, feature_type, min_unique_continuous):
if isinstance(X_col.dtype, np.dtype):
if (
issubclass(X_col.dtype.type, np.floating)
or issubclass(X_col.dtype.type, np.integer)
or X_col.dtype.type is np.bool_
):
X_col = X_col.values
return _process_ndarray(
X_col, None, categories, feature_type, min_unique_continuous
)
elif X_col.dtype.type is np.object_:
nonmissings = None
if X_col.hasnans:
# if hasnans is true then there is definetly a real missing value in there and not just a mask
nonmissings = X_col.notna().values
X_col = X_col.dropna()
X_col = X_col.values
return _process_ndarray(
X_col, nonmissings, categories, feature_type, min_unique_continuous
)
elif isinstance(X_col.dtype, pd.CategoricalDtype):
# unlike other missing value types, we get back -1's for missing here, so no need to drop them
X_col = X_col.values
is_ordered = X_col.ordered
pd_categories = X_col.categories.values.astype(dtype=np.str_, copy=False)
X_col = X_col.codes
if feature_type == "ignore":
pd_categories = pd_categories.astype(dtype=np.object_)
pd_categories = np.insert(pd_categories, 0, None)
bad = pd_categories[X_col + 1]
return None, None, bad, "ignore"
else:
if categories is None:
# called under: fit
X_col, categories = _encode_pandas_categorical_initial(
X_col, pd_categories, is_ordered, feature_type
)
bad = None
else:
# called under: predict
X_col, bad = _encode_pandas_categorical_existing(
X_col, pd_categories, categories
)
return "ordinal" if is_ordered else "nominal", X_col, categories, bad
elif issubclass(X_col.dtype.type, np.integer) or X_col.dtype.type is np.bool_:
# this handles Int8Dtype to Int64Dtype, UInt8Dtype to UInt64Dtype, and BooleanDtype
nonmissings = None
if X_col.hasnans:
# if hasnans is true then there is definetly a real missing value in there and not just a mask
nonmissings = X_col.notna().values
X_col = X_col.dropna()
X_col = X_col.values
X_col = X_col.astype(dtype=X_col.dtype.type, copy=False)
return _process_ndarray(
X_col, nonmissings, categories, feature_type, min_unique_continuous
)
# TODO: implement pd.SparseDtype
# TODO: implement pd.StringDtype both the numpy and arrow versions
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.StringDtype.html#pandas.StringDtype
msg = f"{type(X_col.dtype)} not supported"
_log.error(msg)
raise TypeError(msg)
def _process_scipy_column(X_col, categories, feature_type, min_unique_continuous):
X_col = X_col.toarray().reshape(-1)
nonmissings = None
if X_col.dtype.type is np.object_:
if _pandas_installed:
# pandas also has the pd.NA value that indicates missing. If Pandas is available though