/
statgen.py
3434 lines (2780 loc) · 142 KB
/
statgen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import itertools
import math
import numpy as np
from typing import *
import hail as hl
import hail.expr.aggregators as agg
from hail.expr.expressions import *
from hail.expr.types import *
from hail.ir import *
from hail.genetics.reference_genome import reference_genome_type
from hail.linalg import BlockMatrix
from hail.matrixtable import MatrixTable
from hail.methods.misc import require_biallelic, require_row_key_variant, require_col_key_str
from hail.stats import LinearMixedModel
from hail.table import Table
from hail.typecheck import *
from hail.utils import wrap_to_list, new_temp_file
from hail.utils.java import *
@typecheck(dataset=MatrixTable,
           maf=nullable(expr_float64),
           bounded=bool,
           min=nullable(numeric),
           max=nullable(numeric))
def identity_by_descent(dataset, maf=None, bounded=True, min=None, max=None) -> Table:
    """Compute matrix of identity-by-descent estimates.

    .. include:: ../_templates/req_tstring.rst

    .. include:: ../_templates/req_tvariant.rst

    .. include:: ../_templates/req_biallelic.rst

    Examples
    --------

    To calculate a full IBD matrix, using minor allele frequencies computed
    from the dataset itself:

    >>> hl.identity_by_descent(dataset)

    To calculate an IBD matrix containing only pairs of samples with
    ``PI_HAT`` in :math:`[0.2, 0.9]`, using minor allele frequencies stored in
    the row field `panel_maf`:

    >>> hl.identity_by_descent(dataset, maf=dataset['panel_maf'], min=0.2, max=0.9)

    Notes
    -----

    The dataset must have a column field named `s` which is a :class:`.StringExpression`
    and which uniquely identifies a column.

    The implementation is based on the IBD algorithm described in the `PLINK
    paper <http://www.ncbi.nlm.nih.gov/pmc/articles/PMC1950838>`__.

    :func:`.identity_by_descent` requires the dataset to be biallelic and does
    not perform LD pruning. Linkage disequilibrium may bias the result so
    consider filtering variants first.

    The resulting :class:`.Table` entries have the type: *{ i: String,
    j: String, ibd: { Z0: Double, Z1: Double, Z2: Double, PI_HAT: Double },
    ibs0: Long, ibs1: Long, ibs2: Long }*. The key list is: `*i: String, j:
    String*`.

    Conceptually, the output is a symmetric, sample-by-sample matrix. The
    output table has the following form

    .. code-block:: text

        i sample1	j sample2	ibd.Z0 1.0000	ibd.Z1 0.0000	ibd.Z2 0.0000	ibd.PI_HAT 0.0000	ibs0 ibs1 ibs2 ...

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Variant-keyed and sample-keyed :class:`.MatrixTable` containing genotype information.
    maf : :class:`.Float64Expression`, optional
        Row-indexed expression for the minor allele frequency.
    bounded : :obj:`bool`
        Forces the estimations for ``Z0``, ``Z1``, ``Z2``, and ``PI_HAT`` to take
        on biologically meaningful values (in the range [0,1]).
    min : :obj:`float` or :obj:`None`
        Sample pairs with a ``PI_HAT`` below this value will
        not be included in the output. Must be in :math:`[0,1]`.
    max : :obj:`float` or :obj:`None`
        Sample pairs with a ``PI_HAT`` above this value will
        not be included in the output. Must be in :math:`[0,1]`.

    Returns
    -------
    :class:`.Table`
    """
    require_col_key_str(dataset, 'identity_by_descent')

    # Keep only what the backend IBD pass needs: the GT entry field and,
    # when supplied, the user's MAF expression materialized as a row field.
    if maf is not None:
        analyze('identity_by_descent/maf', maf, dataset._row_indices)
        dataset = dataset.select_rows(__maf = maf)
    else:
        dataset = dataset.select_rows()
    dataset = dataset.select_cols().select_globals().select_entries('GT')
    dataset = require_biallelic(dataset, 'ibd')

    # The IBD computation itself is implemented natively; this just builds
    # the configuration passed to the backend.
    return Table(MatrixToTableApply(dataset._mir, {
        'name': 'IBD',
        'mafFieldName': '__maf' if maf is not None else None,
        'bounded': bounded,
        'min': min,
        'max': max,
    }))
@typecheck(call=expr_call,
           aaf_threshold=numeric,
           include_par=bool,
           female_threshold=numeric,
           male_threshold=numeric,
           aaf=nullable(str))
def impute_sex(call, aaf_threshold=0.0, include_par=False, female_threshold=0.2, male_threshold=0.8, aaf=None) -> Table:
    """Impute sex of samples by calculating inbreeding coefficient on the X
    chromosome.

    .. include:: ../_templates/req_tvariant.rst

    .. include:: ../_templates/req_biallelic.rst

    Examples
    --------

    Remove samples where imputed sex does not equal reported sex:

    >>> imputed_sex = hl.impute_sex(dataset.GT)
    >>> dataset_result = dataset.filter_cols(imputed_sex[dataset.s].is_female != dataset.pheno.is_female,
    ...                                      keep=False)

    Notes
    -----

    We have used the same implementation as `PLINK v1.7
    <http://pngu.mgh.harvard.edu/~purcell/plink/summary.shtml#sexcheck>`__.

    Let `gr` be the reference genome of the type of the `locus` key (as
    given by :meth:`.TLocus.reference_genome`)

    1. Filter the dataset to loci on the X contig defined by `gr`.

    2. Calculate alternate allele frequency (AAF) for each row from the dataset.

    3. Filter to variants with AAF above `aaf_threshold`.

    4. Remove loci in the pseudoautosomal region, as defined by `gr`, if and
       only if `include_par` is ``True`` (it defaults to ``False``)

    5. For each row and column with a non-missing genotype call, :math:`E`, the
       expected number of homozygotes (from population AAF), is computed as
       :math:`1.0 - (2.0*maf*(1.0-maf))`.

    6. For each row and column with a non-missing genotype call, :math:`O`, the
       observed number of homozygotes, is computed interpreting ``0`` as
       heterozygote and ``1`` as homozygote

    7. For each row and column with a non-missing genotype call, :math:`N` is
       incremented by 1

    8. For each column, :math:`E`, :math:`O`, and :math:`N` are combined across
       variants

    9. For each column, :math:`F` is calculated by :math:`(O - E) / (N - E)`

    10. A sex is assigned to each sample with the following criteria:
        - Female when ``F < 0.2``
        - Male when ``F > 0.8``
        Use `female_threshold` and `male_threshold` to change this behavior.

    **Annotations**

    The returned column-key indexed :class:`.Table` has the following fields in
    addition to the matrix table's column keys:

    - **is_female** (:py:data:`.tbool`) -- True if the imputed sex is female,
      false if male, missing if undetermined.
    - **f_stat** (:py:data:`.tfloat64`) -- Inbreeding coefficient.
    - **n_called** (:py:data:`.tint64`) -- Number of variants with a genotype call.
    - **expected_homs** (:py:data:`.tfloat64`) -- Expected number of homozygotes.
    - **observed_homs** (:py:data:`.tint64`) -- Observed number of homozygotes.

    Parameters
    ----------
    call : :class:`.CallExpression`
        A genotype call for each row and column. The source dataset's row keys
        must be [[locus], alleles] with types :class:`.tlocus` and
        :class:`.ArrayStringExpression`. Moreover, the alleles array must have
        exactly two elements (i.e. the variant must be biallelic).
    aaf_threshold : :obj:`float`
        Minimum alternate allele frequency threshold.
    include_par : :obj:`bool`
        Include pseudoautosomal regions.
    female_threshold : :obj:`float`
        Samples are called females if F < female_threshold.
    male_threshold : :obj:`float`
        Samples are called males if F > male_threshold.
    aaf : :obj:`str` or :obj:`None`
        A field defining the alternate allele frequency for each row. If
        ``None``, AAF will be computed from `call`.

    Returns
    -------
    :class:`.Table`
        Sex imputation statistics per sample.
    """
    if aaf_threshold < 0.0 or aaf_threshold > 1.0:
        raise FatalError("Invalid argument for `aaf_threshold`. Must be in range [0, 1].")

    # Materialize the (possibly join-derived) call expression as an entry field
    # on its source matrix table so downstream aggregations can reference it.
    mt = call._indices.source
    mt, _ = mt._process_joins(call)
    mt = mt.annotate_entries(call=call)
    mt = require_biallelic(mt, 'impute_sex')
    if (aaf is None):
        # No user-supplied AAF field: compute it from the calls themselves.
        mt = mt.annotate_rows(aaf=agg.call_stats(mt.call, mt.alleles).AF[1])
        aaf = 'aaf'

    # Restrict to the X contig(s); drop PAR loci unless explicitly included.
    rg = mt.locus.dtype.reference_genome
    mt = hl.filter_intervals(mt,
                             hl.map(lambda x_contig: hl.parse_locus_interval(x_contig, rg), rg.x_contigs),
                             keep=True)
    if not include_par:
        interval_type = hl.tarray(hl.tinterval(hl.tlocus(rg)))
        mt = hl.filter_intervals(mt,
                                 hl.literal(rg.par, interval_type),
                                 keep=False)

    # Keep only variants whose AAF is strictly inside the threshold band.
    mt = mt.filter_rows((mt[aaf] > aaf_threshold) & (mt[aaf] < (1 - aaf_threshold)))
    mt = mt.annotate_cols(ib=agg.inbreeding(mt.call, mt[aaf]))
    kt = mt.select_cols(
        is_female=hl.cond(mt.ib.f_stat < female_threshold,
                          True,
                          hl.cond(mt.ib.f_stat > male_threshold,
                                  False,
                                  hl.null(tbool))),
        **mt.ib).cols()

    return kt
def _get_regression_row_fields(mt, pass_through, method) -> Dict[str, str]:
    """Resolve `pass_through` entries into a name -> row expression mapping.

    Entries may be row-field names or row-indexed field expressions. Row-key
    fields may be listed redundantly (they pass through silently); any other
    duplicate, unresolvable name, or non-field expression raises.
    """
    key_names = list(mt.row_key.keys())
    resolved = dict(zip(key_names, key_names))

    for entry in pass_through:
        if isinstance(entry, str):
            if entry not in mt.row:
                raise ValueError(f"'{method}/pass_through': MatrixTable has no row field {repr(entry)}")
            # key fields may be named redundantly; anything else is a duplicate
            if entry in resolved and entry not in mt.row_key:
                raise ValueError(f"'{method}/pass_through': found duplicated field {repr(entry)}")
            resolved[entry] = mt[entry]
        else:
            assert isinstance(entry, Expression)
            if not entry._ir.is_nested_field:
                raise ValueError(f"'{method}/pass_through': expect fields or nested fields, not complex expressions")
            if not entry._indices == mt._row_indices:
                raise ExpressionException(f"'{method}/pass_through': require row-indexed fields, found indices {entry._indices.axes}")
            name = entry._ir.name
            # a key field referenced by its own expression passes silently
            if name in resolved and not (name in mt.row_key and entry._ir == mt[name]._ir):
                raise ValueError(f"'{method}/pass_through': found duplicated field {repr(name)}")
            resolved[name] = entry

    # key fields are always emitted by the backend; strip them from the result
    for key_name in mt.row_key:
        del resolved[key_name]
    return resolved
@typecheck(y=oneof(expr_float64, sequenceof(expr_float64), sequenceof(sequenceof(expr_float64))),
           x=expr_float64,
           covariates=sequenceof(expr_float64),
           block_size=int,
           pass_through=sequenceof(oneof(str, Expression)))
def linear_regression_rows(y, x, covariates, block_size=16, pass_through=()) -> hail.Table:
    r"""For each row, test an input variable for association with
    response variables using linear regression.

    Examples
    --------

    >>> result_ht = hl.linear_regression_rows(
    ...     y=dataset.pheno.height,
    ...     x=dataset.GT.n_alt_alleles(),
    ...     covariates=[1, dataset.pheno.age, dataset.pheno.is_female])

    Warning
    -------
    As in the example, the intercept covariate ``1`` must be
    included **explicitly** if desired.

    Warning
    -------
    If `y` is a single value or a list, :func:`.linear_regression_rows`
    considers the same set of columns (i.e., samples, points) for every response
    variable and row, namely those columns for which **all** response variables
    and covariates are defined.

    If `y` is a list of lists, then each inner list is treated as an
    independent group, subsetting columns for missingness separately.

    Notes
    -----
    With the default root and `y` a single expression, the following row-indexed
    fields are added.

    - **<row key fields>** (Any) -- Row key fields.
    - **<pass_through fields>** (Any) -- Row fields in `pass_through`.
    - **n** (:py:data:`.tint32`) -- Number of columns used.
    - **sum_x** (:py:data:`.tfloat64`) -- Sum of input values `x`.
    - **y_transpose_x** (:py:data:`.tfloat64`) -- Dot product of response
      vector `y` with the input vector `x`.
    - **beta** (:py:data:`.tfloat64`) --
      Fit effect coefficient of `x`, :math:`\hat\beta_1` below.
    - **standard_error** (:py:data:`.tfloat64`) --
      Estimated standard error, :math:`\widehat{\mathrm{se}}_1`.
    - **t_stat** (:py:data:`.tfloat64`) -- :math:`t`-statistic, equal to
      :math:`\hat\beta_1 / \widehat{\mathrm{se}}_1`.
    - **p_value** (:py:data:`.tfloat64`) -- :math:`p`-value.

    If `y` is a list of expressions, then the last five fields instead have type
    :py:data:`.tarray` of :py:data:`.tfloat64`, with corresponding indexing of
    the list and each array.

    If `y` is a list of lists of expressions, then `n` and `sum_x` are of type
    ``array<float64>``, and the last five fields are of type
    ``array<array<float64>>``. Index into these arrays with
    ``a[index_in_outer_list, index_in_inner_list]``. For example, if
    ``y=[[a], [b, c]]`` then the p-value for ``b`` is ``p_value[1][0]``.

    In the statistical genetics example above, the input variable `x` encodes
    genotype as the number of alternate alleles (0, 1, or 2). For each variant
    (row), genotype is tested for association with height controlling for age
    and sex, by fitting the linear regression model:

    .. math::

        \mathrm{height} = \beta_0 + \beta_1 \, \mathrm{genotype}
            + \beta_2 \, \mathrm{age}
            + \beta_3 \, \mathrm{is\_female}
            + \varepsilon,
        \quad
        \varepsilon \sim \mathrm{N}(0, \sigma^2)

    Boolean covariates like :math:`\mathrm{is\_female}` are encoded as 1 for
    ``True`` and 0 for ``False``. The null model sets :math:`\beta_1 = 0`.

    The standard least-squares linear regression model is derived in Section
    3.2 of `The Elements of Statistical Learning, 2nd Edition
    <http://statweb.stanford.edu/~tibs/ElemStatLearn/printings/ESLII_print10.pdf>`__.
    See equation 3.12 for the t-statistic which follows the t-distribution with
    :math:`n - k - 1` degrees of freedom, under the null hypothesis of no
    effect, with :math:`n` samples and :math:`k` covariates in addition to
    ``x``.

    Note
    ----
    Use the `pass_through` parameter to include additional row fields from
    matrix table underlying ``x``. For example, to include an "rsid" field, set
    ``pass_through=['rsid']`` or ``pass_through=[mt.rsid]``.

    Parameters
    ----------
    y : :class:`.Float64Expression` or :obj:`list` of :class:`.Float64Expression`
        One or more column-indexed response expressions.
    x : :class:`.Float64Expression`
        Entry-indexed expression for input variable.
    covariates : :obj:`list` of :class:`.Float64Expression`
        List of column-indexed covariate expressions.
    block_size : :obj:`int`
        Number of row regressions to perform simultaneously per core. Larger blocks
        require more memory but may improve performance.
    pass_through : :obj:`list` of :obj:`str` or :class:`.Expression`
        Additional row fields to include in the resulting table.

    Returns
    -------
    :class:`.Table`
    """
    mt = matrix_table_source('linear_regression_rows/x', x)
    check_entry_indexed('linear_regression_rows/x', x)

    # Validate the shape of `y` before wrapping: a scalar, a flat list, or a
    # list of non-empty lists (the "chained" form with independent groups).
    y_is_list = isinstance(y, list)
    if y_is_list and len(y) == 0:
        raise ValueError("'linear_regression_rows': found no values for 'y'")
    is_chained = y_is_list and isinstance(y[0], list)
    if is_chained and any(len(l) == 0 for l in y):
        raise ValueError("'linear_regression_rows': found empty inner list for 'y'")

    y = wrap_to_list(y)

    for e in (itertools.chain.from_iterable(y) if is_chained else y):
        analyze('linear_regression_rows/y', e, mt._col_indices)

    for e in covariates:
        analyze('linear_regression_rows/covariates', e, mt._col_indices)

    _warn_if_no_intercept('linear_regression_rows', covariates)

    # Materialize all expressions as uniquely-named fields so the backend can
    # reference them by name in its configuration.
    x_field_name = Env.get_uid()
    if is_chained:
        y_field_names = [[f'__y_{i}_{j}' for j in range(len(y[i]))] for i in range(len(y))]
        y_dict = dict(zip(itertools.chain.from_iterable(y_field_names), itertools.chain.from_iterable(y)))
        func = 'LinearRegressionRowsChained'
    else:
        y_field_names = [f'__y_{i}' for i in range(len(y))]
        y_dict = dict(zip(y_field_names, y))
        func = 'LinearRegressionRowsSingle'

    cov_field_names = [f'__cov{i}' for i in range(len(covariates))]
    row_fields = _get_regression_row_fields(mt, pass_through, 'linear_regression_rows')

    # FIXME: selecting an existing entry field should be emitted as a SelectFields
    mt = mt._select_all(col_exprs=dict(**y_dict,
                                       **dict(zip(cov_field_names, covariates))),
                        row_exprs=row_fields,
                        col_key=[],
                        entry_exprs={x_field_name: x})

    config = {
        'name': func,
        'yFields': y_field_names,
        'xField': x_field_name,
        'covFields': cov_field_names,
        'rowBlockSize': block_size,
        'passThrough': [x for x in row_fields if x not in mt.row_key]
    }

    ht_result = Table(MatrixToTableApply(mt._mir, config))

    if not y_is_list:
        # scalar `y`: unwrap the length-1 arrays so callers get scalar stats
        fields = ['y_transpose_x', 'beta', 'standard_error', 't_stat', 'p_value']
        ht_result = ht_result.annotate(**{f: ht_result[f][0] for f in fields})

    return ht_result.persist()
@typecheck(test=enumeration('wald', 'lrt', 'score', 'firth'),
           y=oneof(expr_float64, sequenceof(expr_float64), sequenceof(sequenceof(expr_float64))),
           x=expr_float64,
           covariates=sequenceof(expr_float64),
           pass_through=sequenceof(oneof(str, Expression)))
def logistic_regression_rows(test, y, x, covariates, pass_through=()) -> hail.Table:
    r"""For each row, test an input variable for association with a
    binary response variable using logistic regression.

    Examples
    --------
    Run the logistic regression Wald test per variant using a Boolean
    phenotype, intercept and two covariates stored in column-indexed
    fields:

    >>> result_ht = hl.logistic_regression_rows(
    ...     test='wald',
    ...     y=dataset.pheno.is_case,
    ...     x=dataset.GT.n_alt_alleles(),
    ...     covariates=[1, dataset.pheno.age, dataset.pheno.is_female])

    Run the logistic regression Wald test per variant using a list of binary (0/1)
    phenotypes, intercept and two covariates stored in column-indexed
    fields:

    >>> result_ht = hl.logistic_regression_rows(
    ...     test='wald',
    ...     y=[dataset.pheno.is_case, dataset.pheno.is_case],  # where pheno values are 0, 1, or missing
    ...     x=dataset.GT.n_alt_alleles(),
    ...     covariates=[1, dataset.pheno.age, dataset.pheno.is_female])

    Warning
    -------
    :func:`.logistic_regression_rows` considers the same set of
    columns (i.e., samples, points) for every row, namely those columns for
    which **all** response variables and covariates are defined. For each row, missing values of
    `x` are mean-imputed over these columns. As in the example, the
    intercept covariate ``1`` must be included **explicitly** if desired.

    Notes
    -----
    This method performs, for each row, a significance test of the input
    variable in predicting a binary (case-control) response variable based
    on the logistic regression model. The response variable type must either
    be numeric (with all present values 0 or 1) or Boolean, in which case
    true and false are coded as 1 and 0, respectively.

    Hail supports the Wald test ('wald'), likelihood ratio test ('lrt'),
    Rao score test ('score'), and Firth test ('firth'). Hail only includes
    columns for which the response variable and all covariates are defined.
    For each row, Hail imputes missing input values as the mean of the
    non-missing values.

    The example above considers a model of the form

    .. math::

        \mathrm{Prob}(\mathrm{is\_case}) =
            \mathrm{sigmoid}(\beta_0 + \beta_1 \, \mathrm{gt}
                            + \beta_2 \, \mathrm{age}
                            + \beta_3 \, \mathrm{is\_female} + \varepsilon),
        \quad
        \varepsilon \sim \mathrm{N}(0, \sigma^2)

    where :math:`\mathrm{sigmoid}` is the `sigmoid function`_, the genotype
    :math:`\mathrm{gt}` is coded as 0 for HomRef, 1 for Het, and 2 for
    HomVar, and the Boolean covariate :math:`\mathrm{is\_female}` is coded as
    1 for ``True`` (female) and 0 for ``False`` (male). The null model sets
    :math:`\beta_1 = 0`.

    .. _sigmoid function: https://en.wikipedia.org/wiki/Sigmoid_function

    The structure of the emitted row field depends on the test statistic as
    shown in the tables below.

    ========== ================== ======= ============================================
    Test       Field              Type    Value
    ========== ================== ======= ============================================
    Wald       `beta`             float64 fit effect coefficient,
                                          :math:`\hat\beta_1`
    Wald       `standard_error`   float64 estimated standard error,
                                          :math:`\widehat{\mathrm{se}}`
    Wald       `z_stat`           float64 Wald :math:`z`-statistic, equal to
                                          :math:`\hat\beta_1 / \widehat{\mathrm{se}}`
    Wald       `p_value`          float64 Wald p-value testing :math:`\beta_1 = 0`
    LRT, Firth `beta`             float64 fit effect coefficient,
                                          :math:`\hat\beta_1`
    LRT, Firth `chi_sq_stat`      float64 deviance statistic
    LRT, Firth `p_value`          float64 LRT / Firth p-value testing
                                          :math:`\beta_1 = 0`
    Score      `chi_sq_stat`      float64 score statistic
    Score      `p_value`          float64 score p-value testing :math:`\beta_1 = 0`
    ========== ================== ======= ============================================

    For the Wald and likelihood ratio tests, Hail fits the logistic model for
    each row using Newton iteration and only emits the above fields
    when the maximum likelihood estimate of the coefficients converges. The
    Firth test uses a modified form of Newton iteration. To help diagnose
    convergence issues, Hail also emits three fields which summarize the
    iterative fitting process:

    ================ =================== ======= ===============================
    Test             Field               Type    Value
    ================ =================== ======= ===============================
    Wald, LRT, Firth `fit.n_iterations`  int32   number of iterations until
                                                 convergence, explosion, or
                                                 reaching the max (25 for
                                                 Wald, LRT; 100 for Firth)
    Wald, LRT, Firth `fit.converged`     bool    ``True`` if iteration converged
    Wald, LRT, Firth `fit.exploded`      bool    ``True`` if iteration exploded
    ================ =================== ======= ===============================

    We consider iteration to have converged when every coordinate of
    :math:`\beta` changes by less than :math:`10^{-6}`. For Wald and LRT,
    up to 25 iterations are attempted; in testing we find 4 or 5 iterations
    nearly always suffice. Convergence may also fail due to explosion,
    which refers to low-level numerical linear algebra exceptions caused by
    manipulating ill-conditioned matrices. Explosion may result from (nearly)
    linearly dependent covariates or complete separation_.

    .. _separation: https://en.wikipedia.org/wiki/Separation_(statistics)

    A more common situation in genetics is quasi-complete separation, e.g.
    variants that are observed only in cases (or controls). Such variants
    inevitably arise when testing millions of variants with very low minor
    allele count. The maximum likelihood estimate of :math:`\beta` under
    logistic regression is then undefined but convergence may still occur
    after a large number of iterations due to a very flat likelihood
    surface. In testing, we find that such variants produce a secondary bump
    from 10 to 15 iterations in the histogram of number of iterations per
    variant. We also find that this faux convergence produces large standard
    errors and large (insignificant) p-values. To not miss such variants,
    consider using Firth logistic regression, linear regression, or
    group-based tests.

    Here's a concrete illustration of quasi-complete separation in R. Suppose
    we have 2010 samples distributed as follows for a particular variant:

    ======= ====== === ======
    Status  HomRef Het HomVar
    ======= ====== === ======
    Case    1000   10  0
    Control 1000   0   0
    ======= ====== === ======

    The following R code fits the (standard) logistic, Firth logistic,
    and linear regression models to this data, where ``x`` is genotype,
    ``y`` is phenotype, and ``logistf`` is from the logistf package:

    .. code-block:: R

        x <- c(rep(0,1000), rep(1,1000), rep(1,10))
        y <- c(rep(0,1000), rep(0,1000), rep(1,10))
        logfit <- glm(y ~ x, family=binomial())
        firthfit <- logistf(y ~ x)
        linfit <- lm(y ~ x)

    The resulting p-values for the genotype coefficient are 0.991, 0.00085,
    and 0.0016, respectively. The erroneous value 0.991 is due to
    quasi-complete separation. Moving one of the 10 hets from case to control
    eliminates this quasi-complete separation; the p-values from R are then
    0.0373, 0.0111, and 0.0116, respectively, as expected for a less
    significant association.

    The Firth test reduces bias from small counts and resolves the issue of
    separation by penalizing maximum likelihood estimation by the `Jeffreys
    invariant prior <https://en.wikipedia.org/wiki/Jeffreys_prior>`__. This
    test is slower, as both the null and full model must be fit per variant,
    and convergence of the modified Newton method is linear rather than
    quadratic. For Firth, 100 iterations are attempted for the null model
    and, if that is successful, for the full model as well. In testing we
    find 20 iterations nearly always suffices. If the null model fails to
    converge, then the `logreg.fit` fields reflect the null model;
    otherwise, they reflect the full model.

    See
    `Recommended joint and meta-analysis strategies for case-control association testing of single low-count variants <http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4049324/>`__
    for an empirical comparison of the logistic Wald, LRT, score, and Firth
    tests. The theoretical foundations of the Wald, likelihood ratio, and score
    tests may be found in Chapter 3 of Gesine Reinert's notes
    `Statistical Theory <http://www.stats.ox.ac.uk/~reinert/stattheory/theoryshort09.pdf>`__.
    Firth introduced his approach in
    `Bias reduction of maximum likelihood estimates, 1993 <http://www2.stat.duke.edu/~scs/Courses/Stat376/Papers/GibbsFieldEst/BiasReductionMLE.pdf>`__.
    Heinze and Schemper further analyze Firth's approach in
    `A solution to the problem of separation in logistic regression, 2002 <https://cemsiis.meduniwien.ac.at/fileadmin/msi_akim/CeMSIIS/KB/volltexte/Heinze_Schemper_2002_Statistics_in_Medicine.pdf>`__.

    Hail's logistic regression tests correspond to the ``b.wald``,
    ``b.lrt``, and ``b.score`` tests in `EPACTS`_. For each variant, Hail
    imputes missing input values as the mean of non-missing input values,
    whereas EPACTS subsets to those samples with called genotypes. Hence,
    Hail and EPACTS results will currently only agree for variants with no
    missing genotypes.

    .. _EPACTS: http://genome.sph.umich.edu/wiki/EPACTS#Single_Variant_Tests

    Note
    ----
    Use the `pass_through` parameter to include additional row fields from
    matrix table underlying ``x``. For example, to include an "rsid" field, set
    ``pass_through=['rsid']`` or ``pass_through=[mt.rsid]``.

    Parameters
    ----------
    test : {'wald', 'lrt', 'score', 'firth'}
        Statistical test.
    y : :class:`.Float64Expression` or :obj:`list` of :class:`.Float64Expression`
        One or more column-indexed response expressions.
        All non-missing values must evaluate to 0 or 1.
        Note that a :class:`.BooleanExpression` will be implicitly converted to
        a :class:`.Float64Expression` with this property.
    x : :class:`.Float64Expression`
        Entry-indexed expression for input variable.
    covariates : :obj:`list` of :class:`.Float64Expression`
        Non-empty list of column-indexed covariate expressions.
    pass_through : :obj:`list` of :obj:`str` or :class:`.Expression`
        Additional row fields to include in the resulting table.

    Returns
    -------
    :class:`.Table`
    """
    if len(covariates) == 0:
        raise ValueError('logistic regression requires at least one covariate expression')

    # NOTE: the method-context strings previously misspelled the method name
    # ('logistic_regresion_rows'); fixed to match the messages emitted below.
    mt = matrix_table_source('logistic_regression_rows/x', x)
    check_entry_indexed('logistic_regression_rows/x', x)

    y_is_list = isinstance(y, list)
    if y_is_list and len(y) == 0:
        raise ValueError("'logistic_regression_rows': found no values for 'y'")

    y = wrap_to_list(y)

    for e in covariates:
        analyze('logistic_regression_rows/covariates', e, mt._col_indices)

    _warn_if_no_intercept('logistic_regression_rows', covariates)

    # Materialize all expressions as uniquely-named fields so the backend can
    # reference them by name in its configuration.
    x_field_name = Env.get_uid()
    y_field = [f'__y_{i}' for i in range(len(y))]

    y_dict = dict(zip(y_field, y))

    cov_field_names = [f'__cov{i}' for i in range(len(covariates))]
    row_fields = _get_regression_row_fields(mt, pass_through, 'logistic_regression_rows')

    # FIXME: selecting an existing entry field should be emitted as a SelectFields
    mt = mt._select_all(col_exprs=dict(**y_dict,
                                       **dict(zip(cov_field_names, covariates))),
                        row_exprs=row_fields,
                        col_key=[],
                        entry_exprs={x_field_name: x})

    config = {
        'name': 'LogisticRegression',
        'test': test,
        'yFields': y_field,
        'xField': x_field_name,
        'covFields': cov_field_names,
        'passThrough': [x for x in row_fields if x not in mt.row_key]
    }

    result = Table(MatrixToTableApply(mt._mir, config))

    if not y_is_list:
        # scalar `y`: flatten the length-1 result struct array into top-level fields
        result = result.transmute(**result.logistic_regression[0])

    return result.persist()
@typecheck(test=enumeration('wald', 'lrt', 'score'),
           y=expr_float64,
           x=expr_float64,
           covariates=sequenceof(expr_float64),
           pass_through=sequenceof(oneof(str, Expression)))
def poisson_regression_rows(test, y, x, covariates, pass_through=()) -> Table:
    r"""For each row, test an input variable for association with a
    count response variable using `Poisson regression <https://en.wikipedia.org/wiki/Poisson_regression>`__.

    Notes
    -----
    See :func:`.logistic_regression_rows` for more info on statistical tests
    of general linear models.

    Note
    ----
    Use the `pass_through` parameter to include additional row fields from
    matrix table underlying ``x``. For example, to include an "rsid" field, set
    ``pass_through=['rsid']`` or ``pass_through=[mt.rsid]``.

    Parameters
    ----------
    test : :obj:`str`
        Statistical test, one of: 'wald', 'lrt', or 'score'.
    y : :class:`.Float64Expression`
        Column-indexed response expression.
        All non-missing values must evaluate to a non-negative integer.
    x : :class:`.Float64Expression`
        Entry-indexed expression for input variable.
    covariates : :obj:`list` of :class:`.Float64Expression`
        Non-empty list of column-indexed covariate expressions.
    pass_through : :obj:`list` of :obj:`str` or :class:`.Expression`
        Additional row fields to include in the resulting table.

    Returns
    -------
    :class:`.Table`
    """
    if len(covariates) == 0:
        raise ValueError('Poisson regression requires at least one covariate expression')
    mt = matrix_table_source('poisson_regression_rows/x', x)
    check_entry_indexed('poisson_regression_rows/x', x)
    # Validate that y and every covariate are column-indexed over the same source.
    analyze('poisson_regression_rows/y', y, mt._col_indices)
    for e in covariates:
        analyze('poisson_regression_rows/covariates', e, mt._col_indices)
    _warn_if_no_intercept('poisson_regression_rows', covariates)

    # Copy inputs into uniquely/predictably named fields so the backend config
    # below can reference them without colliding with existing fields.
    x_field_name = Env.get_uid()
    y_field_name = '__y'
    cov_field_names = [f'__cov{i}' for i in range(len(covariates))]
    row_fields = _get_regression_row_fields(mt, pass_through, 'poisson_regression_rows')

    # FIXME: selecting an existing entry field should be emitted as a SelectFields
    mt = mt._select_all(col_exprs=dict(**{y_field_name: y},
                                      **dict(zip(cov_field_names, covariates))),
                        row_exprs=row_fields,
                        col_key=[],
                        entry_exprs={x_field_name: x})

    config = {
        'name': 'PoissonRegression',
        'test': test,
        'yField': y_field_name,
        'xField': x_field_name,
        'covFields': cov_field_names,
        # Row-key fields are always present in the result; only forward the rest.
        'passThrough': [x for x in row_fields if x not in mt.row_key]
    }

    return Table(MatrixToTableApply(mt._mir, config)).persist()
@typecheck(y=expr_float64,
           x=sequenceof(expr_float64),
           z_t=nullable(expr_float64),
           k=nullable(np.ndarray),
           p_path=nullable(str),
           overwrite=bool,
           standardize=bool,
           mean_impute=bool)
def linear_mixed_model(y,
                       x,
                       z_t=None,
                       k=None,
                       p_path=None,
                       overwrite=False,
                       standardize=True,
                       mean_impute=True):
    r"""Initialize a linear mixed model from a matrix table.

    Examples
    --------
    Initialize a model using three fixed effects (including intercept) and
    genetic marker random effects:

    >>> marker_ds = dataset.filter_rows(dataset.use_as_marker)
    >>> model, _ = hl.linear_mixed_model(
    ...     y=marker_ds.pheno.height,
    ...     x=[1, marker_ds.pheno.age, marker_ds.pheno.is_female],
    ...     z_t=marker_ds.GT.n_alt_alleles(),
    ...     p_path='output/p.bm')

    Fit the model and examine :math:`h^2`:

    >>> model.fit()
    >>> model.h_sq

    Sanity-check the normalized likelihood of :math:`h^2` over the percentile
    grid:

    >>> import matplotlib.pyplot as plt  # doctest: +SKIP
    >>> plt.plot(range(101), model.h_sq_normalized_lkhd())  # doctest: +SKIP

    For this value of :math:`h^2`, test each variant for association:

    >>> result_table = hl.linear_mixed_regression_rows(dataset.GT.n_alt_alleles(), model)

    Alternatively, one can define a full-rank model using a pre-computed kinship
    matrix :math:`K` in ndarray form. When :math:`K` is the realized
    relationship matrix defined by the genetic markers, we obtain the same model
    as above with :math:`P` written as a block matrix but returned as an
    ndarray:

    >>> rrm = hl.realized_relationship_matrix(marker_ds.GT).to_numpy()
    >>> model, p = hl.linear_mixed_model(
    ...     y=dataset.pheno.height,
    ...     x=[1, dataset.pheno.age, dataset.pheno.is_female],
    ...     k=rrm,
    ...     p_path='output/p.bm',
    ...     overwrite=True)

    Notes
    -----
    See :class:`.LinearMixedModel` for details on the model and notation.

    Exactly one of `z_t` and `k` must be set.

    If `z_t` is set, the model is low-rank if the number of samples :math:`n` exceeds
    the number of random effects :math:`m`. At least one dimension must be less
    than or equal to 46300. If `standardize` is true, each random effect is first
    standardized to have mean 0 and variance :math:`\frac{1}{m}`, so that the
    diagonal values of the kinship matrix `K = ZZ^T` are 1.0 in expectation.
    This kinship matrix corresponds to the :meth:`realized_relationship_matrix`
    in genetics. See :meth:`.LinearMixedModel.from_random_effects`
    and :meth:`.BlockMatrix.svd` for more details.

    If `k` is set, the model is full-rank. For correct results, the indices of
    `k` **must be aligned** with columns of the source of `y`.
    Set `p_path` if you plan to use the model in :meth:`.linear_mixed_regression_rows`.
    `k` must be positive semi-definite; symmetry is not checked as only the
    lower triangle is used. See :meth:`.LinearMixedModel.from_kinship` for more
    details.

    Missing, nan, or infinite values in `y` or `x` will raise an error.
    If set, `z_t` may only have missing values if `mean_impute` is true, in
    which case missing values are set to the row mean. We recommend setting
    `mean_impute` to false if you expect no missing values, both for performance
    and as a sanity check.

    Warning
    -------
    If the rows of the matrix table have been filtered to a small fraction,
    then use :meth:`.MatrixTable.repartition` before this method to improve
    performance.

    Parameters
    ----------
    y: :class:`.Float64Expression`
        Column-indexed expression for the observations (rows of :math:`y`).
        Must have no missing values.
    x: :obj:`list` of :class:`.Float64Expression`
        Non-empty list of column-indexed expressions for the fixed effects (rows of :math:`X`).
        Each expression must have the same source as `y` or no source
        (e.g., the intercept ``1.0``).
        Must have no missing values.
    z_t: :class:`.Float64Expression`, optional
        Entry-indexed expression for each mixed effect. These values are
        row-standardized to variance :math:`1 / m` to form the entries of
        :math:`Z^T`. If `mean_impute` is false, must have no missing values.
        Exactly one of `z_t` and `k` must be set.
    k: :class:`ndarray`, optional
        Kinship matrix :math:`K`.
        Exactly one of `z_t` and `k` must be set.
    p_path: :obj:`str`, optional
        Path at which to write the projection :math:`P` as a block matrix.
        Required if `z_t` is set.
    overwrite: :obj:`bool`
        If ``True``, overwrite an existing file at `p_path`.
    standardize: :obj:`bool`
        If ``True``, standardize `z_t` by row to mean 0 and variance
        :math:`\frac{1}{m}`.
    mean_impute: :obj:`bool`
        If ``True``, mean-impute missing values of `z_t` by row.

    Returns
    -------
    model: :class:`.LinearMixedModel`
        Linear mixed model ready to be fit.
    p: :class:`ndarray` or :class:`.BlockMatrix`
        Matrix :math:`P` whose rows are the eigenvectors of :math:`K`.
        The type is block matrix if the model is low rank (i.e., if `z_t` is set
        and :math:`n > m`).
    """
    source = matrix_table_source('linear_mixed_model/y', y)

    # Exactly one of the two random-effect specifications must be provided.
    if ((z_t is None and k is None) or
            (z_t is not None and k is not None)):
        raise ValueError("linear_mixed_model: set exactly one of 'z_t' and 'k'")

    if len(x) == 0:
        raise ValueError("linear_mixed_model: 'x' must include at least one fixed effect")

    _warn_if_no_intercept('linear_mixed_model', x)

    # collect x and y in one pass
    # Each collected element is [x_0, ..., x_{len(x)-1}, y]; the flat array is
    # reshaped to (n, len(x) + 1) and split into the design matrix and response.
    mt = source.select_cols(xy=hl.array(x + [y])).key_cols_by()
    xy = np.array(mt.xy.collect(), dtype=np.float64)
    xy = xy.reshape(xy.size // (len(x) + 1), len(x) + 1)
    x_nd = np.copy(xy[:, :-1])  # fixed effects X, one row per sample
    y_nd = np.copy(xy[:, -1])   # response vector y
    n = y_nd.size
    del xy  # free the combined buffer before further allocation

    # Reject missing (NaN from missing collect), nan, or infinite values.
    if not np.all(np.isfinite(y_nd)):
        raise ValueError("linear_mixed_model: 'y' has missing, nan, or infinite values")
    if not np.all(np.isfinite(x_nd)):
        raise ValueError("linear_mixed_model: 'x' has missing, nan, or infinite values")

    if z_t is None:
        # Full-rank model from a pre-computed kinship matrix.
        model, p = LinearMixedModel.from_kinship(y_nd, x_nd, k, p_path, overwrite)
    else:
        # Low- or full-rank model from per-entry random effects; z_t must share
        # the same matrix table source as y.
        check_entry_indexed('from_matrix_table: z_t', z_t)
        if matrix_table_source('linear_mixed_model/z_t', z_t) != source:
            raise ValueError("linear_mixed_model: 'y' and 'z_t' must "
                             "have the same source")
        z_bm = BlockMatrix.from_entry_expr(z_t,
                                           mean_impute=mean_impute,
                                           center=standardize,
                                           normalize=standardize).T  # variance is 1 / n
        m = z_bm.shape[1]
        model, p = LinearMixedModel.from_random_effects(y_nd, x_nd, z_bm, p_path, overwrite)
        if standardize:
            model.s = model.s * (n / m)  # now variance is 1 / m
        if model.low_rank and isinstance(p, np.ndarray):
            # In the low-rank case, return P as a block matrix read back from
            # p_path rather than the in-memory ndarray.
            assert n > m
            p = BlockMatrix.read(p_path)

    return model, p
@typecheck(entry_expr=expr_float64,
model=LinearMixedModel,
pa_t_path=nullable(str),
a_t_path=nullable(str),
mean_impute=bool,
partition_size=nullable(int),
pass_through=sequenceof(oneof(str, Expression)))
def linear_mixed_regression_rows(entry_expr,
model,
pa_t_path=None,
a_t_path=None,
mean_impute=True,
partition_size=None,
pass_through=()):
"""For each row, test an input variable for association using a linear
mixed model.
Examples
--------
See the example in :meth:`linear_mixed_model` and section below on
efficiently testing multiple responses or sets of fixed effects.
Notes
-----
See :class:`.LinearMixedModel` for details on the model and notation.
This method packages up several steps for convenience:
1. Read the transformation :math:`P` from ``model.p_path``.
2. Write `entry_expr` at `a_t_path` as the block matrix :math:`A^T` with