In [5]:
import numpy as np
import hail as hl
from hail import methods
import scipy as sp
import pandas as pd
from math import sqrt, pi

## CREATE GENETIC DATA (and clean/process/edit)

In [7]:
# Create genetic data and write to disk
bnm_mt = hl.balding_nichols_model(3, 100, 1000)
bnm_mt.write("balding_nichols_3_100_1000.mt")

2020-07-20 11:34:17 Hail: INFO: balding_nichols_model: generating genotypes for 3 populations, 100 samples, and 1000 variants...
2020-07-20 11:34:18 Hail: INFO: Coerced sorted dataset
2020-07-20 11:34:21 Hail: INFO: wrote matrix table with 1000 rows and 100 columns in 8 partitions to balding_nichols_3_100_1000.mt


In [8]:
# Read first MatrixTable and clean

# entries are now calls: An object that represents an individual’s call at a genomic locus
mt = hl.read_matrix_table("balding_nichols_3_100_1000.mt")

# don't understand meaning of this: returns the count of non-reference alleles from each call
mt = mt.transmute_entries(n_alt = hl.float64(mt.GT.n_alt_alleles())) 

mt.describe()

----------------------------------------
Global fields:
    'bn': struct {
        n_populations: int32, 
        n_samples: int32, 
        n_variants: int32, 
        n_partitions: int32, 
        pop_dist: array<int32>, 
        fst: array<float64>, 
        mixture: bool
    }
----------------------------------------
Column fields:
    'sample_idx': int32
    'pop': int32
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'alleles': array<str>
    'ancestral_af': float64
    'af': array<float64>
----------------------------------------
Entry fields:
    'n_alt': float64
----------------------------------------
Column key: ['sample_idx']
Row key: ['locus', 'alleles']
----------------------------------------


In [9]:
# Turn MatrixTable into Table

ht = mt.localize_entries("ent", "sample")
ht.describe()

----------------------------------------
Global fields:
    'bn': struct {
        n_populations: int32, 
        n_samples: int32, 
        n_variants: int32, 
        n_partitions: int32, 
        pop_dist: array<int32>, 
        fst: array<float64>, 
        mixture: bool
    } 
    'sample': array<struct {
        sample_idx: int32, 
        pop: int32
    }> 
----------------------------------------
Row fields:
    'locus': locus<GRCh37> 
    'alleles': array<str> 
    'ancestral_af': float64 
    'af': array<float64> 
    'ent': array<struct {
        n_alt: float64
    }> 
----------------------------------------
Key: ['locus', 'alleles']
----------------------------------------


2020-07-20 11:34:26 Hail: WARN: Name collision: field 'sample' already in object dict. 
  This field must be referenced with __getitem__ syntax: obj['sample']


## Grouping and NDArray methods from Tim and Dan

In [10]:
# Functions for operating with Tables of ndarrays in Hail (from Tim)

from hail.expr import Expression, ExpressionException, \
    expr_float64, expr_call, expr_any, expr_numeric, expr_array, \
    expr_locus, \
    analyze, check_entry_indexed, check_row_indexed, \
    matrix_table_source, table_source

# Only groups by rows, NOT COLUMNS
def matrix_table_to_table_of_ndarrays(field, group_size, tmp_path = '/tmp/nd_table.ht'):
    """

    The returned table has two fields: 'row_group_number' and 'ndarray'.

    Examples
    --------
    >>> ht = matrix_table_to_table_of_ndarrays(mt.GT.n_alt_alleles(), 100)

    Parameters
    ----------
    field
    group_size
    tmp_path

    Returns
    -------

    """
    mt = matrix_table_source('matrix_table_to_table_of_ndarrays/x', field)
    mt = mt.select_entries(x = field)
    ht = mt.localize_entries(entries_array_field_name='entries')
    # now ht.entries is an array of structs with one field, x

    # we'll also want to mean-impute/variance-normalize/etc here
    ht = ht.select(xs = ht.entries.map(lambda e: e['x']))
    # now ht.xs is an array of float64

    # now need to produce groups of G
    ht = ht.add_index()
    ht = ht.group_by(row_group_number=ht.idx // group_size) \
        .aggregate(ndarray=hl.nd.array(hl.agg.collect(ht.xs)))
    # may require a .T on ndarray

    return ht.checkpoint(tmp_path, overwrite=True)

def chunk_ndarray(a, group_size):
    """Chunks a NDarray along the first axis in chunks of `group_size`.
    Parameters
    ----------
    a
    group_size

    Returns
    -------

    """
    n_groups = a.shape[0] // group_size
    groups = []
    for i in range(a.shape[0] // group_size):
        start = i * group_size
        end = (i + 1) * group_size
        groups.append(a[start:end, :])
    return groups

In [11]:
# function to multiply two blocks, given the two blocks
# returns struct in form of array but not ndarray? don't understand last line
def block_product(left, right):
    product = left @ right
    n_rows, n_cols = product.shape
    return hl.struct(
        shape=product.shape,
        block=hl.range(hl.int(n_rows * n_cols)).map(
            lambda absolute: product[absolute % n_rows, absolute // n_rows]))

def block_aggregate(prod):
    shape = prod.shape
    block = prod.block
    return hl.nd.from_column_major(
        hl.agg.array_sum(block),
        hl.agg.take(shape, 1)[0])

def to_column_major(ndarray):
    n_rows, n_cols = ndarray.shape
    return hl.range(hl.int(n_rows * n_cols)).map(
        lambda absolute: ndarray[absolute % n_rows, absolute // n_rows])

# hl.nd.from_column_major(thing.the_sum, thing.the_shape)

In [12]:
new_ht = matrix_table_to_table_of_ndarrays(mt.n_alt, 10)

2020-07-20 11:34:32 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-07-20 11:34:33 Hail: INFO: wrote table with 100 rows in 8 partitions to /tmp/nd_table.ht


In [13]:
new_ht.describe()

----------------------------------------
Global fields:
    'bn': struct {
        n_populations: int32, 
        n_samples: int32, 
        n_variants: int32, 
        n_partitions: int32, 
        pop_dist: array<int32>, 
        fst: array<float64>, 
        mixture: bool
    } 
----------------------------------------
Row fields:
    'row_group_number': int64 
    'ndarray': ndarray<float64, 2> 
----------------------------------------
Key: ['row_group_number']
----------------------------------------


## Blanczos Algorithm

In [14]:
def makeData(model_input, group_size):
    hold_mt = hl.balding_nichols_model(*model_input)
    hold_mt.write("balding_nichols_test.mt")
    hold_mt = hl.read_matrix_table("balding_nichols_test.mt")
    hold_mt = hold_mt.transmute_entries(n_alt = hl.float64(hold_mt.GT.n_alt_alleles())) 
    table = hold_mt.localize_entries("ent", "sample")
    return matrix_table_to_table_of_ndarrays(hold_mt.n_alt, group_size, tmp_path='/tmp/test_table.ht')
    
data = makeData((3, 100, 1000), 4)


2020-07-20 11:34:33 Hail: INFO: balding_nichols_model: generating genotypes for 3 populations, 100 samples, and 1000 variants...
2020-07-20 11:34:34 Hail: INFO: Coerced sorted dataset
2020-07-20 11:34:36 Hail: INFO: wrote matrix table with 1000 rows and 100 columns in 8 partitions to balding_nichols_test.mt
2020-07-20 11:34:38 Hail: INFO: Ordering unsorted dataset with network shuffle
2020-07-20 11:34:40 Hail: INFO: wrote table with 250 rows in 8 partitions to /tmp/test_table.ht


In [15]:
(n, m) = (100, 1000)
k = 50
l = k + 2
q = 0

In [16]:
G = hl.nd.array(np.random.normal(0, 1, (n,l)))

In [18]:
# WANT TO DO H0 = A @ G
# Multiply a row-blocked matrix by a local non-blocked matrix
# First step of algorithm

temp_matrix = data.annotate_globals(G = G)
# do matrix multiplication by converting from ndarray to array
AG = temp_matrix.annotate(prod = block_product(temp_matrix.ndarray, temp_matrix.G))
# convert back to ndarray
AG = AG.annotate(ndarray = hl.nd.from_column_major(AG.prod.block, hl.agg.take(AG.prod.shape, 1)[0] ))

#AG = AG.group_by(AG.prod).aggregate(result = block_aggregate(AG.prod)) # this actually sums all the blocks up


# drop extra information
AG = AG.select(AG.ndarray)
H0 = AG.drop(AG.G)
# final_result.describe()
# final_result.show() # NOTE: THIS GIVES KeyError 'row' when called - is this from John's ndarray show bug??

# assumes blocks in blocked matrix are named ndarray
def matmul_rowblocked_nonblocked(A, B):
    temp = A.annotate_globals(mat = B)
    temp = temp.annotate(prod = block_product(temp.ndarray, temp.mat))
    temp = temp.annotate(ndarray = hl.nd.from_column_major(temp.prod.block, hl.agg.take(temp.prod.shape, 1)[0]))
    temp = temp.select(temp.ndarray)
    temp = temp.drop(temp.mat)
    return temp

# H0 = matmul_rowblocked_nonblocked(data, G)
# H0.collect()

In [19]:
# WANT TO DO intermediate step At @ (A @ G) = At @ H0
# Multiply a column-blocked matrix by a row-blocked matrix 
# as a blockmatrix multiplcation and then sum
# Second step of algorithm

temp = data.annotate_globals(mat = H0)
temp = temp.annotate(prod = block_product(temp.ndarray.transpose(), temp.mat))
temp = temp.group_by(temp.prod).aggregate(ndarray = block_aggregate(temp.prod))
temp = temp.select(temp.ndarray)
temp = temp.drop(temp.mat)


# pass in matrix A normally, blocked in rows - this specifically expects A to need to be transposed
# assumes blocks in blocked matrix are named ndarray
def matmul_colblocked_rowblocked(A, B):
    temp = A.annotate_globals(mat = B)
    temp = temp.annotate(prod = block_product(temp.ndarray.transpose(), temp.mat))
    temp = temp.group_by(temp.prod).aggregate(ndarray = block_aggregate(temp.prod))
    temp = temp.select(temp.ndarray)
    temp = temp.drop(temp.mat)
    return temp

#G1 = matmul_colblocked_rowblocked(data, H0)

TypeError: annotate_globals: keyword argument 'mat': expected expression of type any, found hail.table.Table: <hail.table.Table object at 0x7fd56fd7f590>

In [None]:
data.describe()
G.describe()
#ht.transmute(ent = ht.ent.map(lambda x: x.n_alt))
AG = data.annotate_globals(G = G)
AG.describe()
AG = AG.annotate(prod = (AG.ndarray @ AG.G))


AG.describe()
AG.show()
matmul_product = AG.aggregate(hl.agg.array_agg(lambda element: hl.agg.sum(element), AG.prod))

#(hl.agg.array_sum([data.ndarray @ G]))

## Practice Scraps - linalg operations on small data, experimenting with Hail

In [None]:
# annotate_{rows, cols, entries}
# can annotate without aggregating but can also do an aggregation that is called an annotation?

# mt.annotate_rows(sum_of_ef1_by_row=hl.agg.sum(mt.ef1))
# Aggregate along each row of entries to create a new row annotation. Can
# reference column and entry fields in aggregations.

# mt.annotate_cols(sum_of_ef1_by_col=hl.agg.sum(mt.ef1))
# Aggregate along each column of entries to create a new col annotation.
# Can reference row and entry fields in aggregations

# need map-like aggregator to create "new" MatrixTable that is a product

In [None]:
# Practice doing a matrix multiplication and a transpose

# Make some python ndarrays
a = np.arange(30).reshape((5, 6))
b = np.arange(24).reshape((6, 4))

# Make Hail ndarrays
matrix_5_6 = hl.nd.array(a)
matrix_6_4 = hl.nd.array(b)

# Make MatrixTables from pandas dataframes
dfA = pd.DataFrame(data=a[1:,1:], index=a[1:,0], columns=a[0,1:])
dfB = pd.DataFrame(data=b[1:,1:], index=b[1:,0], columns=b[0,1:])

# How do these hail ndarrys carry across into MatrixTable use?

In [None]:
hl.eval(matrix_5_6) 

In [None]:
# dfA
# tableA = hl.Table.from_pandas(dfA)
# tableA.show()
# mtA = tableA.to_matrix_table(row_key=['1'], col_key=['2'])
# mtA.describe()
# new_htA = matrix_table_to_table_of_ndarrays(mtA.1, 2, PATH???)
# tableA.group_by(row_group_number=tableA.idx // 2)