In [185]:
from pathlib import Path

import numpy as np
import pandas as pd
import scipy.linalg

In [186]:
def create_diagonal_mask(low_to_high_map, target_value = 1):
    """Create a block diagonal mask matrix from the input mapping.

    Parameters
    ----------
    low_to_high_map : pandas.DataFrame
        Mapping table whose first column is the low level id (image, sample,
        or probe_id) and whose second column is the high level mapping (gene,
        region, donor). Exact duplicate rows are ignored. The input frame is
        not modified.
    target_value : scalar, default 1
        Value written inside each diagonal block. Can be set to np.nan.

    Returns
    -------
    pandas.DataFrame
        Square frame with one row and one column per (de-duplicated) mapping
        row, labelled by low level id. Entries are ``target_value`` where the
        row id and the column id share a high level value, and 0 elsewhere.
        An input that yields no groups gives an empty 0 x 0 frame.

    Notes
    -----
    Rows/columns are ordered group by group (groups in pandas' default sorted
    ``groupby`` order), so the row and column order will have to be
    rearranged to match your distance matrix. Rows whose high level value is
    missing are skipped by pandas' default ``groupby`` behaviour.
    """
    # drop_duplicates() returns a new frame rather than modifying in place, so
    # the result must be kept; otherwise repeated rows inflate their block and
    # produce duplicated row/column labels.
    unique_map = low_to_high_map.drop_duplicates()
    grouped = unique_map.groupby(unique_map.columns[1])

    ordered_low_level_names = []
    group_matrices = []
    for _, group in grouped:
        group_size = group.shape[0]
        # build up row/col names; order within a group doesn't matter, all members are equal
        ordered_low_level_names.extend(group.iloc[:, 0].tolist())
        # every pair of ids inside one group gets the target value
        group_matrices.append(
            np.full(shape=(group_size, group_size), fill_value=target_value)
        )

    if group_matrices:
        # place the per-group matrices along the diagonal (zeros elsewhere)
        relationship_matrix = scipy.linalg.block_diag(*group_matrices)
    else:
        # block_diag() with no blocks yields a (1, 0) array, which cannot be
        # labelled with zero names; build the empty square matrix directly.
        relationship_matrix = np.full(shape=(0, 0), fill_value=target_value)

    # convert to pandas dataframe and set names
    relationship_df = pd.DataFrame(
        relationship_matrix,
        columns=ordered_low_level_names,
        index=ordered_low_level_names,
    )
    return relationship_df

In [192]:
# Toy mapping used to demonstrate the mask: five probes (low level ids)
# assigned to three genes (high level ids).
data = [
    ['probe_a', 'gene_1'],
    ['probe_b', 'gene_1'],
    ['probe_c', 'gene_2'],
    ['probe_d', 'gene_3'],
    ['probe_e', 'gene_3'],
]

# No column names are supplied, so pandas labels the two columns 0 and 1.
df = pd.DataFrame(data)
df

Unnamed: 0,0,1
0,probe_a,gene_1
1,probe_b,gene_1
2,probe_c,gene_2
3,probe_d,gene_3
4,probe_e,gene_3


In [198]:
# Demo on the toy mapping: probes that share a gene form one block of 1s on the diagonal.
create_diagonal_mask(df, target_value = 1)

Unnamed: 0,probe_a,probe_b,probe_c,probe_d,probe_e
probe_a,1,1,0,0,0
probe_b,1,1,0,0,0
probe_c,0,0,1,0,0
probe_d,0,0,0,1,1
probe_e,0,0,0,1,1


In [214]:
# Probe-to-gene mapping file from Derek; runs in seconds.
# The path is built from the current user's home directory rather than one
# user's hard-coded absolute path, so the cell also runs under other accounts.
low_to_high_map = pd.read_csv(Path.home() / "Downloads" / "probe_to_genes.csv")

In [215]:
# Sanity-check the size of the loaded mapping as (rows, columns).
low_to_high_map.shape

(53145, 2)

In [217]:
# Build the full mask for the loaded low-to-high mapping.
mask = create_diagonal_mask(low_to_high_map, target_value = 1)
# Reminder: check the row/column order against your distance matrix before
# using the mask (the row labels are currently stored as the index).

In [220]:
# Preview a 9 x 9 window of the mask (positions 1-9 on both axes; position 0 is excluded).
mask.iloc[1:10, 1:10]

Unnamed: 0,CUST_9126_PI416261804,A_24_P151121,A_24_P721699,A_23_P104224,CUST_16906_PI416261804,A_24_P570378,A_23_P116898,CUST_13981_PI416261804,CUST_50_PI416408490
CUST_9126_PI416261804,1,1,0,0,0,0,0,0,0
A_24_P151121,1,1,0,0,0,0,0,0,0
A_24_P721699,0,0,1,0,0,0,0,0,0
A_23_P104224,0,0,0,1,1,1,0,0,0
CUST_16906_PI416261804,0,0,0,1,1,1,0,0,0
A_24_P570378,0,0,0,1,1,1,0,0,0
A_23_P116898,0,0,0,0,0,0,1,1,1
CUST_13981_PI416261804,0,0,0,0,0,0,1,1,1
CUST_50_PI416408490,0,0,0,0,0,0,1,1,1
