Here we organize the Framingham data into the desired format and structure.

* Expression data: convert sample IDs to subject IDs so that they match the other data sets.
* Extract child-parent structure from the data.
* Split the expression data into a father matrix and a mother matrix.

In [1]:
# Input files (absolute paths).
# Expression summary table: read below as tab-separated text; its first column
# is treated as the probe/gene ID column and every other column as a sample.
expression_file = '/lambda_stor/data/yanyul/Framingham/apt-gene/rma-sketch.summary.txt'
# Pedigree table (gzip-compressed, tab-separated) with shareid / fshare / mshare columns.
pedigree_file = '/lambda_stor/data/yanyul/Framingham/40031/PhenoGenotypeFiles/RootStudyConsentSet_phs000007.Framingham.v23.p8.c1.HMB-IRB-MDS/PhenotypeFiles/phs000007.v23.pht000183.v10.p8.Framingham_Pedigree.MULTI.txt.gz'
# Release manifest used to map each expression SampleID to a SubjectID.
map_sample2subject = '/lambda_stor/data/yanyul/Framingham/43832/PhenoGenotypeFiles/ChildStudyConsentSet_phs000363.Framingham.v12.p9.c2.HMB-IRB-NPU-MDS/ExpressionFiles/phe000002.v5.FHS_SABRe_project3.sample-info.MULTI/phe000002.v5_release_manifest.txt'
# Genotype VCF (gzip); only its header is read, to list the genotyped individuals.
genotype_file = '/lambda_stor/data/yanyul/Framingham/imputed_hrc1.1/chr1.dose.vcf.gz'
# Probe-to-gene annotation; in the visible cells it is referenced only by commented-out code.
gene_annot_file = '/lambda_stor/data/yanyul/Framingham/haplotype_po_framingham/preprocess/microarray_gene_annotation.tsv'

In [2]:
# Output files. Each one is written only when it does not already exist
# (the writing cells are guarded by os.path.exists).
# Pedigree rows that survive the filtering steps.
pedigree_out = '/lambda_stor/data/yanyul/Framingham/haplotype_po_framingham/preprocess/extracted_pedigree.tsv.gz'
# Normalized expression matrix restricted to individuals listed in the genotype file.
expression_full_out = '/lambda_stor/data/yanyul/Framingham/haplotype_po_framingham/preprocess/expression.all_indiv_w_genotype.tsv.gz'
# Disabled: separate father / mother expression outputs.
# expression_f_out = '/lambda_stor/data/yanyul/Framingham/haplotype_po_framingham/preprocess/expression.father.tsv.gz'
# expression_m_out = '/lambda_stor/data/yanyul/Framingham/haplotype_po_framingham/preprocess/expression.mother.tsv.gz'
# Plain-text list (one ID per line) of the individuals in the saved expression matrix.
active_individual_list_out = '/lambda_stor/data/yanyul/Framingham/haplotype_po_framingham/preprocess/all_indiv_w_genotype.txt'

In [3]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import gzip
import matplotlib.pyplot as plt
import os

In [4]:
def quantile_norm(x):
    """Map the values of a 1-D sample to standard-normal quantiles by rank.

    The smallest value gets rank 1 and the largest rank n; rank r is mapped to
    the standard-normal quantile at probability r / (n + 1), so every output
    value is finite.

    Parameters
    ----------
    x : array-like
        One-dimensional numeric data (NumPy array, pandas Series, list, ...).

    Returns
    -------
    numpy.ndarray
        Float array with the same length and ordering as ``x``.

    Raises
    ------
    ValueError
        If ``x`` is not one-dimensional.

    Notes
    -----
    Tied values are not averaged: they receive distinct consecutive ranks in
    order of appearance (stable sort), which keeps the result deterministic.
    """
    values = np.asarray(x)
    if values.ndim != 1:
        raise ValueError(
            'quantile_norm expects a 1-D input, got %d dimension(s)' % values.ndim
        )
    n = values.shape[0]
    # order[k] is the position of the k-th smallest value.
    order = np.argsort(values, kind='stable')
    # Invert the permutation to obtain the 1-based rank of every position.
    ranks = np.empty(n, dtype=np.intp)
    ranks[order] = np.arange(1, n + 1)
    return stats.norm.ppf(ranks / (n + 1), loc=0, scale=1)

In [5]:
# Read the expression table (tab-separated, '#' marks comments) and make sure
# the probeset IDs are handled as strings.
df_expr = pd.read_csv(
    expression_file,
    sep='\t',
    header=0,
    comment='#',
)
df_expr['probeset_id'] = df_expr['probeset_id'].astype(str)

In [6]:
# Read the manifest linking each SampleID of the expression matrix to the
# SubjectID used elsewhere; SubjectID is kept as text.
df_map = pd.read_csv(
    map_sample2subject,
    sep='\t',
    header=0,
    comment='#',
    dtype={'SubjectID': str},
)

In [7]:
# Split the expression header: every column after the first is a SampleID,
# and the name of the first (gene/probe ID) column is remembered for later.
_expr_header = df_expr.columns.tolist()
df_expr_columns = pd.DataFrame({'sampleID': _expr_header[1:]})
first_col_expr = _expr_header[0]

In [8]:
# Attach SubjectID and the remaining manifest fields to each SampleID;
# samples absent from the manifest are dropped (inner join).
df_expr_columns = df_expr_columns.merge(
    df_map,
    how='inner',
    left_on='sampleID',
    right_on='SampleID',
)

In [9]:
# Restrict to samples with SubjectConsent equal to 1 (the others may not be used)
# and renumber the rows from 0.
_has_consent = df_expr_columns['SubjectConsent'] == 1
df_expr_columns_consent = df_expr_columns.loc[_has_consent].reset_index(drop=True)

In [10]:
# Columns to pull from the expression matrix: the ID column followed by
# every consented SampleID.
desired_cols = [first_col_expr] + df_expr_columns_consent.SampleID.tolist()
# SampleID -> SubjectID lookup used to relabel those columns. The two columns
# are paired positionally, so this does not depend on the frame's index labels.
rename_dict = dict(
    zip(df_expr_columns_consent.SampleID, df_expr_columns_consent.SubjectID)
)

In [11]:
# Keep only the wanted columns, then relabel each SampleID column with its SubjectID.
_expr_subset = df_expr.loc[:, desired_cols]
df_expr_extracted = _expr_subset.rename(columns=rename_dict)

In [12]:
# Read the individual IDs from the genotype file header: skip the '##'
# meta-information lines and take every field after the first nine columns
# of the '#CHROM' line.
indiv_list = None
with gzip.open(genotype_file, 'rt') as f:
    for line in f:
        if line.startswith('##'):
            continue  # meta-information line
        if line.startswith('#CHROM'):
            indiv_list = line.strip().split('\t')[9:]
        # Stop here: either the header was found, or the records started
        # without one and scanning the rest of the file would be pointless.
        break
if indiv_list is None:
    raise ValueError('no #CHROM header line found in ' + genotype_file)
df_geno_indiv = pd.DataFrame({'SampleID': indiv_list})

In [13]:
# Keep only the expression columns whose label is an individual listed in the
# genotype file (work on an independent copy).
_in_genotype = df_expr_extracted.columns.isin(df_geno_indiv.SampleID)
df_expr_extracted_indiv = df_expr_extracted.loc[:, _in_genotype].copy()

In [14]:
# Quantile-normalize every probe (row) across individuals.
# All rows are transformed in one pass over the underlying array rather than
# assigned one by one through .loc, which was slow and only addressed the right
# rows while the row labels were exactly 0..n-1.
_expr_values = np.asarray(df_expr_extracted_indiv, dtype=float)
if _expr_values.size:
    # np.apply_along_axis rejects empty input, hence the guard.
    _expr_values = np.apply_along_axis(quantile_norm, 1, _expr_values)
df_expr_extracted_indiv = pd.DataFrame(
    _expr_values,
    index=df_expr_extracted_indiv.index,
    columns=df_expr_extracted_indiv.columns,
)
# Put the probe ID column back in front of the normalized values.
df_expr_extracted_probe = df_expr_extracted[['probeset_id']]
df_expr_extracted = pd.concat((df_expr_extracted_probe, df_expr_extracted_indiv), axis=1)

In [15]:
# Write the full expression matrix (used to compute PEER factors) unless the
# file is already there from an earlier run.
if not os.path.exists(expression_full_out):
    df_expr_extracted.to_csv(
        expression_full_out,
        sep='\t',
        index=False,
        compression='gzip',
    )

In [16]:
# Write the individuals that appear in the full expression matrix, one per line
# (every column after the probe ID column); PCA will be limited to these
# individuals. Nothing is written if the file already exists.
if not os.path.exists(active_individual_list_out):
    _active_individuals = df_expr_extracted.columns[1:]
    with open(active_individual_list_out, 'w') as f:
        f.writelines(indiv + '\n' for indiv in _active_individuals)

In [17]:
# # load probe to gene id map
# # and add gene id to probe
# df_probe2gene = pd.read_csv(gene_annot_file, sep='\t', dtype={'probeset_id': str})
# df_expr_extracted = pd.merge(df_expr_extracted, df_probe2gene, 
#                                    left_on='probeset_id', right_on='probeset_id', how='left')

In [18]:
# # shape of the extract expression matrix
# df_expr_extracted.shape

In [19]:
# Read the gzip-compressed pedigree table; the individual, father and mother
# ID columns are kept as strings.
df_pedigree = pd.read_csv(
    pedigree_file,
    sep='\t',
    header=0,
    comment='#',
    compression='gzip',
    dtype={'shareid': str, 'fshare': str, 'mshare': str},
)

In [20]:
# Keep only individuals for whom both the father (fshare) and the mother
# (mshare) are non-missing.
df_pedigree_complete = df_pedigree.dropna(subset=['fshare', 'mshare'], how='any')

In [21]:
# Keep only rows where the father, the mother and the individual itself are
# all columns of the expression matrix.
_expr_ids = df_expr_extracted.columns
_father_ok = df_pedigree_complete['fshare'].isin(_expr_ids)
_mother_ok = df_pedigree_complete['mshare'].isin(_expr_ids)
_child_ok = df_pedigree_complete['shareid'].isin(_expr_ids)
df_pedigree_complete = df_pedigree_complete.loc[_father_ok & _mother_ok & _child_ok]

In [22]:
# shape (rows, columns) of the pedigree left after the filters above
df_pedigree_complete.shape

(550, 10)

In [23]:
# DEPRECATED
# furthermore, we require all individuals in the extracted pedigree to occur in genotype file
# df_pedigree_complete = df_pedigree_complete[ df_pedigree_complete.fshare.isin(df_geno_indiv.SampleID) & 
#                                            df_pedigree_complete.mshare.isin(df_geno_indiv.SampleID) &
#                                            df_pedigree_complete.shareid.isin(df_geno_indiv.SampleID) ]

In [24]:
# Require every father and every mother to occur only once: a row is kept when
# it is the first occurrence of both its father ID and its mother ID.
_first_for_father = ~df_pedigree_complete['fshare'].duplicated(keep='first')
_first_for_mother = ~df_pedigree_complete['mshare'].duplicated(keep='first')
df_pedigree_complete = df_pedigree_complete.loc[_first_for_father & _first_for_mother]

In [25]:
# shape (rows, columns) of the pedigree after restricting each parent to a single row
df_pedigree_complete.shape

(266, 10)

In [26]:
# Persist the extracted pedigree; later processing builds on this file.
# An existing file is left untouched.
if not os.path.exists(pedigree_out):
    df_pedigree_complete.to_csv(
        pedigree_out,
        sep='\t',
        index=False,
        compression='gzip',
    )

In [27]:
# # next, we extract expression matrix of the fathers and the mothers
# # indiv x gene (add another column to store the SampleID of the father/mother)
# dict_expr_parents = {} 
# for parent in ['fshare', 'mshare']:
#     dict_expr_parents[parent] = df_expr_extracted[ df_pedigree_complete[parent].tolist() ].T
#     # rename the column with geneID
#     # and add SampleID of the fathers as a column
#     dict_expr_parents[parent].columns = df_expr_extracted[first_col_expr].tolist()
#     dict_expr_parents[parent]['pSampleID'] = dict_expr_parents[parent].index
#     dict_expr_parents[parent] = dict_expr_parents[parent].reset_index(drop=True)
#     # add SampleID of the corresponding child
#     dict_expr_parents[parent] = pd.merge(
#         dict_expr_parents[parent], df_pedigree_complete[['shareid', parent]], 
#         left_on='pSampleID', right_on=parent, how='left'
#     )
#     # clean up the redundant columns
#     dict_expr_parents[parent] = dict_expr_parents[parent].drop(columns=[parent]).rename(columns={'shareid': 'SampleID'})