### Linearization

In [1]:
# Sanity check that the kernel is alive and evaluating cells.
2+2

4

In [2]:
import pandas as pd
import cyvcf2
import numpy as np
import subprocess, re, os, sys
from functools import reduce

In [3]:
# Root locations on the cluster filesystem; everything else is derived from `imlab_dir`.
imlab_dir = '/lus/grand/projects/TFXcan/imlab'
project_dir = f'{imlab_dir}/users/temi/projects/TFXcan'
data_dir = f'{imlab_dir}/data/freedman_data/peaks_liftover'
hg38_snps_vcf = f'{imlab_dir}/data/variants_data/hg38_snps/GCF_000001405.40.gz'

### read in the TFPred scores

In [4]:
# Load the four TFPred score tables (file suffixes .01 to .04), keyed by suffix number.
tfpred_dir = '/lus/grand/projects/TFXcan/imlab/users/temi/projects/TFXcan/experiments/compare_predictors/tfpred_scores'
tfpred_scores = {
    i: pd.read_csv(f'{tfpred_dir}/aggByCollect_AR_Prostate_1KG.linear.0{i}.txt')
    for i in range(1, 5)
}

In [5]:
# I only want loci that are present in all of them.
# Flattening every table's loci into one set would give the union (loci present in
# *any* table); intersecting the per-table sets keeps only the shared ones.
common_loci = list(set.intersection(*(set(a['locus']) for a in tfpred_scores.values())))
common_loci[1:5]

['chr18_25931686_25932586',
 'chr8_58480341_58480891',
 'chr21_46188336_46189486',
 'chr2_229512334_229513234']

In [6]:
# Restrict every score table to the rows whose locus is in `common_loci`.
tfpred_scores = {
    table_id: table[table['locus'].isin(common_loci)]
    for table_id, table in tfpred_scores.items()
}

In [7]:
# Join the score tables on `locus`, folding left to right, then preview the result.
tfpred_scored_dt = reduce(
    lambda left, right: left.merge(right, on = 'locus'),
    tfpred_scores.values(),
)
(tfpred_scored_dt.shape, tfpred_scored_dt.iloc[0:5, 0:5])

((18654, 2549),
                        locus   HG00096   HG00101   HG00105   HG00109
 0  chr10_102769243_102770143  0.795932  0.798003  0.780736  0.794168
 1    chr12_22624866_22625316  0.838293  0.844498  0.852188  0.843000
 2     chr9_91422768_91423068  0.626702  0.616125  0.613668  0.617772
 3     chr5_89473083_89473583  1.014096  1.011706  1.008893  1.005967
 4   chr2_225022133_225022683  0.754932  0.759293  0.773871  0.757823)

### ncbi mapping to chromosomes

In [8]:
# Tab-separated table relating NCBI sequence names/accessions to UCSC-style chromosome names.
ncbi_map = pd.read_csv(f'{project_dir}/metadata/ncbi_chr_maps.txt', sep='\t')
ncbi_map.head()

Unnamed: 0,Sequence-Name,Sequence-Role,Assigned-Molecule,Assigned-Molecule-Location/Type,GenBank-Accn,Relationship,RefSeq-Accn,Assembly-Unit,Sequence-Length,UCSC-style-name
0,1,assembled-molecule,1,Chromosome,CM000663.2,=,NC_000001.11,Primary Assembly,248956422,chr1
1,2,assembled-molecule,2,Chromosome,CM000664.2,=,NC_000002.12,Primary Assembly,242193529,chr2
2,3,assembled-molecule,3,Chromosome,CM000665.2,=,NC_000003.12,Primary Assembly,198295559,chr3
3,4,assembled-molecule,4,Chromosome,CM000666.2,=,NC_000004.12,Primary Assembly,190214555,chr4
4,5,assembled-molecule,5,Chromosome,CM000667.2,=,NC_000005.10,Primary Assembly,181538259,chr5


In [9]:
# Headerless, tab-separated SNP table; column names are supplied here.
hg38_snps_file = pd.read_csv(
    f'{project_dir}/baca_cwas/snps/hg38_snps.bed',
    sep='\t',
    names=['chr', 'start', 'end', 'snp_id', 'loci'],
)
hg38_snps_file.head()

Unnamed: 0,chr,start,end,snp_id,loci
0,chr1,99529849,99529849,rs10493913,chr1:99543994-99544394
1,chr1,99526072,99526072,rs11166261,chr1:99543994-99544394
2,chr1,99531609,99531609,rs11166265,chr1:99543994-99544394
3,chr1,99543051,99543051,rs11166268,chr1:99543994-99544394
4,chr1,99559217,99559217,rs11166272,chr1:99543994-99544394


In [10]:
# read in the loci: first column of a headerless, tab-separated file
cwas_loci = (
    pd.read_csv('/lus/grand/projects/TFXcan/imlab/users/temi/projects/TFXcan/experiments/compare_predictors/metadata/cwas_intervals.txt', sep='\t', header=None)
    .iloc[:, 0]
    .tolist()
)
cwas_loci[0:5]

['chr1_99543994_99544394',
 'chr1_9943142_9943692',
 'chr1_99598344_99598844',
 'chr1_99615044_99615644',
 'chr1_99766294_99766944']

In [11]:
# Sample IDs come from the first column of a headerless file; only the first 50 are kept.
individuals = (
    pd.read_csv('/lus/grand/projects/TFXcan/imlab/users/temi/projects/TFXcan/experiments/compare_predictors/metadata/1000_genome_individuals.txt', sep='\t', header=None)
    .iloc[:, 0]
    .tolist()
)[0:50]
individuals

['HG00096',
 'HG00097',
 'HG00099',
 'HG00100',
 'HG00101',
 'HG00102',
 'HG00103',
 'HG00104',
 'HG00105',
 'HG00106',
 'HG00107',
 'HG00108',
 'HG00109',
 'HG00110',
 'HG00111',
 'HG00112',
 'HG00113',
 'HG00114',
 'HG00115',
 'HG00116',
 'HG00117',
 'HG00118',
 'HG00119',
 'HG00120',
 'HG00121',
 'HG00122',
 'HG00123',
 'HG00125',
 'HG00126',
 'HG00127',
 'HG00128',
 'HG00129',
 'HG00130',
 'HG00131',
 'HG00132',
 'HG00133',
 'HG00134',
 'HG00135',
 'HG00136',
 'HG00137',
 'HG00138',
 'HG00139',
 'HG00140',
 'HG00141',
 'HG00142',
 'HG00143',
 'HG00145',
 'HG00146',
 'HG00148',
 'HG00149']

In [12]:
def find_variants_in_vcf_file(cyvcf2_object, samples, queries, offset=200000):

    """
    Given a cyvcf2 object, a list of samples and a list of regions, extract the alleles carried by each sample at every variant in (and around) each region.
    Parameters:
        cyvcf2_object: A cyvcf2 object
            must be callable with a `chrom:start-end` region string and expose `samples`
        samples: list
            a list of samples: [a, b, c]
            At least one of these samples should be in the vcf file.
            Samples that are not in the vcf file are reported and skipped.
        queries: list
            regions written as `chrom:start-end`
        offset: int
            number of bases added on each side of a region before querying
    Returns: dict
        keyed by the original query string. Each value is a DataFrame with columns
        `locus` (`chrom:position`), `ref`, `alt` and one column per sample found in
        the vcf file holding the two allele bases joined together, e.g. `AG`.
        Regions without any variant are reported and left out of the dictionary.
        If none of the requested samples is in the vcf file the value is None.
    """

    # Genotype arrays follow the VCF's own sample order, which need not match the
    # order of `samples` (and shifts when a requested sample is absent), so look
    # up each sample's column instead of using its position in `samples`.
    vcf_samples = list(cyvcf2_object.samples)
    sample_columns = {}
    for sample in samples:
        if sample in vcf_samples:
            sample_columns[sample] = vcf_samples.index(sample)
        else:
            print(f'[WARNING] {sample} is not in the VCF file.')

    variants_dictionary = {}

    for l in queries:
        locus_split = re.split(':|-', l)
        qchr = locus_split[0]
        # region coordinates cannot go below 1, so clamp the padded start
        qstart = max(int(locus_split[1]) - offset, 1)
        qend = int(locus_split[2]) + offset
        query = f'{qchr}:{qstart}-{qend}'

        # Scan the region once and keep what is needed from every variant,
        # instead of re-querying the file for each field and each sample.
        records = [
            (variant.POS, variant.REF, variant.ALT, list(variant.gt_bases))
            for variant in cyvcf2_object(query)
        ]

        if not records:
            print(f'{query} does not exist in vcf file')
            continue

        if not sample_columns:
            # nothing to tabulate for this locus; downstream cells skip None entries
            variants_dictionary[l] = None
            continue

        table = {
            'locus': [f'{qchr}:{pos}' for pos, _, _, _ in records],
            'ref': [ref for _, ref, _, _ in records],
            'alt': [alt for _, _, alt, _ in records],
        }
        for sample, column in sample_columns.items():
            # gt_bases look like `A|G` (phased) or `A/G` (unphased); drop the separator
            table[sample] = [
                ''.join(re.split(r'[|/]', str(bases[column])))
                for _, _, _, bases in records
            ]

        variants_dictionary[l] = pd.DataFrame(table)

    return(variants_dictionary)


def create_locus_info(chr_, loci_list):
    """
    Pair the VCF file of one chromosome with region strings for a set of loci.
    Parameters:
        chr_: str
            chromosome label substituted into the VCF file name, e.g. `chr1`
        loci_list: list
            loci written as `chrom_start_end`
    Returns: dict
        vcf_file: the VCF path for `chr_`
        queries: the loci rewritten as `chrom:start-end`
    """

    vcf_template = '/lus/grand/projects/TFXcan/imlab/data/GEUVADIS/vcf_snps_only/ALL.{}.shapeit2_integrated_SNPs_v2a_27022019.GRCh38.phased.vcf.gz'

    def as_region(locus):
        # only the first three underscore-separated fields are used
        fields = locus.split('_')
        return f'{fields[0]}:{fields[1]}-{fields[2]}'

    regions = [as_region(locus) for locus in loci_list]

    return {'vcf_file': vcf_template.replace('{}', chr_), 'queries': regions}



def alleles_to_dosages(aa):
    """
    Convert one row of allele strings into alternate-allele dosages.
    Parameters:
        aa: pandas.Series (or list/tuple)
            a row laid out as [locus, ref, alt, genotype_1, genotype_2, ...] where
            `alt` is an iterable of alternate alleles and every genotype is the two
            allele bases joined together, e.g. `GT`
    Returns: pandas.Series
        one value per genotype: 0 for ref/ref, 1 for ref/alt in either order and 2
        for two copies of the same alternate allele. Genotypes matching none of
        these come out as NaN.
    """

    # Read the row by position. Plain integer keys on a label-indexed Series
    # (aa[1], aa[2]) are treated as labels by recent pandas versions, so go
    # through .iloc whenever a Series is passed in.
    if isinstance(aa, pd.Series):
        ref = aa.iloc[1]
        alt = aa.iloc[2]
        bb = pd.Series(aa.iloc[3:])
    else:
        ref = aa[1]
        alt = aa[2]
        bb = pd.Series(aa[3:])

    update_dict = {f'{ref}{a}':1 for a in alt}
    update_dict.update({f'{a}{ref}':1 for a in alt})
    update_dict.update({f'{a*2}':2 for a in alt})
    update_dict.update({f'{ref*2}':0})
    bb = bb.map(update_dict)
    return(bb)


def create_prediction_matrix(chr_, chr_info, individuals):
    """
    Open the VCF file of one chromosome and extract the alleles of `individuals` for every query region.
    Parameters:
        chr_: str
            the chromosome label (not used by the function itself)
        chr_info: dict
            must have `vcf_file` (path to the VCF) and `queries` (list of `chrom:start-end` regions)
        individuals: list
            sample IDs to extract
    Returns: dict
        whatever `find_variants_in_vcf_file` returns for these queries
    Raises:
        FileNotFoundError (a subclass of Exception) if the VCF file does not exist
    """
    if not os.path.isfile(chr_info['vcf_file']):
        raise FileNotFoundError(f"ERROR - {chr_info['vcf_file']} does not exist.")

    # read the vcf file + find variants
    chr_vcf_cy = cyvcf2.cyvcf2.VCF(chr_info['vcf_file'], samples=individuals)
    try:
        locus_genotypes = find_variants_in_vcf_file(chr_vcf_cy, individuals, chr_info['queries'])
    finally:
        # release the file handle even if extraction fails or is interrupted
        chr_vcf_cy.close()

    return(locus_genotypes)


def collect_locus_tfpred_score(locus, tfpred_dt = None):
    """
    Pull the TFPred scores of a single locus as a two-column table.
    Parameters:
        locus: str
            the locus as `chrom:start-end` (or already as `chrom_start_end`)
        tfpred_dt: pandas.DataFrame, optional
            score table with a `locus` column and one column per individual.
            Defaults to the notebook-level `tfpred_scored_dt` as it is at call time.
    Returns: pandas.DataFrame
        column `index` holds the individual IDs and a column named after the locus
        holds the scores; there is no score column if the locus is not in the table.
    """
    # Resolve the default when called, not when defined, so that a rebuilt
    # `tfpred_scored_dt` is picked up without re-running this cell.
    if tfpred_dt is None:
        tfpred_dt = tfpred_scored_dt
    nlocus = re.sub(':|-', '_', locus)
    return(tfpred_dt.loc[tfpred_dt['locus'] == nlocus].set_index('locus').T.reset_index())


def create_per_locus_training_matrix(dosages_dt, locus):
    """
    Join the TFPred scores of a locus with its dosage table.
    Parameters:
        dosages_dt: pandas.DataFrame
            dosage table with an `index` column to join on
        locus: str
            the locus, passed on to `collect_locus_tfpred_score`
    Returns: pandas.DataFrame
        inner join of the two tables on their `index` columns
    """
    scores_for_locus = collect_locus_tfpred_score(locus=locus)
    return pd.merge(scores_for_locus, dosages_dt, how='inner', left_on='index', right_on='index')

#### step 1 - group the loci you are interested in by the chromosomes

In [13]:
# Bucket the first 11 CWAS loci by chromosome; chromosomes without loci are left out.
chromosomes = [f"chr{i}" for i in range(1, 23)] + ['chrX', 'chrY']

available_loci_by_chr = {}
for chromosome in chromosomes:
    # the trailing underscore stops `chr1` from also matching `chr10`, `chr11`, ...
    chr_list_of_regions = list(filter(lambda r: r.startswith(f"{chromosome}_"), cwas_loci[0:11]))
    if len(chr_list_of_regions) > 0:
        available_loci_by_chr[chromosome] = chr_list_of_regions
available_loci_by_chr

{'chr1': ['chr1_99543994_99544394',
  'chr1_9943142_9943692',
  'chr1_99598344_99598844',
  'chr1_99615044_99615644',
  'chr1_99766294_99766944',
  'chr1_99806744_99807394',
  'chr1_99836194_99836794',
  'chr1_100037994_100038294',
  'chr1_100132694_100133394',
  'chr1_100202594_100203294',
  'chr1_100239894_100240544']}

#### step 2 - collect locus information (vcf file and location) by chromosome

In [14]:
# For every chromosome, record its VCF path and the region strings of its loci.
ltfpred_obj = {
    chrom: create_locus_info(chrom, chrom_loci)
    for chrom, chrom_loci in available_loci_by_chr.items()
}
ltfpred_obj

{'chr1': {'vcf_file': '/lus/grand/projects/TFXcan/imlab/data/GEUVADIS/vcf_snps_only/ALL.chr1.shapeit2_integrated_SNPs_v2a_27022019.GRCh38.phased.vcf.gz',
  'queries': ['chr1:99543994-99544394',
   'chr1:9943142-9943692',
   'chr1:99598344-99598844',
   'chr1:99615044-99615644',
   'chr1:99766294-99766944',
   'chr1:99806744-99807394',
   'chr1:99836194-99836794',
   'chr1:100037994-100038294',
   'chr1:100132694-100133394',
   'chr1:100202594-100203294',
   'chr1:100239894-100240544']}}

#### step 3 - get prediction matrices for each locus by chromosome

In [15]:
# Extract the per-locus allele tables, one chromosome at a time.
alleles_matrices = {
    chrom: create_prediction_matrix(chr_=chrom, chr_info = chrom_info, individuals=individuals)
    for chrom, chrom_info in ltfpred_obj.items()
}
alleles_matrices

KeyboardInterrupt: 

#### save this information

In [236]:
# Output folder for everything written below; created on first use.
save_dir = '/lus/grand/projects/TFXcan/imlab/users/temi/projects/TFXcan/experiments/compare_predictors/prediction_matrices'
os.makedirs(save_dir, exist_ok = True)

In [238]:
# Write one tab-separated allele table per locus, grouped into per-chromosome folders.
for chrK, chrV in alleles_matrices.items():
    chrF = os.path.join(save_dir, chrK)
    os.makedirs(chrF, exist_ok = True)
    for k, v in chrV.items():
        # loci whose table could not be built are stored as None; nothing to write
        if v is None:
            continue
        v.to_csv(os.path.join(chrF, f'{k}_1KG_alleles.csv'), sep = '\t', index=False)

#### step 4 - convert the allele matrices to dosages

In [247]:
# Inspect the allele tables collected for each chromosome and locus.
alleles_matrices.items()

dict_items([('chr1', {'chr1:99543994-99544394':                locus ref  alt HG00096 HG00097 HG00099
0      chr1:99344038   A  [C]      AA      AA      AA
1      chr1:99344040   A  [G]      AA      AA      AA
2      chr1:99344044   G  [T]      GG      GG      GG
3      chr1:99344089   C  [T]      CC      CC      CC
4      chr1:99344150   G  [A]      GG      GG      GG
...              ...  ..  ...     ...     ...     ...
10667  chr1:99744164   T  [C]      TT      TT      TT
10668  chr1:99744201   G  [T]      GG      GG      GG
10669  chr1:99744213   G  [A]      GA      GA      GG
10670  chr1:99744222   G  [A]      GG      GG      GG
10671  chr1:99744303   A  [T]      AA      AA      AA

[10672 rows x 6 columns], 'chr1:9943142-9943692':                locus ref  alt HG00096 HG00097 HG00099
0       chr1:9743145   C  [A]      CC      CC      CC
1       chr1:9743176   T  [C]      TT      TT      TT
2       chr1:9743219   G  [A]      GG      GG      GG
3       chr1:9743330   C  [T]      CC

In [251]:
# Turn every allele table into a dosage table: variants become columns and
# individuals become rows. Loci without an allele table (None) are skipped.
dosages_matrices = {}
for chrK, chrV in alleles_matrices.items():
    chr_dosages = {}
    for k, v in chrV.items():
        if v is None:
            continue
        per_variant = v.apply(alleles_to_dosages, axis=1)
        chr_dosages[k] = per_variant.set_index(v['locus']).T.reset_index()
    dosages_matrices[chrK] = chr_dosages

dosages_matrices

{'chr1': {'chr1:99543994-99544394': locus    index  chr1:99344038  chr1:99344040  chr1:99344044  chr1:99344089   
  0      HG00096              0              0              0              0  \
  1      HG00097              0              0              0              0   
  2      HG00099              0              0              0              0   
  
  locus  chr1:99344150  chr1:99344155  chr1:99344169  chr1:99344215   
  0                  0              0              0              0  \
  1                  0              0              0              0   
  2                  0              0              0              0   
  
  locus  chr1:99344228  ...  chr1:99744013  chr1:99744059  chr1:99744064   
  0                  0  ...              0              1              0  \
  1                  0  ...              0              1              0   
  2                  0  ...              0              0              0   
  
  locus  chr1:99744075  chr1:99744153  chr1:99744

In [252]:
# Write one tab-separated dosage table per locus into its chromosome's folder.
for chrK, chrV in dosages_matrices.items():
    chrF = os.path.join(save_dir, chrK)
    os.makedirs(chrF, exist_ok = True)
    for k, v in chrV.items():
        dosages_path = os.path.join(chrF, f'{k}_1KG_dosages.csv')
        v.to_csv(dosages_path, sep = '\t', index=False)

#### step 5 - merge the dosages with the TFPred scores and save the prediction matrices

In [275]:
# Join each locus's dosage table with its TFPred scores.
prediction_matrices = {}
for chrK, chrV in dosages_matrices.items():
    chr_matrices = {}
    for k, v in chrV.items():
        if v is not None:
            chr_matrices[k] = create_per_locus_training_matrix(v, k)
    prediction_matrices[chrK] = chr_matrices


# Write one tab-separated prediction matrix per locus into its chromosome's folder.
for chrK, chrV in prediction_matrices.items():
    chrF = os.path.join(save_dir, chrK)
    os.makedirs(chrF, exist_ok = True)
    for k, v in chrV.items():
        matrix_path = os.path.join(chrF, f'{k}_1KG_prediction_matrix.csv')
        v.to_csv(matrix_path, sep = '\t', index=False)

In [278]:
# save the list of loci to be trained

training_loci = []
for chrK, chrV in prediction_matrices.items():
    training_loci.extend(list(chrV.keys()))

# Write directly into `save_dir` (the parent of the per-chromosome folders) rather
# than going through `chrF`, a loop variable left over from an earlier cell.
pd.DataFrame(training_loci).to_csv(os.path.join(save_dir, 'training_locus_1KG.csv'), sep = '\t', index=False, header=False)

### done

In [None]:
# Load the four TFPred score tables (file suffixes .01 to .04), keyed by suffix number.
tfpred_dir = '/lus/grand/projects/TFXcan/imlab/users/temi/projects/TFXcan/experiments/compare_predictors/tfpred_scores'
tfpred_scores = {
    i: pd.read_csv(f'{tfpred_dir}/aggByCollect_AR_Prostate_1KG.linear.0{i}.txt')
    for i in range(1, 5)
}

In [None]:
# I only want loci that are present in all of them.
# Flattening every table's loci into one set would give the union (loci present in
# *any* table); intersecting the per-table sets keeps only the shared ones.
common_loci = list(set.intersection(*(set(a['locus']) for a in tfpred_scores.values())))
common_loci[1:5]

['chr6_88796831_88797481',
 'chr15_72173709_72174459',
 'chr12_125981554_125982254',
 'chr7_134703748_134704298']

In [None]:
# Restrict every score table to the rows whose locus is in `common_loci`.
tfpred_scores = {
    table_id: table[table['locus'].isin(common_loci)]
    for table_id, table in tfpred_scores.items()
}

In [None]:
from functools import reduce
# Join the score tables on `locus`, folding left to right, then preview the result.
tfpred_scored_dt = reduce(
    lambda left, right: left.merge(right, on = 'locus'),
    tfpred_scores.values(),
)
(tfpred_scored_dt.shape, tfpred_scored_dt.iloc[0:5, 0:5])

((18654, 2549),
                        locus   HG00096   HG00101   HG00105   HG00109
 0  chr10_102769243_102770143  0.795932  0.798003  0.780736  0.794168
 1    chr12_22624866_22625316  0.838293  0.844498  0.852188  0.843000
 2     chr9_91422768_91423068  0.626702  0.616125  0.613668  0.617772
 3     chr5_89473083_89473583  1.014096  1.011706  1.008893  1.005967
 4   chr2_225022133_225022683  0.754932  0.759293  0.773871  0.757823)

In [153]:
# Check whether this particular locus is in the merged score table.
"chr7_75841602_75842601" in tfpred_scored_dt['locus'].to_list()

False

In [146]:
# List every locus in the merged score table.
tfpred_scored_dt['locus'].to_list()

['chr10_102769243_102770143',
 'chr12_22624866_22625316',
 'chr9_91422768_91423068',
 'chr5_89473083_89473583',
 'chr2_225022133_225022683',
 'chr14_32307944_32308644',
 'chr19_20588994_20589694',
 'chr5_168076445_168076895',
 'chr5_90893383_90893983',
 'chr12_13135516_13136466',
 'chr19_37957110_37957660',
 'chr2_180934723_180935473',
 'chr10_3538408_3538808',
 'chr14_55317782_55318282',
 'chr9_24142202_24142752',
 'chr5_137403911_137404461',
 'chr22_31870964_31871614',
 'chr20_49832513_49833113',
 'chr4_87521598_87522398',
 'chr18_59075768_59076318',
 'chr7_96899038_96899588',
 'chr3_74408949_74409499',
 'chr15_60439751_60440401',
 'chr6_66189907_66190307',
 'chr21_17372031_17372681',
 'chr11_43489700_43490400',
 'chr5_92946143_92946693',
 'chr13_95146546_95146996',
 'chr17_51608189_51608989',
 'chr16_58756846_58757496',
 'chr1_31258303_31259203',
 'chr7_107169555_107170105',
 'chr3_136232258_136232858',
 'chr8_51898890_51899340',
 'chr2_49306611_49307661',
 'chr2_216620927_216621977

In [25]:
#genotypes.apply(alleles_to_dosages, axis=1).T.set_axis(genotypes['locus'], axis=1)

In [131]:
# Confirm the pattern swaps both ':' and '-' for '_' in a region string.
re.sub(':|-', '_', 'chr12:114793197-23')

'chr12_114793197_23'

In [17]:
# Pick the row of a single variant to try the dosage conversion on.
# NOTE(review): `genotypes` is not defined in any cell shown in this notebook;
# presumably it is left over from an earlier kernel session — confirm before re-running.
aa = genotypes.loc[genotypes['locus'] == 'chr12:114793197']
aa

Unnamed: 0,locus,ref,alt,HG00097,HG00099,HG00100,HG00101,HG00102,HG00103,HG00104,...,HG00107,HG00108,HG00109,HG00110,HG00111,HG00112,HG00113,HG00114,HG00115,HG00116
0,chr12:114793197,G,[T],GG,GG,GG,GG,GG,GG,GG,...,GG,GG,GG,GG,GG,GG,GG,GG,GG,GG


In [18]:
# Hand-built version of the allele -> dosage mapping for the single row selected above.
ref = aa.iloc[0,1]
alt = aa.iloc[0,2]
bb = aa.iloc[0,3:]

# ref/alt in either order -> 1, two copies of an alt -> 2, two copies of ref -> 0
update_dict = {f'{ref}{a}':1 for a in alt}
update_dict.update({f'{a}{ref}':1 for a in alt})
update_dict.update({f'{a*2}':2 for a in alt})
update_dict.update({f'{ref*2}':0})
# NOTE: the mapped Series is not stored, so `bb` shown below still holds the allele strings.
bb.map(update_dict)
update_dict, bb

({'GT': 1, 'TG': 1, 'TT': 2, 'GG': 0},
 HG00097    GG
 HG00099    GG
 HG00100    GG
 HG00101    GG
 HG00102    GG
 HG00103    GG
 HG00104    GG
 HG00105    GG
 HG00106    GG
 HG00107    GG
 HG00108    GG
 HG00109    GG
 HG00110    GG
 HG00111    GG
 HG00112    GG
 HG00113    GG
 HG00114    GG
 HG00115    GG
 HG00116    GG
 Name: 0, dtype: object)

In [9]:
locus = 'NC_000019.10:44954743-44955443' # NC_000019 looks like the chr19 accession, not chr8 — confirm against ncbi_map

In [10]:
# Build (but do not run) the shell command that would print the VCF records in `locus`.
f'conda activate compbio-tools\nbcftools view -H -r {locus} {hg38_snps_vcf}'

'conda activate compbio-tools\nbcftools view -H -r NC_000019.10:44954743-44955443 /lus/grand/projects/TFXcan/imlab/data/variants_data/hg38_snps/GCF_000001405.40.gz'

In [None]:
# `subprocess.run` needs the command as its first argument; pass the same command
# string that the previous cell displays.
# NOTE(review): `conda activate` may not be available in the non-interactive shell
# that `shell=True` starts — confirm, or make sure bcftools is already on PATH.
subprocess.run(f'conda activate compbio-tools\nbcftools view -H -r {locus} {hg38_snps_vcf}', shell=True, capture_output=True)

In [None]:
# Open the reference SNP VCF with cyvcf2.
vcf_chr = cyvcf2.cyvcf2.VCF(hg38_snps_vcf)

In [3]:
# Read back one saved prediction matrix (tab-separated) and preview its top-left corner.
pmatrix = pd.read_table(f'/lus/grand/projects/TFXcan/imlab/users/temi/projects/TFXcan/experiments/compare_predictors/prediction_matrices/chr8/chr8:84690965-84691515_1KG_prediction_matrix.csv')

pmatrix.iloc[0:5, 0:5]
        

Unnamed: 0,index,chr8_84690965_84691515,chr8:84490982,chr8:84490995,chr8:84491030
0,HG00096,0.313102,0,2,0
1,HG00101,0.314852,0,0,0
2,HG00105,0.336786,0,1,0
3,HG00109,0.342701,0,1,0
4,HG00113,0.33796,0,1,0


In [4]:
# (rows, columns) of the prediction matrix read above
pmatrix.shape

(20, 10035)