In [3]:
import cv2, os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import scipy.stats as stats
import matplotlib.pyplot as plt

from assocplots.manhattan import *
from assocplots.qqplot import *
from tqdm import tqdm

In [25]:
#### SNP annotations ('snp_id' is exposed as an integer 'SNP' column)
snps_loc = (
    pd.read_csv('../../data/genotype_information/snps_annotations_genome-version-3-64-1.txt')
    .rename(columns={'snp_id': 'SNP'})
    .astype({'SNP': int})
)
# SNP_ID = chromosome label with underscores removed + '_' + SNP number (e.g. CHR1_1)
snps_loc['SNP_ID'] = [
    f'{chrom.replace("_", "")}_{snp}'
    for chrom, snp in zip(snps_loc['chrom'], snps_loc['SNP'])
]
snps_loc['position'] = snps_loc['position'].astype(int)

#### Gene (ORF) table
gene_loc = pd.read_csv('../../data/genotype_information/yeast_ORFs_dec2022.txt')

#### Genotype matrix
genotype = pd.read_csv('../../data/genotype_information/piQTL_genotype_matrix_dec2022.txt')
# One column is not counted as a strain (presumably the snp_id column; confirm against the file).
nb_strains = genotype.shape[1] - 1

In [3]:
# Display the genotype matrix read from piQTL_genotype_matrix_dec2022.txt.
genotype

Unnamed: 0,snp_id,6,7,14,15,17,20,22,27,29,...,1038,1040,1046,1050,1052,1065,1069,17_2,40_2,180_2
0,1,1,0,1,0,1,0,1,-1,0,...,-1,-1,-1,-1,-1,-1,0,1,1,-1
1,2,1,1,-1,-1,1,0,1,-1,-1,...,-1,0,0,-1,0,0,0,1,1,-1
2,3,0,0,-1,-1,0,-1,0,0,0,...,-1,0,0,-1,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,-1,0,0,-1,-1,0,0,0
4,5,-1,-1,-1,-1,-1,1,-1,-1,-1,...,1,0,-1,0,0,1,-1,-1,1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12049,12050,1,1,1,-1,-1,1,-1,1,-1,...,-1,-1,-1,-1,1,1,0,-1,1,-1
12050,12051,0,0,0,0,0,0,0,0,0,...,-1,-1,-1,-1,1,-1,1,0,0,-1
12051,12052,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12052,12053,1,0,1,0,0,-1,-1,0,-1,...,0,0,0,0,0,1,0,0,0,0


In [3]:
# Display the ORF table read from yeast_ORFs_dec2022.txt.
gene_loc

Unnamed: 0,locus_id,name,sgd_id,chrom,start,end
0,YAL069W,YAL069W,SGDID:S000002143,CHR_1,335,649
1,YAL068W-A,YAL068W-A,SGDID:S000028594,CHR_1,538,792
2,YAL068C,PAU8,SGDID:S000002142,CHR_1,1807,2169
3,YAL067W-A,YAL067W-A,SGDID:S000028593,CHR_1,2480,2707
4,YAL067C,SEO1,SGDID:S000000062,CHR_1,7235,9016
...,...,...,...,...,...,...
6602,Q0182,Q0182,SGDID:S000007280,CHR_1000,65770,66174
6603,Q0250,COX2,SGDID:S000007281,CHR_1000,73758,74513
6604,Q0255,Q0255,SGDID:S000007282,CHR_1000,74495,75622
6605,Q0275,COX3,SGDID:S000007283,CHR_1000,79213,80022


In [23]:
def get_chr(x, gene_loc):
    """Return the chromosome label of gene ``x`` with the 'CHR_' prefix removed.

    Parameters
    ----------
    x : str
        Gene name, compared for equality against ``gene_loc['name']``.
    gene_loc : pandas.DataFrame
        Table with at least the columns 'name' and 'chrom'.

    Returns
    -------
    str or float
        The 'chrom' value of the first matching row without its 'CHR_' prefix
        (e.g. 'CHR_13' -> '13'), or ``np.nan`` when no row matches or the
        matching 'chrom' value is not a string (e.g. missing).

    Raises
    ------
    KeyError
        If ``gene_loc`` lacks the 'name' or 'chrom' column. The previous bare
        ``except`` hid this (and every other error) behind a NaN result.
    """
    matches = gene_loc.loc[gene_loc['name'] == x, 'chrom']
    if matches.empty:
        # Unknown gene name: keep the historical "NaN means not found" contract.
        return np.nan

    chrom = matches.iloc[0]
    if not isinstance(chrom, str):
        # Missing/NaN chromosome entry cannot be stripped of its prefix.
        return np.nan
    return chrom.replace('CHR_', '')

In [26]:
PPI_list = pd.read_csv('../../data/pipeline/PPI_reference_barcodes.csv')

# 'PPI' is split on '_': the first piece is stored as the bait, the second as the prey.
PPI_list['PPI_bait'] = [ppi.split('_')[0] for ppi in PPI_list['PPI']]
PPI_list['PPI_prey'] = [ppi.split('_')[1] for ppi in PPI_list['PPI']]

# Chromosome of each partner, looked up by gene name in gene_loc via get_chr.
PPI_list['CHR_bait'] = PPI_list['PPI_bait'].apply(lambda gene: get_chr(gene, gene_loc))
PPI_list['CHR_prey'] = PPI_list['PPI_prey'].apply(lambda gene: get_chr(gene, gene_loc))

In [6]:
# Display the PPI reference table with the derived bait/prey gene and chromosome columns.
PPI_list

Unnamed: 0,well_position,ppi,N-index,S-index,N_barcode,S_barcode,PPI,PPI_bait,PPI_prey,CHR_bait,CHR_prey
0,A1,GTR1:SLM4,N701,S501,TCGCCTTA,TAGATCGC,GTR1_SLM4,GTR1,SLM4,13,2
1,A2,SNA4:GTR2,N702,S501,CTAGTACG,TAGATCGC,SNA4_GTR2,SNA4,GTR2,4,7
2,A3,ERG11:HYR1,N703,S501,TTCTGCCT,TAGATCGC,ERG11_HYR1,ERG11,HYR1,8,9
3,A4,DHH1:LSM3,N704,S501,GCTCAGGA,TAGATCGC,DHH1_LSM3,DHH1,LSM3,4,12
4,A5,MID2:PIN2,N705,S501,AGGAGTCC,TAGATCGC,MID2_PIN2,MID2,PIN2,12,15
...,...,...,...,...,...,...,...,...,...,...,...
57,H4,MID2:MAM33,N704,S508,GCTCAGGA,CTAAGCCT,MID2_MAM33,MID2,MAM33,12,9
58,H5,MID2:WSC2,N705,S508,AGGAGTCC,CTAAGCCT,MID2_WSC2,MID2,WSC2,12,14
59,H7,HNM1:TPO2,N707,S508,GTAGAGAG,CTAAGCCT,HNM1_TPO2,HNM1,TPO2,7,7
60,H8,ALO1:ADE17,N708,S508,CCTCTCTG,CTAAGCCT,ALO1_ADE17,ALO1,ADE17,13,13


In [7]:
# Display the SNP annotation table, including the derived SNP_ID column.
snps_loc

Unnamed: 0,SNP,chrom,position,REF,ALT,locus_id,name,sgd_id,description,snps_class_up,genome_annotations,snps_class_down,SNP_ID
0,1,CHR_1,1025,C,T,X1L_NFR/NDR,,,,Promoter,Promoter,Promoter,CHR1_1
1,2,CHR_1,1035,A,T,X1L_NFR/NDR,,,,Promoter,Promoter,Promoter,CHR1_2
2,3,CHR_1,1074,C,A,YAL069W,YAL069W,SGDID:S000002143,"""Dubious open reading frame; unlikely to encod...",Close to 3'-UTR,Dubious ORF,Close to 3'-UTR,CHR1_3
3,4,CHR_1,1470,A,G,YAL069W,YAL069W,SGDID:S000002143,"""Dubious open reading frame; unlikely to encod...",Close to 3'-UTR,Dubious ORF,Close to 3'-UTR,CHR1_4
4,5,CHR_1,1475,AT,A,YAL069W,YAL069W,SGDID:S000002143,"""Dubious open reading frame; unlikely to encod...",Close to 3'-UTR,Dubious ORF,Close to 3'-UTR,CHR1_5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12049,12050,CHR_MT,74432,TG,T,Q0250,COX2,SGDID:S000007281,"""Subunit II of cytochrome c oxidase (Complex I...",Close to 3'-UTR,Verified ORF,Close to 3'-UTR,CHRMT_12050
12050,12051,CHR_MT,74433,G,GGACTA,Q0250,COX2,SGDID:S000007281,"""Subunit II of cytochrome c oxidase (Complex I...",Close to 3'-UTR,Verified ORF,Close to 3'-UTR,CHRMT_12051
12051,12052,CHR_MT,77708,T,"A,G",Q:77506-78088,,,between YNCQ0023W and YNCQ0024C,Intergenic region,Intergenic region,Intergenic region,CHRMT_12052
12052,12053,CHR_MT,84245,G,"GCC,GTC",Q:82601-85034,,,between ORI5 and YNCQ0026W,Intergenic region,Intergenic region,Intergenic region,CHRMT_12053


In [14]:
# Annotate every rMVP result table with the SNP-level annotations held in snps_loc
# and write the result, under the same file name, to the "_annotated" folder.
input_path = '../../results/04a_rMVP/rMVP_outputs/piQTL_dec_2022/'
output_path = '../../results/04a_rMVP/rMVP_outputs/piQTL_dec_2022_annotated/'

# DataFrame.to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_path, exist_ok=True)

# Annotation columns appended to each result table (joined on 'SNP').
annotation_columns = ['SNP', 'locus_id', 'name', 'sgd_id', 'snps_class_up',
                      'snps_class_down', 'description', 'genome_annotations']

# Only *.csv entries are processed (the copy step downstream also uses *.csv), which
# skips stray entries such as checkpoint folders; sorted for a reproducible order.
tables = sorted(entry for entry in os.listdir(input_path) if entry.endswith('.csv'))

for table in tqdm(tables):
    qtl_res = pd.read_csv(os.path.join(input_path, table))
    # Default (inner) merge: result rows whose SNP is absent from snps_loc are dropped.
    qtl_res = qtl_res.merge(snps_loc[annotation_columns], on='SNP')
    qtl_res.to_csv(os.path.join(output_path, table), index=False)

100%|██████████| 620/620 [03:47<00:00,  2.72it/s]


In [None]:
# Copy the annotated rMVP tables (*.csv) into data/QTL.
! cp ../../results/04a_rMVP/rMVP_outputs/piQTL_dec_2022_annotated/*.csv /home/savvy/PROJECTS/PHD/piQTL/data/QTL

In [1]:
# Copy the same annotated rMVP tables (*.csv) into data/QTL_v2.
! cp ../../results/04a_rMVP/rMVP_outputs/piQTL_dec_2022_annotated/*.csv /home/savvy/PROJECTS/PHD/piQTL/data/QTL_v2

In [None]:
# Load the annotated rMVP tables from data/QTL and rename their columns to
# CHR / BP / P / EFFECTSIZE / GENE. Nothing is written back here: the to_csv
# calls below are commented out.
folder_path = '/home/savvy/PROJECTS/PHD/piQTL/data/QTL'
for PPI in tqdm(PPI_list['PPI']):
    # Chromosomes of the two partners do not depend on the drug, so look them up once per PPI.
    ppi_row = PPI_list[PPI_list['PPI'] == PPI]
    CHR_bait = ppi_row['CHR_bait'].values[0]
    CHR_prey = ppi_row['CHR_prey'].values[0]

    for DRUG in ['noDrug', '5.FC', 'Fluconazole', 'Metformin', 'Trifluoperazine']:
        # 'MT' values are replaced by the number 17 wherever they occur in the table.
        MTX = pd.read_csv(os.path.join(folder_path, f'{PPI}_MTX_{DRUG}_avg_logratio_Fitness_minus_ref.csv')).replace('MT', 17)
        # Fixed: this used to rename `noMTX`, which is undefined on the first iteration
        # and holds the previous iteration's noMTX table afterwards.
        MTX = MTX.rename(columns={'Chr':'CHR', 'Pos':'BP', f'{PPI}_MTX_{DRUG}_avg_logratio_Fitness_minus_ref.GLM':'P', 'Effect':'EFFECTSIZE', 'name':'GENE'})
        # MTX.to_csv(os.path.join(folder_path,f'{PPI}_MTX_{DRUG}_avg_logratio_Fitness_minus_ref.csv'), index=False)

        noMTX = pd.read_csv(os.path.join(folder_path, f'{PPI}_noMTX_{DRUG}_avg_logratio_Fitness_minus_ref.csv')).replace('MT', 17)
        noMTX = noMTX.rename(columns={'Chr':'CHR', 'Pos':'BP', f'{PPI}_noMTX_{DRUG}_avg_logratio_Fitness_minus_ref.GLM':'P', 'Effect':'EFFECTSIZE', 'name':'GENE'})
        # noMTX.to_csv(os.path.join(folder_path,f'{PPI}_noMTX_{DRUG}_avg_logratio_Fitness_minus_ref.csv'), index=False)

        

In [4]:
# Tab-separated eQTL p-value results; filtered per PPI / drug / condition further down.
eQTL_res = pd.read_csv(
    '../../results/04b_eQTL_matrix/piQTL_dec2022_pval_results.txt',
    sep='\t',
)

In [27]:
# For every PPI x drug x condition (MTX / noMTX), merge the matching eQTL rows into
# the annotated rMVP table on 'SNP', rename columns to CHR / BP / P / EFFECTSIZE / GENE
# and overwrite the table in data/QTL_v2.
# NOTE: the files are read and rewritten in place, so running this cell a second time
# merges into already-merged tables.
folder_path = '/home/savvy/PROJECTS/PHD/piQTL/data/QTL_v2'
for PPI in tqdm(PPI_list['PPI']):
    # Chromosomes of the two partners do not depend on the drug, so look them up once per PPI.
    ppi_row = PPI_list[PPI_list['PPI'] == PPI]
    CHR_bait = ppi_row['CHR_bait'].values[0]
    CHR_prey = ppi_row['CHR_prey'].values[0]

    # Scan the full eQTL table once per PPI; the drug/condition filters then run on this subset.
    # NOTE(review): str.contains treats its pattern as a regex and matches substrings, so
    # '5.FC' matches any character in place of '.', and a PPI name that is a substring of
    # another would match both -- confirm against the 'gene' naming scheme.
    eQTL_PPI_RES = eQTL_res[eQTL_res['gene'].str.contains(PPI)]

    for DRUG in ['noDrug', '5.FC', 'Fluconazole', 'Metformin', 'Trifluoperazine']:
        eQTL_DRUG_RES = eQTL_PPI_RES[eQTL_PPI_RES['gene'].str.contains(DRUG)]

        # 'MT' values are replaced by the number 17 wherever they occur in the table.
        MTX = pd.read_csv(os.path.join(folder_path, f'{PPI}_MTX_{DRUG}_avg_logratio_Fitness_minus_ref.csv')).replace('MT', 17)
        eQTL_MTX_RES = eQTL_DRUG_RES[eQTL_DRUG_RES['gene'].str.contains('_MTX')]
        MTX = eQTL_MTX_RES.merge(MTX, on='SNP')
        MTX = MTX.rename(columns={'Chr':'CHR', 'Pos':'BP', 'p-value':'P', 'beta':'EFFECTSIZE', 'name':'GENE'})
        MTX.to_csv(os.path.join(folder_path, f'{PPI}_MTX_{DRUG}_avg_logratio_Fitness_minus_ref.csv'), index=False)

        noMTX = pd.read_csv(os.path.join(folder_path, f'{PPI}_noMTX_{DRUG}_avg_logratio_Fitness_minus_ref.csv')).replace('MT', 17)
        eQTL_noMTX_RES = eQTL_DRUG_RES[eQTL_DRUG_RES['gene'].str.contains('_noMTX')]
        noMTX = eQTL_noMTX_RES.merge(noMTX, on='SNP')
        # Fixed: this used to rename `MTX`, which threw away the merged noMTX table and
        # wrote the MTX results into the noMTX file.
        noMTX = noMTX.rename(columns={'Chr':'CHR', 'Pos':'BP', 'p-value':'P', 'beta':'EFFECTSIZE', 'name':'GENE'})
        noMTX.to_csv(os.path.join(folder_path, f'{PPI}_noMTX_{DRUG}_avg_logratio_Fitness_minus_ref.csv'), index=False)


100%|██████████| 62/62 [2:39:26<00:00, 154.30s/it]  
