In [1]:
# Record the Python interpreter version this notebook was run with.
from platform import python_version
print(python_version())

3.10.2


In [2]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
import time
import itertools

from multiprocessing import Pool
import statsmodels.formula.api as sm
import random

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# setting path
import functions_compute_effect_sizes
import functions_loading_data

In [3]:
####### Options #############

# Sensitivity-analysis settings.
# NOTE(review): none of these four lists is referenced in the cells visible in
# this notebook -- confirm where (or whether) they are consumed.
list_ancestries = ["All"]# other values: "no_EUR", "EUR"
list_diagnosis = ["wo_ASD"] # other values: "w_ASD", "All"
list_max_age = [12000]# other values: 75*12, 70*12, 65*12, 55*12 (presumably months -- TODO confirm)
list_max_scores = [10000]# other values: 4, 6, 10, 20, 40, 80

# Other options
cpu_count=1 # Number of worker processes given to multiprocessing.Pool
score_names = ["oe_lof_upper"] # Score names passed to functions_loading_data.prepare_Megadataset_dataframes

other_covariates = ' + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10' # Extra terms appended verbatim to the regression formula (must start with ' + ')
pheno_score = 'ZScore_IQ_adj_test2_age_sex_with_online_UKBB_Final' # Name of the column containing the phenotype score (continuous variable); its values are shuffled for the permutations

####### Path of the tab-separated file containing all gene lists
genesets_file = '<Path of the file containing genes associated to biological functions>'

# Output path: the permutation results are written there as a tab-separated file
output = "<Path to store the results>"


In [6]:
# Read the tab-separated gene-set table. Column '1' holds the genes of each set
# as one ';'-separated string (column '0' is the gene-set name).
genesets = pd.read_csv(genesets_file, sep="\t")
# Number of genes per set = number of ';' separators + 1.
genesets = genesets.assign(count=genesets["1"].str.count(";") + 1)
# Keep only gene sets with at least 10 genes.
genesets = genesets.loc[genesets["count"] >= 10]
print(genesets.shape)

                   0                                                  1
32  substantia nigra  ENSG00000001084;ENSG00000001461;ENSG0000000156...
33            testis  ENSG00000000460;ENSG00000001167;ENSG0000000146...
34     thyroid gland  ENSG00000001167;ENSG00000001631;ENSG0000000201...
35   urinary bladder  ENSG00000000457;ENSG00000001084;ENSG0000000163...
36            vagina  ENSG00000000457;ENSG00000001617;ENSG0000000274...
(37, 3)


In [5]:
# Load the dataset as three objects returned by the project-local helper:
#   - individual_info: phenotypic information for each individual,
#   - cnv_indd: the CNV information for each individual,
#   - gene_score_info_overlap_split: gene scores and information for each unique CNV.
# Per the original author's note, only genes fully covered by the CNV are considered
# (this is decided inside the helper and cannot be verified from this notebook).
individual_info, cnv_indd, gene_score_info_overlap_split = functions_loading_data.prepare_Megadataset_dataframes(score_names)
# Load the list of DDD genes; its `gene_id` column is later used to build the
# 'ddd' score category that enters the model as a covariate.
DDD_genes = functions_loading_data.get_DDD_genes()

  individual_info=pd.read_csv(ind_info_file, sep= '\t')
  clean_individual_list = pd.read_csv(clean_data, sep="\t")


In [6]:
# Derive the score column names ("LOEUF_cat_<category>") from the category labels.
cat_list = ['1_gene_list', '2_gene_list', '3_gene_list', '1_outside', '2_outside', 'ddd']
list_scores = ["LOEUF_cat_" + category for category in cat_list]

In [7]:
def runModel(
        data: pd.DataFrame,
        model_name: str,
        cnv_type: str,
        pheno_score: str,
        list_scores: list,
        gene_list: list,
        other_covariates: str) -> pd.DataFrame:
    """Fit the "1_3" GLS model and return the effect of the pooled gene-list score.

    The three ``<cnv_type>_LOEUF_cat_{1,2,3}_gene_list`` columns are summed into a
    single ``<cnv_type>_LOEUF_gene_list`` column, which is regressed (together with
    the remaining non-gene-list scores and ``other_covariates``) against the
    ``IQ_shuffle`` column of ``data``.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain ``IQ_shuffle``, the ``<cnv_type>_<score>`` columns and every
        covariate named in ``other_covariates``. NOTE: it is modified in place --
        the pooled ``<cnv_type>_LOEUF_gene_list`` column is added/overwritten.
    model_name : str
        Only ``"1_3"`` is implemented.
    cnv_type : str
        Column prefix of the scores to use (the caller passes "DEL" or "DUP").
    pheno_score : str
        Unused: the response variable is hard-coded to ``IQ_shuffle``.
    list_scores : list
        Score column names without the ``cnv_type`` prefix.
    gene_list : list
        Unused; kept for interface compatibility.
    other_covariates : str
        Formula fragment appended verbatim (e.g. ``" + PC1 + PC2"``).

    Returns
    -------
    pd.DataFrame
        One row indexed by the pooled term name, with columns ``Estimate``,
        ``pvalue`` and ``TYPE``. An empty DataFrame is returned when
        ``model_name`` is not supported or when fitting the model fails.
    """
    if model_name != "1_3":
        # Previously this path failed later with a swallowed NameError on
        # `formula`; report the real cause while keeping the same return value.
        print("Error: ", "unsupported model_name {!r} (only '1_3' is implemented)".format(model_name))
        return pd.DataFrame()

    gene_list_term = '{}_LOEUF_gene_list'.format(cnv_type)
    # Scores that are not part of the gene list stay as separate covariates.
    tmp_scores = ['{}_{}'.format(cnv_type, val) for val in list_scores if not (val.endswith("gene_list"))]

    # Pool the three gene-list categories into one score (mutates `data`).
    data.eval('{0}_LOEUF_gene_list = {0}_LOEUF_cat_1_gene_list + {0}_LOEUF_cat_2_gene_list + {0}_LOEUF_cat_3_gene_list'.format(cnv_type),
        inplace=True)
    # Joining all terms at once keeps the formula valid even if tmp_scores is empty.
    formula = "IQ_shuffle ~ " + ' + '.join([gene_list_term] + tmp_scores) + other_covariates

    # Run the regression; fitting is best-effort, so errors are printed and an
    # empty result is returned instead of aborting the whole permutation run.
    try:
        reg = sm.gls(formula, data=data).fit()
    except Exception as e:
        print("Error: ", e)
        return pd.DataFrame()

    # Select the coefficient by name rather than by position so the result does
    # not depend on how the design-matrix columns happen to be ordered.
    return pd.DataFrame({'Estimate': reg.params.loc[[gene_list_term]],
                         'pvalue': reg.pvalues.loc[[gene_list_term]],
                         'TYPE': cnv_type,
                         }
                        )

In [8]:
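## Uncomment to run the model with split gene lists. Doing so redefines runModel (replacing the version above), and this variant only handles model_name "3_3", so the caller must pass "3_3" instead of "1_3".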

# def runModel(
#         data: pd.DataFrame,
#         model_name: str,
#         cnv_type: str,
#         pheno_score: str,
#         list_scores: list,
#         gene_list: list,
#         other_covariates: str) -> pd.DataFrame:
    
#     len_covariates = len(other_covariates.split(sep='+')) -1
#     if model_name == "3_3":
#         tmp_scores = ['{}_{}'.format(cnv_type, val) for val in list_scores]
#         formula = "IQ_shuffle ~ {}".format(' + '.join(tmp_scores)) + other_covariates
#         win = [1, 2, 3]
        
#     # Run the linear regression following formula and based on data. Capture error and print exception
#     try:
#         reg = sm.gls(formula, data=data).fit()
#     except Exception as e:
#         print("Error: ", e)
#         return pd.DataFrame()

#     return pd.DataFrame({'Estimate': reg.params[1:4],
#                          'pvalue': reg.pvalues[1:4],
#                          'TYPE': cnv_type,
#                          'win':win
#                          }
#                         )

In [None]:
# Build the per-individual gene table from the CNV calls, the CNV-to-gene
# annotation (restricted to CHR/START/STOP/TYPE/gene_id) and the individual info.
# The result is later merged on 'gene_id', filtered on 'TYPE' and grouped by
# 'individual', so it presumably has one row per (individual, CNV gene) -- TODO confirm.
# This cell must be run before the permutation cells that use genes_by_individual.
genes_by_individual = functions_compute_effect_sizes.aggregate_ind_cnv_genes(cnv_indd, gene_score_info_overlap_split.loc[:,['CHR', 'START', 'STOP', 'TYPE', 'gene_id']], individual_info)

In [9]:
def compute_sum_score_one_gene_list(genes_annotation, genes_by_individual, gene_set, n_permutations=1000):
    """Run the permutation analysis for one gene set.

    For the given gene set, per-individual DEL and DUP sum scores are computed,
    then the phenotype is shuffled ``n_permutations`` times and the "1_3" model
    is fitted on each shuffle for both CNV types.

    Relies on notebook-level names: ``DDD_genes``, ``list_scores``,
    ``individual_info``, ``pheno_score``, ``other_covariates``, ``runModel`` and
    the ``functions_compute_effect_sizes`` module.

    Parameters
    ----------
    genes_annotation
        Gene annotation table passed to ``compute_loeuf_cat_specific_genes``.
    genes_by_individual
        Table with at least ``individual``, ``gene_id`` and ``TYPE`` columns.
    gene_set
        Sequence whose first element is the gene-set name and whose second
        element is the ';'-separated list of gene ids (extra elements are ignored).
    n_permutations : int, default 1000
        Number of phenotype shuffles. Permutation ``i`` is seeded with ``i``, so
        the shuffles are reproducible.

    Returns
    -------
    pd.DataFrame
        Concatenated ``runModel`` outputs with two extra columns: ``permut``
        (permutation index) and ``gene_list_name``.
    """
    list_name = gene_set[0]
    gene_string = gene_set[1]
    print(list_name)
    start_time = time.time()

    genes = gene_string.split(';')
    # Kept as a DataFrame because that is what has always been handed to runModel.
    gene_list = pd.DataFrame({'gene_id': genes})
    genes_annotation_gene_sets = functions_compute_effect_sizes.compute_loeuf_cat_specific_genes(
        genes_annotation, "gene_id", genes, DDD_genes.gene_id.tolist())
    merged = pd.merge(genes_by_individual, genes_annotation_gene_sets, on='gene_id', how='left')

    # Sum every score per individual, separately for deletions and duplications,
    # and prefix the resulting columns with the CNV type.
    clean_list_scores = list_scores + ['loeuf_inv_full']
    counted_cnvs = {}
    for cnv_type in ['DEL', 'DUP']:
        merged_cnv = merged[merged.TYPE == cnv_type]
        counts = merged_cnv.groupby('individual', as_index=False)[clean_list_scores].sum()
        counts = counts.rename(columns={c: cnv_type + '_' + c for c in counts.columns if c in clean_list_scores})
        counted_cnvs[cnv_type] = counts

    joined_list = ['_'.join(combination)
                   for combination in itertools.product(['DEL', 'DUP'], clean_list_scores)]

    merged_counts = pd.merge(counted_cnvs['DEL'], counted_cnvs['DUP'], on='individual', how='outer')
    clean_FINAL_SUM_ALL = pd.merge(individual_info, merged_counts, on='individual', how='left')

    # Individuals without any CNV of a given type get a score of 0.
    clean_FINAL_SUM_ALL[joined_list] = clean_FINAL_SUM_ALL[joined_list].fillna(0.0)
    # Drop the excluded cohorts; .copy() makes the filtered frame independent so
    # the column assignments below never write into a view of the merge result.
    clean_FINAL_SUM_ALL = clean_FINAL_SUM_ALL[~clean_FINAL_SUM_ALL.Cohort.isin(["SSC", "SPARK", "MSSNG"])].copy()

    print("-- %s seconds --" % (time.time() - start_time), flush=True)

    # The phenotype values never change between permutations: extract them once.
    pheno_values = clean_FINAL_SUM_ALL[pheno_score].tolist()

    # Collect the per-fit summaries and concatenate once at the end instead of
    # growing a DataFrame inside the loop (which is quadratic).
    summaries = []
    for i in range(n_permutations):
        clean_FINAL_SUM_ALL['IQ_shuffle'] = random.Random(i).sample(pheno_values, len(pheno_values))
        for cnv_type in ["DEL", "DUP"]:
            tmp_summary = runModel(clean_FINAL_SUM_ALL, "1_3", cnv_type, pheno_score, list_scores, gene_list, other_covariates)
            tmp_summary['permut'] = i
            summaries.append(tmp_summary)

    model_summary = pd.concat(summaries) if summaries else pd.DataFrame()
    model_summary["gene_list_name"] = list_name
    print("-- %s seconds --" % (time.time() - start_time), flush=True)
    return (model_summary)

In [11]:
from functools import partial

# compute_sum_score_one_gene_list takes (genes_annotation, genes_by_individual,
# gene_set). Both shared inputs must be bound here so that pool.map only has to
# supply the gene set; binding only the first one makes every call fail with a
# missing-argument TypeError. genes_by_individual must have been computed in the
# aggregation cell before running this one.
partial_func = partial(compute_sum_score_one_gene_list, gene_score_info_overlap_split, genes_by_individual)

# The context manager releases the worker processes even if a task raises.
with Pool(cpu_count) as pool:
    start_all = time.time()
    # Each row of genesets is passed as a list: [name, ';'-joined genes, count].
    data_outputs = pool.map(partial_func, genesets.values.tolist())
print("-- %s seconds all--" % (time.time() - start_all))

# One result frame per gene set; stack them and write a tab-separated file.
df_reconstructed = pd.concat(data_outputs)
df_reconstructed.to_csv(output, sep="\t", index=False)

GO:0006122
-- 3.6715049743652344 seconds --
-- 11.17102599143982 seconds --


Unnamed: 0,Estimate,pvalue,TYPE,permut,gene_list_name
DEL_LOEUF_gene_list,-0.01212,0.911819,DEL,0,GO:0006122
DUP_LOEUF_gene_list,0.136868,0.265844,DUP,0,GO:0006122
DEL_LOEUF_gene_list,-0.091134,0.435502,DEL,1,GO:0006122
DUP_LOEUF_gene_list,-0.095426,0.433631,DUP,1,GO:0006122
DEL_LOEUF_gene_list,0.084872,0.459519,DEL,2,GO:0006122
DUP_LOEUF_gene_list,0.241634,0.052679,DUP,2,GO:0006122
DEL_LOEUF_gene_list,-0.056968,0.604548,DEL,3,GO:0006122
DUP_LOEUF_gene_list,-0.021281,0.858044,DUP,3,GO:0006122
DEL_LOEUF_gene_list,-0.004443,0.96893,DEL,4,GO:0006122
DUP_LOEUF_gene_list,0.098544,0.409214,DUP,4,GO:0006122
