In [None]:
import numpy as np
import pandas as pd
import time
from multiprocessing import Pool
import functions_compute_effect_sizes
import functions_loading_data
import warnings
import itertools

# Silence every FutureWarning for the rest of the session so they do not flood the cell outputs.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# ---------------------------------------------------------------------------
# Options
# ---------------------------------------------------------------------------

# Sensitivity analyses: one model is fitted for every combination of the values below.
# Ancestry filter. Supported values: "All", "EUR", "no_EUR".
list_ancestries = ["All"]
# Diagnosis filter. Supported values: "wo_ASD", "w_ASD", "All".
list_diagnosis = ["wo_ASD"]
# Inclusive upper bounds on the age column. Alternative values: 75*12, 70*12, 65*12, 55*12.
list_max_age = [12000]
# Exclusive upper bounds on the summed loeuf_inv_full score. Alternative values: 4, 6, 10, 20, 40, 80.
list_max_scores = [10000]

# Other options
# Number of worker processes given to the multiprocessing pool.
cpu_count = 1
# Gene-level score column(s) to load.
score_names = ["oe_lof_upper"]

# Other covariates to include in the model (appended as-is to the model formula).
other_covariates = ' + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10'
# Name of the column containing the phenotype score (continuous variable).
pheno_score = 'ZScore_IQ_adj_test2_age_sex_with_online_UKBB_Final'

# Path of the tab-separated file containing all gene lists.
genesets_file = '<Path of the file containing genes associated to biological functions>'

# Output path
output = "<Path to store the results>"


In [None]:
# Read the tab-separated gene-set table. It has two columns: the gene-set name first,
# then (in the column labelled '1') the genes of the set separated by ";".
genesets = pd.read_csv(genesets_file, sep='\t')
# Size of each gene set = number of ";" separators + 1.
genesets['count'] = genesets['1'].str.count(';').add(1)
# Keep only the gene sets that contain at least 10 genes.
genesets = genesets.loc[genesets['count'].ge(10)]
print(genesets.shape)

In [None]:
# Load the dataset as three tables:
#   - individual_info: the phenotypic information for each individual
#   - cnv_indd: the CNV information for each individual
#   - gene_score_info_overlap_split: the gene scores and information for each unique CNV
# Only genes fully covered by the CNV are considered.
individual_info, cnv_indd, gene_score_info_overlap_split = functions_loading_data.prepare_Megadataset_dataframes(score_names)
# Load the list of DDD genes, to be used as a covariate in the model.
DDD_genes = functions_loading_data.get_DDD_genes()

In [None]:
# Load the gene annotation table and keep only the protein-coding genes that have a
# LOEUF score (non-missing oe_lof_upper).
# NOTE(review): the score column is loaded through `score_names`, but the filter below
# is hard-wired to `oe_lof_upper` - keep the two in sync if `score_names` changes.
genes_annotation = pd.read_csv(
    '<Path to the dataframe containing the whole coding genome and the scores associated>',
    sep='\t',
    usecols=['gene_id', 'gene_type'] + score_names,
).loc[lambda annot: annot['oe_lof_upper'].notna() & annot['gene_type'].eq('protein_coding')]
print(genes_annotation.shape)

In [None]:
# Score column names, one "LOEUF_cat_<category>" entry per category, in category order.
cat_list = ['1_gene_list', '2_gene_list', '3_gene_list', '1_outside', '2_outside', 'ddd']
list_scores = ["LOEUF_cat_{}".format(category) for category in cat_list]
    
# Create a dictionary, keyed by category, containing the list of unique genes carried by each individual.
# Despite its name, `list_uniques` is therefore a dictionary; it is forwarded unchanged to compute_linear_models.
list_uniques = functions_compute_effect_sizes.create_dict_uniques_genes_by_loeuf_cat(gene_score_info_overlap_split, cnv_indd)

In [None]:
def compute_sum_score_one_gene_list(genes_annotation, genes_by_individual, gene_set):
    """Fit the linear models of one gene set under every configured sensitivity filter.

    For each individual, the per-category LOEUF scores of the genes they carry are
    summed separately for deletions and duplications, merged with the phenotype
    table, and handed to ``functions_compute_effect_sizes.compute_linear_models``
    once per (ancestry, diagnosis, max age, CNV type, max score) combination.

    Parameters
    ----------
    genes_annotation : pandas.DataFrame
        Gene annotation table, forwarded to
        ``functions_compute_effect_sizes.compute_loeuf_cat_specific_genes``.
    genes_by_individual : pandas.DataFrame
        Genes carried by each individual. The columns ``gene_id``, ``TYPE``
        ("DEL" / "DUP") and ``individual`` are used here.
    gene_set : sequence
        ``gene_set[0]`` is the gene-set name and ``gene_set[1]`` the list of gene
        ids separated by ";". Any further item is ignored.

    Returns
    -------
    pandas.DataFrame
        The summaries returned by ``compute_linear_models`` stacked in loop order,
        or an empty DataFrame when the option lists yield no combination.

    Raises
    ------
    ValueError
        If ``list_ancestries`` or ``list_diagnosis`` contains an unsupported value.

    Notes
    -----
    Reads the module-level names ``list_scores``, ``individual_info``,
    ``DDD_genes``, ``list_uniques``, ``list_ancestries``, ``list_diagnosis``,
    ``list_max_age``, ``list_max_scores``, ``pheno_score`` and ``other_covariates``.
    """
    print(gene_set[0])
    start_time = time.time()

    list_name = gene_set[0]
    gene_string = gene_set[1]
    genes = gene_string.split(';')
    gene_list = pd.DataFrame({'gene_id': genes})
    genes_annotation_gene_sets = functions_compute_effect_sizes.compute_loeuf_cat_specific_genes(
        genes_annotation, "gene_id", gene_list.iloc[:, 0].tolist(), DDD_genes.gene_id.tolist())
    merged = pd.merge(genes_by_individual, genes_annotation_gene_sets, on='gene_id', how='left')

    # Sum every score column per individual, separately for deletions and duplications,
    # and prefix the resulting columns with the CNV type.
    clean_list_scores = list_scores + ['loeuf_inv_full']
    counted_cnvs = {}
    for cnv_type in ['DEL', 'DUP']:
        merged_cnv = merged[merged.TYPE == cnv_type]
        counts = merged_cnv.groupby('individual', as_index=False)[clean_list_scores].sum()
        counts = counts.rename(columns={c: cnv_type + '_' + c for c in counts.columns if c in clean_list_scores})
        counted_cnvs[cnv_type] = counts

    joined_list = ['_'.join(combination)
                   for combination in itertools.product(['DEL', 'DUP'], clean_list_scores)]

    merged_counts = pd.merge(counted_cnvs['DEL'], counted_cnvs['DUP'], on='individual', how='outer')
    clean_FINAL_SUM_ALL = pd.merge(individual_info, merged_counts, on='individual', how='left')

    # Individuals without any CNV of a given type get a score of 0 instead of NaN.
    clean_FINAL_SUM_ALL[joined_list] = clean_FINAL_SUM_ALL[joined_list].replace({np.nan: 0.0})
    print("-- %s seconds --" % (time.time() - start_time), flush=True)

    asd_cohorts = ["SSC", "SPARK", "MSSNG"]

    # Collect the per-combination summaries and concatenate them once at the end,
    # rather than re-copying a growing DataFrame at every iteration.
    summaries = []
    for ancestry in list_ancestries:
        if ancestry == "EUR":
            an_FINAL_SUM_ALL = clean_FINAL_SUM_ALL[clean_FINAL_SUM_ALL.Merge_final_ancestry == "EUR"]
        elif ancestry == "All":
            an_FINAL_SUM_ALL = clean_FINAL_SUM_ALL
        elif ancestry == "no_EUR":
            an_FINAL_SUM_ALL = clean_FINAL_SUM_ALL[clean_FINAL_SUM_ALL.Merge_final_ancestry != "EUR"]
        else:
            # Raise instead of calling exit(): an exception is propagated by Pool.map
            # to the parent process, so a bad option is reported rather than lost.
            raise ValueError(
                "Unknown ancestry option {!r}; expected 'All', 'EUR' or 'no_EUR'".format(ancestry))

        for diag in list_diagnosis:
            if diag == "w_ASD":
                diag_an_FINAL_SUM_ALL = an_FINAL_SUM_ALL[an_FINAL_SUM_ALL.Cohort.isin(asd_cohorts)]
            elif diag == "All":
                diag_an_FINAL_SUM_ALL = an_FINAL_SUM_ALL
            elif diag == "wo_ASD":
                diag_an_FINAL_SUM_ALL = an_FINAL_SUM_ALL[~an_FINAL_SUM_ALL.Cohort.isin(asd_cohorts)]
            else:
                raise ValueError(
                    "Unknown diagnosis option {!r}; expected 'All', 'w_ASD' or 'wo_ASD'".format(diag))

            for max_age in list_max_age:
                age_diag_an_FINAL_SUM_ALL = diag_an_FINAL_SUM_ALL[
                    diag_an_FINAL_SUM_ALL.Age_ZScore_IQ_adj_age_sex_PC1_10_with_online_UKBB <= max_age]
                for cnv_type in ["DEL", "DUP"]:
                    for max_s in list_max_scores:
                        # Exclude individuals whose total score for this CNV type reaches max_s.
                        score_age_diag_an_FINAL_SUM_ALL = age_diag_an_FINAL_SUM_ALL[
                            age_diag_an_FINAL_SUM_ALL["{}_loeuf_inv_full".format(cnv_type)] < max_s]
                        dict_filters = {'ancestry': ancestry, 'diag': diag, 'max_age': max_age,
                                        'max_s': max_s, 'list_name': list_name}
                        tmp_summary = functions_compute_effect_sizes.compute_linear_models(
                            score_age_diag_an_FINAL_SUM_ALL, cnv_type, pheno_score, list_scores,
                            gene_list, list_uniques, other_covariates, dict_filters)
                        summaries.append(tmp_summary)

    print("-- %s seconds --" % (time.time() - start_time), flush=True)
    # pd.concat rejects an empty list, so return an empty frame when nothing was fitted.
    if not summaries:
        return pd.DataFrame()
    return pd.concat(summaries)

In [None]:

# Build the table passed to compute_sum_score_one_gene_list as `genes_by_individual`; that function
# merges it on 'gene_id', splits it by 'TYPE' and sums the scores per 'individual'.
genes_by_individual = functions_compute_effect_sizes.aggregate_ind_cnv_genes(cnv_indd, gene_score_info_overlap_split.loc[:,['CHR', 'START', 'STOP', 'TYPE', 'gene_id']], individual_info)

In [None]:
from functools import partial

# Bind the two shared tables so that each task only needs one row of `genesets`
# (gene-set name first, ";"-separated genes second).
partial_func = partial(compute_sum_score_one_gene_list, genes_annotation, genes_by_individual)

# Run one task per gene set. The context manager tears the worker processes down
# even when a task raises, so a failed run does not leave idle workers behind.
with Pool(cpu_count) as pool:
    start_all = time.time()
    data_outputs = pool.map(partial_func, genesets.values.tolist())
print("-- %s seconds all--" % (time.time() - start_all))

# Stack the per-gene-set summaries and write them to `output` as a tab-separated file.
df_reconstructed = pd.concat(data_outputs)
df_reconstructed.to_csv(output, sep="\t", index=False)