In [None]:
import pandas as pd
import bio_networks as bionets
import obtain_edges_scores as scores

### Selection of genetic variants

Source: DisGeNET

Diseases:
* C0524851 Neurodegenerative Disorder
* C0262424 CNS degeneration
* C0270715 Degenerative disease of the central nervous system
* C0002395 Alzheimer’s Disease
* C0338451 Frontotemporal dementia
* C0030567 Parkinson’s Disease
* C0393570 Corticobasal degeneration
* C0011269 Vascular dementia
* C0282513 Primary progressive aphasia
* C0752347 Lewy body dementia

In [None]:
# Variant-disease associations table (tab-separated; read_table defaults to sep='\t').
vdas = pd.read_table('data/VDAs/VDAs.tsv')

In [None]:
# Distinct variant identifiers over the whole VDA table.
vars_vdas = set(vdas['Variant'].values.tolist())

# Each distinct 'Gene' cell is taken as a string. Cells whose string form is
# 'nan' are skipped; the remaining cells are split on ';' so that multi-gene
# annotations contribute one entry per gene symbol.
genes_set = set(vdas['Gene'].values.tolist())
genes_vdas = {
    symbol
    for cell in map(str, genes_set)
    if cell != 'nan'
    for symbol in cell.split(';')
}

n_vars = len(vars_vdas)
n_genes = len(genes_vdas)

print('Number variants:', n_vars)
print('Number genes:', n_genes)

In [None]:
# Consequence labels that this analysis keeps in the "exonic" subset.
exonic_vars = [
    'downstream gene variant',
    'frameshift variant',
    'missense variant',
    'splice acceptor variant',
    'splice donor variant',
    'splice region variant',
    'start lost',
    'stop gained',
    'stop lost',
    'synonymous variant',
]

# Rows of the VDA table whose 'Consequence' is one of the labels above.
is_exonic = vdas['Consequence'].isin(exonic_vars)
vdas_exonic = vdas[is_exonic]

vars_vdas_exonic = set(vdas_exonic['Variant'].values.tolist())

# Same flattening as for the full table: skip cells whose string form is 'nan'
# and split the remaining cells on ';' into individual gene symbols.
genes_set_exonic = set(vdas_exonic['Gene'].values.tolist())
genes_vdas_exonic = {
    symbol
    for cell in map(str, genes_set_exonic)
    if cell != 'nan'
    for symbol in cell.split(';')
}

n_vars_exonic = len(vars_vdas_exonic)
n_genes_exonic = len(genes_vdas_exonic)

print('Number variants:', n_vars_exonic)
print('Number genes:', n_genes_exonic)

In [None]:
# Write one variant identifier per line, for the full and the exonic selections.
#
# NOTE(review): `v` (the suffix used in the output file names) is not assigned
# in any cell visible in this notebook; it has to exist in the kernel before
# this cell runs. Confirm where it is meant to be defined.
#
# The identifiers live in sets, whose iteration order for strings can change
# between interpreter runs. Sorting makes the written files reproducible; the
# set of lines written is unchanged.
with open(f'data/associated_variants_{v}.txt', 'w') as output:
    output.writelines(f"{variant}\n" for variant in sorted(map(str, vars_vdas)))

with open(f'data/associated_variants_exonic_{v}.txt', 'w') as output:
    output.writelines(f"{variant}\n" for variant in sorted(map(str, vars_vdas_exonic)))

### Obtain variants in ADNI cohort

```bash extract_variants.sh```

In [None]:
# Load the per-chromosome genotype tables and stack them into one frame
# indexed by variant ID.
#
# NOTE(review): `v` (part of the input directory name) is not assigned in any
# cell visible in this notebook; confirm where it is meant to be defined.

# Column names are stored one per line in columns.txt and upper-cased here.
# The context manager closes the file even if reading fails.
with open(f'data/selected_variants_{v}/columns.txt', 'r') as col_file:
    col_names = [col.upper() for col in col_file.read().split('\n')]

# Files chr1.tsv .. chr23.tsv are space-separated and have no header row.
dfs_chr = []
for i in range(1, 24):
    tmp_chr = pd.read_csv(f'data/selected_variants_{v}/chr{i}.tsv', sep=' ', names=col_names, index_col='ID')
    tmp_chr = tmp_chr.drop(columns=['CHROM', 'POS', 'NONE'])
    dfs_chr.append(tmp_chr)

genotypes_data = pd.concat(dfs_chr)
print(genotypes_data.shape[0])

# Make the index unique: the first occurrence of an ID keeps its name and the
# k-th repeat gets the suffix '_k'. A fixed '_1' suffix would leave IDs that
# occur three or more times still duplicated, which breaks index-aligned
# operations on this frame later on.
occurrences = {}
unique_ids = []
for name in genotypes_data.index:
    seen = occurrences.get(name, 0)
    unique_ids.append(name if seen == 0 else f'{name}_{seen}')
    occurrences[name] = seen + 1
genotypes_data.index = unique_ids
print(genotypes_data.shape[0])

In [None]:
# Per-variant annotation (such as the gene), keeping the first VDA row for
# each variant identifier.
genes_info = vdas.set_index('Variant')
genes_info = genes_info[~genes_info.index.duplicated(keep='first')]

# Inner join on the index: keep only variants present in both the genotype
# table and the annotation, and attach their 'Gene' value.
genotypes_genes = pd.concat([genotypes_data, genes_info['Gene']], axis=1, join='inner')
# ';'-separated gene lists become one row per (variant, gene) pair.
genotypes_genes = genotypes_genes.assign(Gene=genotypes_genes['Gene'].str.split(r';')).explode('Gene')

# Missing annotations stay NaN after explode(); leave them out of the gene
# set, as the gene count on the full VDA table does by skipping 'nan'.
genes_found = set(genotypes_genes['Gene'].dropna().values.tolist())

# explode() repeats a variant once per annotated gene, so the row count
# overstates the number of variants. Count distinct variant IDs instead.
n_vars_found = genotypes_genes.index.nunique()
n_genes_found = len(genes_found)

print('Number variants:', n_vars_found)
print('Number genes:', n_genes_found)

In [None]:
# Per-variant annotation restricted to the exonic subset, keeping the first
# VDA row for each variant identifier.
genes_info_exonic = vdas_exonic.set_index('Variant')
genes_info_exonic = genes_info_exonic[~genes_info_exonic.index.duplicated(keep='first')]

# Inner join on the index: keep only variants present in both the genotype
# table and the exonic annotation, and attach their 'Gene' value.
genotypes_genes_exonic = pd.concat([genotypes_data, genes_info_exonic['Gene']], axis=1, join='inner')
# ';'-separated gene lists become one row per (variant, gene) pair.
genotypes_genes_exonic = genotypes_genes_exonic.assign(Gene=genotypes_genes_exonic['Gene'].str.split(r';')).explode('Gene')

# Missing annotations stay NaN after explode(); leave them out so the set
# handed to the PPI step contains gene symbols only, as the gene count on
# the VDA tables does by skipping 'nan'.
genes_found_exonic = set(genotypes_genes_exonic['Gene'].dropna().values.tolist())

# explode() repeats a variant once per annotated gene, so the row count
# overstates the number of variants. Count distinct variant IDs instead.
n_vars_found_exonic = genotypes_genes_exonic.index.nunique()
n_genes_found_exonic = len(genes_found_exonic)

print('Number variants:', n_vars_found_exonic)
print('Number genes:', n_genes_found_exonic)

### Iteration with the PPI network

In [None]:
print('Exonic variants')
# Two networks for the exonic gene set: get_snap is called first with False,
# then with True, as its second argument.
# NOTE(review): judging by the variable name, True presumably restricts the
# network to its biggest connected component. Confirm in bio_networks.
ppi_found_exonic, ppi_bcc_found_exonic = [
    bionets.get_snap(genes_found_exonic, flag) for flag in (False, True)
]

### Create PPI scores dataset

In [None]:
# NOTE(review): the network comes from the exonic gene set, while the genotype
# table used below is the one built from all matched variants. Confirm that
# this pairing is intended.
ppi_net = ppi_found_exonic

# Rename the gene column to 'SYMBOL' and recode the genotype strings as
# integers: './.' and '0/0' become 0, genotypes with exactly one non-zero
# allele become 1, and genotypes with two non-zero alleles become 2.
genotypes_genes = (
    genotypes_genes
    .rename(columns={'Gene': 'SYMBOL'})
    .replace({'./.': 0, '0/0': 0})
    .replace({'0/1': 1, '1/0': 1, '1/1': 2, '0/2': 1, '2/0': 1, '1/2': 2, '2/1': 2, '2/2': 2})
)
genotypes_genes.to_csv('results/processed_variants_ADNI_WGS.csv')

# Score the network edges from the recoded genotypes and persist the result.
edges_scores_wgs = scores.obtain_edges_scores(ppi_net, genotypes_genes)
edges_scores_wgs.to_csv('results/edges_scores.csv')
print('Edges:', edges_scores_wgs.shape[0])