Acá generamos el dataframe de base en el que se basará la red.

Descarga de los datos: https://www.ebi.ac.uk/gwas/docs/file-downloads, "All associations v1.0.2 - with added ontology".

Me quedo sólo con algunas columnas del dataset de asociaciones, les pongo nombres amigables y
les asigno una categoría fenotípica a cada carácter (éstas vienen de un segundo archivo).

Escribo el resultado a un nuevo dataset para basar todo el trabajo en ese.

In [64]:
%run imports.py

## Carga y limpieza del dataset

### Asociaciones variante-fenotipo

In [65]:
path = f"{data_dir}/gwas_catalog_v1.0.2-associations_e98_r2020-05-03.tsv.gz"
df_assoc_full = pd.read_table(path, low_memory=False)
print("Columnas disponibles: ", " | ".join(df_assoc_full.columns))

columnas = {
    "DISEASE/TRAIT": "fenotipo",
    "CHR_ID": "crom",
    "CHR_POS": "pos",
    "REPORTED GENE(S)": "genes_reportados",
    "MAPPED_GENE": "genes_mapeados",
    "STRONGEST SNP-RISK ALLELE": "alelo_riesgo",
    "CONTEXT": "contexto",
    "INTERGENIC": "es_intergenico",
    "RISK ALLELE FREQUENCY": "frec_alelo_riesgo",
    "P-VALUE": "p_value",
    "OR or BETA": "OR_or_beta",
    "95% CI (TEXT)": "conf_int_95"
}
df_assoc = df_assoc_full[columnas].rename(columns=columnas)
df_assoc

Columnas disponibles:  DATE ADDED TO CATALOG | PUBMEDID | FIRST AUTHOR | DATE | JOURNAL | LINK | STUDY | DISEASE/TRAIT | INITIAL SAMPLE SIZE | REPLICATION SAMPLE SIZE | REGION | CHR_ID | CHR_POS | REPORTED GENE(S) | MAPPED_GENE | UPSTREAM_GENE_ID | DOWNSTREAM_GENE_ID | SNP_GENE_IDS | UPSTREAM_GENE_DISTANCE | DOWNSTREAM_GENE_DISTANCE | STRONGEST SNP-RISK ALLELE | SNPS | MERGED | SNP_ID_CURRENT | CONTEXT | INTERGENIC | RISK ALLELE FREQUENCY | P-VALUE | PVALUE_MLOG | P-VALUE (TEXT) | OR or BETA | 95% CI (TEXT) | PLATFORM [SNPS PASSING QC] | CNV | MAPPED_TRAIT | MAPPED_TRAIT_URI | STUDY ACCESSION | GENOTYPING TECHNOLOGY


Unnamed: 0,fenotipo,crom,pos,genes_reportados,genes_mapeados,alelo_riesgo,contexto,es_intergenico,frec_alelo_riesgo,p_value,OR_or_beta,conf_int_95
0,Crohn's disease,10,6060049,IL2RA,IL2RA,rs12722489-C,intron_variant,0.0,0.852,3E-9,1.11,[1.05-1.16]
1,Crohn's disease,16,28479196,"IL27, SH2B1, EIF3C, LAT, CD19","CLN3, AC138894.1",rs151181-G,intron_variant,0.0,0.386,2E-11,1.07,[1.03-1.12]
2,Crohn's disease,1,67240275,IL23R,IL23R,rs11209026-G,missense_variant,0.0,0.932,1E-64,2.66,[2.36-3.00]
3,Crohn's disease,2,102437989,"IL12RL2, IL18R1, IL1RL1, IL18RAP",IL18RAP,rs2058660-G,intron_variant,0.0,0.231,2E-12,1.19,[1.14-1.26]
4,Crohn's disease,5,159360377,IL12B,AC008691.1,rs6556412-A,intron_variant,0.0,0.332,5E-14,1.18,[1.13-1.24]
...,...,...,...,...,...,...,...,...,...,...,...,...
185719,Lung adenocarcinoma,6,31652743,"APOM, BAT3","APOM, APOM, APOM, APOM, APOM, APOM, APOM",rs3117582-C,intron_variant,0.0,0.10,5E-12,1.22,[1.15-1.29]
185720,Lung adenocarcinoma,15,78601997,"CHRNA5, CHRNA3",CHRNA3,rs1051730-T,synonymous_variant,0.0,0.35,2E-51,1.31,[1.27-1.36]
185721,Lung adenocarcinoma,5,1342599,CLPTM1L,CLPTM1L,rs31489-C,intron_variant,0.0,0.59,2E-10,1.12,[1.09-1.16]
185722,Lung adenocarcinoma,6,28808340,TRNAA-UGC,NOP56P1 - AL662890.1,rs4324798-A,intergenic_variant,1.0,0.09,2E-8,1.16,[1.09-1.24]


In [66]:
from more_itertools import unique_everseen

def eliminar_items_repetidos(items_str, sep):
    if pd.isna(items_str):
        return
    
    items_unicos = unique_everseen(items_str.split(sep))
    return " | ".join(item.strip() for item in items_unicos)

# Hay valores como "APOM, APOM, APOM" en los genes, también "GEN1; GEN1". Limpiamos
# y unificamos el separador como " | ":

for sep in [",", ";"]:
    df_assoc["genes_mapeados"] = df_assoc.genes_mapeados.apply(eliminar_items_repetidos, sep=sep)
    df_assoc["genes_reportados"] = df_assoc.genes_reportados.apply(eliminar_items_repetidos, sep=sep)

In [67]:
d = {"foo": 2, "bar": 4, "baz": 1}
# max(d, key=itemgetter(1))
pd.Series(d).idxmax()

'bar'

In [68]:
from more_itertools import flatten
from collections import Counter

genes_mapeados_por_alelo = df_assoc.genes_mapeados.dropna().apply(lambda genes_str: genes_str.split(" | "))
genes_mapeados_frecuencia = pd.Series(flatten(genes_mapeados_por_alelo)).value_counts()
genes_ordenados_por_frecuencia = genes_mapeados_frecuencia.index

genes_mapeados_frecuencia["SARM1"]

1314

In [69]:
def elegir_un_gen(genes_str, sep):
    """Quita genes que se ven poco interesantes por razones variadas y de los restantes
       elige el más frecuente del dataset. Si hay empates elige el primero."""
    if pd.isna(genes_str):
        return
   
    genes = genes_str.split(sep)
    genes_interesantes = [
        gen for gen in genes if
        "." not in gen and # probablemente ids no interesantes
        " - " not in gen and # alelo intergénico reportado entre GEN1 - GEN2
        "-" not in gen and # genes antisense tipo FOOBAR-AS1
        "orf" not in gen # ORFs sin función identificada
    ]
    
    if not genes_interesantes:
        return
    
    frecuencias_por_gen = pd.Series({gen: genes_mapeados_frecuencia[gen]
                                     for gen in genes_interesantes}, dtype=int)
    return frecuencias_por_gen.idxmax()
    

df_assoc["gen_mapeado_elegido"] = df_assoc.genes_mapeados.progress_apply(elegir_un_gen, sep=" | ")

100%|██████████| 185724/185724 [01:03<00:00, 2912.22it/s]


In [71]:
df_assoc

Unnamed: 0,fenotipo,crom,pos,genes_reportados,genes_mapeados,alelo_riesgo,contexto,es_intergenico,frec_alelo_riesgo,p_value,OR_or_beta,conf_int_95,gen_mapeado_elegido
0,Crohn's disease,10,6060049,IL2RA,IL2RA,rs12722489-C,intron_variant,0.0,0.852,3E-9,1.11,[1.05-1.16],IL2RA
1,Crohn's disease,16,28479196,IL27 | SH2B1 | EIF3C | LAT | CD19,CLN3 | AC138894.1,rs151181-G,intron_variant,0.0,0.386,2E-11,1.07,[1.03-1.12],CLN3
2,Crohn's disease,1,67240275,IL23R,IL23R,rs11209026-G,missense_variant,0.0,0.932,1E-64,2.66,[2.36-3.00],IL23R
3,Crohn's disease,2,102437989,IL12RL2 | IL18R1 | IL1RL1 | IL18RAP,IL18RAP,rs2058660-G,intron_variant,0.0,0.231,2E-12,1.19,[1.14-1.26],IL18RAP
4,Crohn's disease,5,159360377,IL12B,AC008691.1,rs6556412-A,intron_variant,0.0,0.332,5E-14,1.18,[1.13-1.24],
...,...,...,...,...,...,...,...,...,...,...,...,...,...
185719,Lung adenocarcinoma,6,31652743,APOM | BAT3,APOM | APOM,rs3117582-C,intron_variant,0.0,0.10,5E-12,1.22,[1.15-1.29],APOM
185720,Lung adenocarcinoma,15,78601997,CHRNA5 | CHRNA3,CHRNA3,rs1051730-T,synonymous_variant,0.0,0.35,2E-51,1.31,[1.27-1.36],CHRNA3
185721,Lung adenocarcinoma,5,1342599,CLPTM1L,CLPTM1L,rs31489-C,intron_variant,0.0,0.59,2E-10,1.12,[1.09-1.16],CLPTM1L
185722,Lung adenocarcinoma,6,28808340,TRNAA-UGC,NOP56P1 - AL662890.1,rs4324798-A,intergenic_variant,1.0,0.09,2E-8,1.16,[1.09-1.24],


### Categorías más generales de los fenotipos

In [72]:
path = f"{data_dir}/gwas_catalog_trait-mappings_r2020-05-17.tsv.gz"
trait_mappings = pd.read_table(path)
trait_mappings.head()

Unnamed: 0,Disease trait,EFO term,EFO URI,Parent term,Parent URI
0,Clubfoot,familial clubfoot with or without associated lower limb anomalies,http://www.orpha.net/ORDO/Orphanet_199315,Other disease,http://www.ebi.ac.uk/efo/EFO_0000408
1,Alcohol dependence or heroin dependence or methamphetamine dependence,methamphetamine dependence,http://www.ebi.ac.uk/efo/EFO_0004701,Neurological disorder,http://www.ebi.ac.uk/efo/EFO_0000618
2,Methamphetamine dependence,methamphetamine dependence,http://www.ebi.ac.uk/efo/EFO_0004701,Neurological disorder,http://www.ebi.ac.uk/efo/EFO_0000618
3,Hypothyroidism,hypothyroidism,http://www.ebi.ac.uk/efo/EFO_0004705,Other disease,http://www.ebi.ac.uk/efo/EFO_0000408
4,Age at menopause,age at menopause,http://www.ebi.ac.uk/efo/EFO_0004704,Other measurement,http://www.ebi.ac.uk/efo/EFO_0001444


In [73]:
fenotipo_a_categoria = dict(zip(trait_mappings["Disease trait"], trait_mappings["Parent term"]))
fenotipo_a_EFO_URI = dict(zip(trait_mappings["Disease trait"], trait_mappings["EFO URI"]))

categorias = df_assoc.fenotipo.map(fenotipo_a_categoria)

df_assoc["EFO_URI"] = df_assoc.fenotipo.map(fenotipo_a_EFO_URI)
df_assoc.insert(0, "categoria_fenotipo", categorias)

df_assoc["es_intergenico"] = df_assoc.es_intergenico.astype(bool)

df_assoc

Unnamed: 0,categoria_fenotipo,fenotipo,crom,pos,genes_reportados,genes_mapeados,alelo_riesgo,contexto,es_intergenico,frec_alelo_riesgo,p_value,OR_or_beta,conf_int_95,gen_mapeado_elegido,EFO_URI
0,Digestive system disorder,Crohn's disease,10,6060049,IL2RA,IL2RA,rs12722489-C,intron_variant,False,0.852,3E-9,1.11,[1.05-1.16],IL2RA,http://www.ebi.ac.uk/efo/EFO_0000384
1,Digestive system disorder,Crohn's disease,16,28479196,IL27 | SH2B1 | EIF3C | LAT | CD19,CLN3 | AC138894.1,rs151181-G,intron_variant,False,0.386,2E-11,1.07,[1.03-1.12],CLN3,http://www.ebi.ac.uk/efo/EFO_0000384
2,Digestive system disorder,Crohn's disease,1,67240275,IL23R,IL23R,rs11209026-G,missense_variant,False,0.932,1E-64,2.66,[2.36-3.00],IL23R,http://www.ebi.ac.uk/efo/EFO_0000384
3,Digestive system disorder,Crohn's disease,2,102437989,IL12RL2 | IL18R1 | IL1RL1 | IL18RAP,IL18RAP,rs2058660-G,intron_variant,False,0.231,2E-12,1.19,[1.14-1.26],IL18RAP,http://www.ebi.ac.uk/efo/EFO_0000384
4,Digestive system disorder,Crohn's disease,5,159360377,IL12B,AC008691.1,rs6556412-A,intron_variant,False,0.332,5E-14,1.18,[1.13-1.24],,http://www.ebi.ac.uk/efo/EFO_0000384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185719,Cancer,Lung adenocarcinoma,6,31652743,APOM | BAT3,APOM | APOM,rs3117582-C,intron_variant,False,0.10,5E-12,1.22,[1.15-1.29],APOM,http://www.ebi.ac.uk/efo/EFO_0000571
185720,Cancer,Lung adenocarcinoma,15,78601997,CHRNA5 | CHRNA3,CHRNA3,rs1051730-T,synonymous_variant,False,0.35,2E-51,1.31,[1.27-1.36],CHRNA3,http://www.ebi.ac.uk/efo/EFO_0000571
185721,Cancer,Lung adenocarcinoma,5,1342599,CLPTM1L,CLPTM1L,rs31489-C,intron_variant,False,0.59,2E-10,1.12,[1.09-1.16],CLPTM1L,http://www.ebi.ac.uk/efo/EFO_0000571
185722,Cancer,Lung adenocarcinoma,6,28808340,TRNAA-UGC,NOP56P1 - AL662890.1,rs4324798-A,intergenic_variant,True,0.09,2E-8,1.16,[1.09-1.24],,http://www.ebi.ac.uk/efo/EFO_0000571


In [74]:
path = f"{results_dir}/GwasCat_associations.r2020-05-03.parsed.csv.gz"
print(path)
df_assoc.to_csv(path, index=False)

results/GwasCat_associations.r2020-05-03.parsed.csv.gz


In [75]:
! zcat $path | head | column -s"," -t


gzip: stdout: Broken pipe
categoria_fenotipo         fenotipo         crom  pos        genes_reportados                     genes_mapeados           alelo_riesgo  contexto                   es_intergenico  frec_alelo_riesgo  p_value  OR_or_beta  conf_int_95  gen_mapeado_elegido                   EFO_URI
Digestive system disorder  Crohn's disease  10    6060049    IL2RA                                IL2RA                    rs12722489-C  intron_variant             False           0.852              3E-9     1.11        [1.05-1.16]  IL2RA                                 http://www.ebi.ac.uk/efo/EFO_0000384
Digestive system disorder  Crohn's disease  16    28479196   IL27 | SH2B1 | EIF3C | LAT | CD19    CLN3 | AC138894.1        rs151181-G    intron_variant             False           0.386              2E-11    1.07        [1.03-1.12]  CLN3                                  http://www.ebi.ac.uk/efo/EFO_0000384
Digestive system disorder  Crohn's disease  1     67240275   IL23R            