# String Db de interacción de proteínas

  * Origen de los datos: https://string-db.org/cgi/download.pl?sessionId=emyQzkygzVar&species_text=Homo+sapiens
  * Leemos las interacciones entre proteínas de StringDb
  * Las expresamos en nombres de genes
  * Conservamos las interacciones donde al menos un gen de los dos están en GWAS Catalog

In [34]:
%run imports.py

prot_info = pd.read_table("data/9606.protein.info.v11.0.txt.gz")
prot_links = pd.read_table("data/9606.protein.links.v11.0.txt.gz", sep="\s+")
prot_links = prot_links.rename(columns={"protein1": "ensembl_id_1", "protein2": "ensembl_id_2"})

ensemble_id_to_gene_symbol = dict(zip(prot_info.protein_external_id, prot_info.preferred_name))

prot_links["protein_1"] = prot_links.ensembl_id_1.map(ensemble_id_to_gene_symbol)
prot_links["protein_2"] = prot_links.ensembl_id_2.map(ensemble_id_to_gene_symbol)

still_ensembl_ids = (
    prot_links.protein_1.str.contains("ENS") |
    prot_links.protein_2.str.contains("ENS")
)

prot_links = prot_links[~still_ensembl_ids].reset_index(drop=True)

prot_links = prot_links.drop(["ensembl_id_1", "ensembl_id_2"], axis=1)


In [43]:
gwas_cat = pd.read_table("results/gwas_cat.filtrado.tsv.gz")

genes_de_gwas = set(gwas_cat.gen_mapeado_elegido)

prot_links["protein_1_in_gwas"] = prot_links.protein_1.isin(genes_de_gwas)
prot_links["protein_2_in_gwas"] = prot_links.protein_2.isin(genes_de_gwas)

prot_links = prot_links.query("protein_1_in_gwas | protein_2_in_gwas").reset_index(drop=True)

fp = "results/prot_links.stringdb.tsv.gz"
cols_to_export = ["protein_1", "protein_2"]
prot_links[cols_to_export].to_csv(fp, sep="\t")

!zcat $fp | head -n5 | column -t

protein_1  protein_2
0          FKBP4      CLPB
1          FKBP4      CALM2
2          FKBP4      PDZD7
3          FKBP4      ERN1

gzip: stdout: Broken pipe


In [49]:
len(set(prot_links.protein_1) & set(prot_links.protein_2))
prot_links[cols_to_export]

Unnamed: 0,protein_1,protein_2
0,FKBP4,CLPB
1,FKBP4,CALM2
2,FKBP4,PDZD7
3,FKBP4,ERN1
4,FKBP4,KIAA1522
...,...,...
3643577,ZNF518A,PPARG
3643578,ZNF518A,PRIM2
3643579,ZNF518A,AGAP1
3643580,ZNF518A,VPS11
