In [43]:
import json
import pandas as pd
import sys
import subprocess
import os

In [44]:
def _has_both_assays(species_path):
    """Return True when `species_path` is a directory containing both
    an "RNA-Seq" and a "ChIP-Seq" entry."""
    # Stray files under ./data whose name contains "_" must be skipped:
    # calling os.listdir on them raises NotADirectoryError.
    if not os.path.isdir(species_path):
        return False
    return {"RNA-Seq", "ChIP-Seq"} <= set(os.listdir(species_path))


# Entries of ./data whose name contains "_" and that hold both assay folders,
# kept in the order os.listdir returns them.
dirs = [
    entry
    for entry in os.listdir("./data")
    if "_" in entry and _has_both_assays(os.path.join("./data", entry))
]

['Bos_taurus', 'Sus_scrofa', 'Equus_caballus', 'Gallus_gallus', 'Ovis_aries']

In [45]:
# RNA-Seq assay types that match_chip_rna keeps; other RNA rows are dropped.
_RNA_ASSAY_TYPES = [
    "RNA-seq of total RNA",
    "RNA-seq of coding RNA",
    "transcription profiling by high throughput sequencing",
]

# Columns of the flattened metadata used by match_chip_rna for both assays.
_KEPT_COLUMNS = ["accession", "biosampleId", "experiment.assayType", "cellType.text"]


def _load_metadata(path):
    """Read a metadata JSON file (accession -> nested fields) into a flat
    DataFrame with one row per accession and an "accession" column."""
    # Context manager so the file handle is closed deterministically.
    with open(path) as handle:
        by_accession = json.load(handle)
    records = [{"accession": acc, **fields} for acc, fields in by_accession.items()]
    return pd.json_normalize(records)


def match_chip_rna(spdir, speciesName=""):
    """Pair the ChIP-Seq experiments of one species with its RNA-Seq experiments.

    Reads ``<spdir>/ChIP-Seq/metadata.json`` and ``<spdir>/RNA-Seq/metadata.json``,
    keeps RNA-Seq rows whose assay type is in ``_RNA_ASSAY_TYPES`` and whose
    cell type also occurs among the ChIP-Seq rows, then writes three CSV files
    into the current working directory:

    * ``<speciesName>_chip_rna.csv`` - experiment counts per
      (biosampleId, cellType.text, experiment.assayType);
    * ``<speciesName>_matching_specimen.csv`` - ChIP/RNA pairs sharing a biosampleId;
    * ``<speciesName>_matching_celltype.csv`` - for ChIP experiments without a
      specimen match, the ChIP/RNA pairs sharing a cell type.

    A four-line summary is printed to stdout.

    Parameters
    ----------
    spdir : str
        Directory of one species, containing the ``ChIP-Seq`` and ``RNA-Seq`` folders.
    speciesName : str, optional
        Prefix of the output files. Defaults to the last path component of
        ``spdir`` (a trailing separator is ignored).

    Returns
    -------
    None

    Raises
    ------
    OSError
        If a metadata file cannot be opened.
    KeyError
        If the flattened metadata lacks one of ``_KEPT_COLUMNS``.
    """
    if not speciesName:
        # normpath strips a trailing separator, which would otherwise make
        # basename return "" and produce files named "_chip_rna.csv".
        speciesName = os.path.basename(os.path.normpath(spdir))

    chip_all = _load_metadata(os.path.join(spdir, "ChIP-Seq", "metadata.json"))
    rna_all = _load_metadata(os.path.join(spdir, "RNA-Seq", "metadata.json"))

    chipn = chip_all[_KEPT_COLUMNS]
    rnan = rna_all.loc[rna_all["experiment.assayType"].isin(_RNA_ASSAY_TYPES), _KEPT_COLUMNS]
    # Only RNA-Seq rows whose cell type is also assayed by ChIP-Seq are of interest.
    rnan = rnan[rnan["cellType.text"].isin(chipn["cellType.text"])]

    # Overview: number of experiments per specimen / cell type / assay.
    overview = (
        pd.concat([rnan, chipn])
        .groupby(["biosampleId", "cellType.text", "experiment.assayType"])
        .count()
        .sort_values(["cellType.text", "biosampleId"])
    )
    overview.to_csv(f"{speciesName}_chip_rna.csv")

    # ChIP-Seq and RNA-Seq performed on the very same specimen.
    same_specimen = pd.merge(
        chipn, rnan, on="biosampleId", how="inner", suffixes=("_chip", "_rna")
    ).drop_duplicates()
    same_specimen["cellType.text"] = same_specimen["cellType.text_chip"]
    same_specimen = same_specimen.drop(["cellType.text_rna", "cellType.text_chip"], axis=1)
    same_specimen.to_csv(f"{speciesName}_matching_specimen.csv", index=False)

    # Remaining ChIP-Seq experiments: fall back to RNA-Seq on the same cell type.
    without_specimen = chipn[~chipn["accession"].isin(same_specimen["accession_chip"])]
    same_celltype = pd.merge(
        without_specimen, rnan, on="cellType.text", how="inner", suffixes=("_chip", "_rna")
    ).drop_duplicates()
    same_celltype.to_csv(f"{speciesName}_matching_celltype.csv", index=False)

    with_specimen = set(same_specimen["accession_chip"])
    with_celltype = set(same_celltype["accession_chip"])
    unmatched = set(chipn["accession"]) - (with_specimen | with_celltype)

    nChip = len(chip_all)
    nRna = len(rna_all)  # counted before the assay-type / cell-type filtering
    totalChip = len(chipn)

    print(f"{nChip} ChIP-Seq and {nRna} RNA-Seq found")
    print(
        "Chipseq with RNAseq on matching specimens : "
        f"{len(with_specimen)}/{totalChip} ({len(with_specimen) / totalChip * 100} %)"
    )
    # This line used to repeat the "matching specimens" label although it
    # reports cell-type-only matches.
    print(
        "Chipseq with RNAseq on matching cell type : "
        f"{len(with_celltype)}/{totalChip} ({len(with_celltype) / totalChip * 100} %)"
    )
    print(
        "ChipSeq without matching RNA-Seq on specimen or same cell type: "
        f"{len(unmatched)}/{totalChip} ({len(unmatched) / totalChip * 100} %)"
    )

In [51]:
# Common-name alias for each species directory (Latin binomial -> short name);
# the alias is used as the prefix of the CSV files written per species.
names = {
    "Bos_taurus": "cow",
    "Equus_caballus": "horse",
    "Capra_hircus": "goat",
    "Ovis_aries": "sheep",
    "Gallus_gallus": "chicken",
    "Sus_scrofa": "pig",
}

In [50]:
if __name__ == "__main__":
    # `root` is not used by the loop below; read it defensively so that
    # running without a command-line argument no longer raises IndexError.
    root = sys.argv[1] if len(sys.argv) > 1 else None

    # Run the ChIP-Seq / RNA-Seq matching for every selected species folder.
    for d in dirs:
        print(d)
        path = f"data/{d}"
        # A species without an alias in `names` passes "", which makes
        # match_chip_rna fall back to the directory name instead of a KeyError.
        match_chip_rna(path, names.get(d, ""))

Bos_taurus
210 ChIP-Seq and 408 RNA-Seq found
Chipseq with RNAseq on matching specimens : 147/210 (70.0 %)
Chipseq with RNAseq on matching specimens : 22/210 (10.476190476190476 %)
ChipSeq without matching RNA-Seq on specimen or same cell type: 41/210 (19.523809523809526 %)
Sus_scrofa
106 ChIP-Seq and 384 RNA-Seq found
Chipseq with RNAseq on matching specimens : 78/106 (73.58490566037736 %)
Chipseq with RNAseq on matching specimens : 16/106 (15.09433962264151 %)
ChipSeq without matching RNA-Seq on specimen or same cell type: 12/106 (11.320754716981133 %)
Equus_caballus
88 ChIP-Seq and 66 RNA-Seq found
Chipseq with RNAseq on matching specimens : 82/88 (93.18181818181817 %)
Chipseq with RNAseq on matching specimens : 0/88 (0.0 %)
ChipSeq without matching RNA-Seq on specimen or same cell type: 6/88 (6.8181818181818175 %)
Gallus_gallus
25 ChIP-Seq and 340 RNA-Seq found
Chipseq with RNAseq on matching specimens : 25/25 (100.0 %)
Chipseq with RNAseq on matching specimens : 0/25 (0.0 %)
ChipS

In [54]:
"hello".startswith("h")

True