In [3]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

import sys
import os

sys.path.append(os.path.relpath("../../huygens"))
sys.path.append(os.path.relpath("../../galileo"))

import galileo as gal
import huygens as huy

# Load annotations

In [9]:
# RNA-seq derived tables from the processed CCLE directory; going by the file
# names these are gene-level TPM, transcript-level TPM and exon-usage ratios.
ccle_genex = pd.read_hdf(
    "../../data/processed/ccle/CCLE_RNAseq_rsem_genes_tpm_20180929.hdf",
    key="ccle_genex",
)
ccle_transcripts = pd.read_hdf(
    "../../data/processed/ccle/CCLE_RNAseq_rsem_transcripts_tpm_20180929.hdf",
    key="ccle_transcripts",
)
exonusage = pd.read_hdf(
    "../../data/processed/ccle/CCLE_RNAseq_ExonUsageRatio_20180929.hdf",
    key="exonusage",
)

# Protein-level tables (presumably mass-spec proteomics and RPPA -- confirm
# against the preprocessing step that wrote these files).
ms_prot = pd.read_hdf(
    "../../data/processed/ccle/ms_prot.h5",
    key="ms_prot",
)
rppa = pd.read_hdf(
    "../../data/processed/ccle/CCLE_RPPA_20181003.hdf",
    key="rppa",
)

In [98]:
# MSI calls from the "MSI calls" sheet of the supplementary workbook, keyed by
# DepMap ID.
msi = pd.read_excel(
    "../data/external/ccle2/41586_2019_1186_MOESM10_ESM.xlsx",
    sheet_name="MSI calls",
).set_index("depMapID")

# Recode the two textual call columns as booleans; "undetermined" CCLE calls
# become missing values. Labels not listed here are left as they are.
msi["GDSC_msi"] = msi["GDSC.msi.call"].replace(
    {"MSI-H": True, "MSS/MSI-L": False}
)
msi["CCLE_msi"] = msi["CCLE.MSI.call"].replace(
    {"inferred-MSI": True, "inferred-MSS": False, "undetermined": np.nan}
)

# Discard lines for which neither source provides a call.
msi = msi.dropna(subset=["GDSC_msi", "CCLE_msi"], how="all")

# Combined status: element-wise OR of the two recoded calls.
msi["MSI"] = msi["GDSC_msi"] | msi["CCLE_msi"]

# CCLE name -> DepMap ID lookup, plus one manually added entry.
name_map = {
    ccle_name: depmap_id
    for ccle_name, depmap_id in zip(msi["CCLE_ID"], msi.index)
}
name_map["HS294T_SKIN"] = "ACH-000014"

# Compute differences

In [None]:
# Compare every protein / exon feature between MSI and non-MSI lines using
# galileo's mat_mwus_naive (presumably a per-column Mann-Whitney U test --
# confirm against the galileo source), then store each result as a
# tab-separated table so later cells can reload it without recomputing.
msi_prot_diffs = gal.mat_mwus_naive(
    ms_prot,
    msi["MSI"],
    pbar=True,
)
msi_prot_diffs.to_csv(
    "../data/intermediate/msi_prot_diffs.txt",
    sep="\t",
)

# The exon comparison additionally requests effect="mean".
msi_exon_diffs = gal.mat_mwus_naive(
    exonusage,
    msi["MSI"],
    pbar=True,
    effect="mean",
)
msi_exon_diffs.to_csv(
    "../data/intermediate/msi_exon_diffs.txt",
    sep="\t",
)

# Mutations

In [4]:
# Reload the cached MSI-vs-non-MSI comparison tables from disk.
# Both files are written by DataFrame.to_csv with the row index as the first
# column, so both must be read back with index_col=0. Without it the protein
# table gains a stray "Unnamed: 0" column and loses its feature labels as the
# index, unlike the exon table read on the next line.
msi_prot_diffs = pd.read_csv(
    "../data/intermediate/msi_prot_diffs.txt",
    sep="\t",
    index_col=0,
)
msi_exon_diffs = pd.read_csv(
    "../data/intermediate/msi_exon_diffs.txt",
    sep="\t",
    index_col=0,
)

## Storage bucket info

In [15]:
# Sample manifest; only whole-genome sequencing rows are needed here.
ccle_samples = pd.read_csv("../data/raw/fullccle_samples.csv")

# One WGS row per cell line: order by (arxspan_id, version, size) and keep the
# first row for each arxspan_id, i.e. the one with the smallest version and
# then the smallest size.
# NOTE(review): confirm that the lowest version is the intended pick.
wgs_samples = (
    ccle_samples.loc[ccle_samples["datatype"] == "wgs"]
    .sort_values(by=["arxspan_id", "version", "size"])
    .drop_duplicates(subset=["arxspan_id"], keep="first")
)

## Gene intervals

In [93]:
# Keep the exons whose "qval" is at least 4.
# NOTE(review): presumably qval is on a -log10 scale -- confirm against galileo.
msi_exons = msi_exon_diffs.loc[msi_exon_diffs["qval"] >= 4].copy()

# Each exon identifier ends in "..._<chrom>_<start>_<end>_<gene>"; split every
# identifier once and pick the trailing fields by position.
id_fields = [exon_id.split("_") for exon_id in msi_exons.index]

msi_exons["gene"] = [fields[-1] for fields in id_fields]
# The first three characters of the chromosome field are dropped
# (presumably a "chr" prefix -- verify against the identifiers).
msi_exons["exon_chrom"] = [fields[-4][3:] for fields in id_fields]
msi_exons["exon_start"] = [fields[-3] for fields in id_fields]
msi_exons["exon_end"] = [fields[-2] for fields in id_fields]

# Coordinate key, built while start/end are still strings.
msi_exons["exon"] = (
    msi_exons["exon_chrom"]
    + "_"
    + msi_exons["exon_start"]
    + "_"
    + msi_exons["exon_end"]
)

# Make the coordinates numeric, then keep one row per distinct exon.
msi_exons = (
    msi_exons
    .astype({"exon_start": int, "exon_end": int})
    .drop_duplicates(subset=["exon"])
)

In [94]:
def get_exon_bounds(row, padding=250):
    """Return a copy of ``row`` with a padded interval around its exon.

    The interval runs from ``min(exon_start, exon_end) - padding`` to
    ``max(exon_start, exon_end) + padding``, so the orientation of the two
    coordinates does not matter.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide ``"exon_start"`` and ``"exon_end"`` and a ``copy()``
        method.
    padding : int, default 250
        Number of positions added on each side of the exon.

    Returns
    -------
    Same type as ``row``
        A copy of ``row`` with ``"bound_start"`` and ``"bound_end"`` added.
        The input row itself is left unmodified.
    """
    # min/max also covers exon_start == exon_end, which the previous
    # if/elif pair skipped entirely, leaving such rows without any bounds.
    lower, upper = sorted((row["exon_start"], row["exon_end"]))

    padded = row.copy()
    # Interval coordinates cannot be negative, so clamp the padded start at 0.
    padded["bound_start"] = max(lower - padding, 0)
    padded["bound_end"] = upper + padding
    return padded
    
# Add the padded bounds to every exon row (row-wise apply, default padding).
msi_exons = msi_exons.apply(get_exon_bounds, axis="columns")

In [107]:
# Write the padded exon intervals as a headerless, tab-separated three-column
# file (chromosome, start, end) to both the scripts and the raw-data folders.
exon_bed = msi_exons[["exon_chrom", "bound_start", "bound_end"]]
for bed_path in (
    "../scripts/MSI_exon_bounds.bed",
    "../data/raw/MSI_exon_bounds.bed",
):
    exon_bed.to_csv(bed_path, sep="\t", header=False, index=False)

In [89]:
# One BAM path per line, in the row order of wgs_samples.
with open("../scripts/wgs_paths.txt", "w") as paths_file:
    paths_file.writelines(
        bam_path + "\n" for bam_path in wgs_samples["internal_bam_filepath"]
    )

# Matching cell-line IDs, one per line, in the same row order.
with open("../scripts/wgs_ids.txt", "w") as ids_file:
    ids_file.writelines(
        ach_id + "\n" for ach_id in wgs_samples["arxspan_id"]
    )

In [79]:
# Shell command emitted once per WGS sample: export an access token, then have
# samtools extract the reads overlapping the MSI exon intervals into a
# per-sample BAM. The two placeholders are the source BAM path and the
# cell-line ID; both are interpolated as-is, without shell quoting.
slice_cmd = "export GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token) && samtools view -b -h -M -L ../data/raw/MSI_exon_bounds.bed {} > ../data/raw/WGS_slices/{}.bam\n"

with open("../scripts/7_fetch-msi-slices.sh", "w") as script:
    sample_pairs = zip(
        wgs_samples["internal_bam_filepath"],
        wgs_samples["arxspan_id"],
    )
    for bam_path, ach_id in sample_pairs:
        script.write(slice_cmd.format(bam_path, ach_id))