In [1]:
import numpy as np
import seaborn as sns
import pandas as pd

from collections import Counter, defaultdict

import sys
import os
import re
sys.path.append(os.path.relpath("../helper"))

import name_mappings
import helper_stats
import helper_plots

In [2]:
def format_name(x):
    """Turn a label of the form ``"NAME (ID)"`` into ``"NAME_ID"``.

    The label is cut at every occurrence of ``" ("``. The first piece is
    kept as-is, and the second piece has its final character (the closing
    parenthesis in well-formed labels) removed. Any further pieces are
    ignored.

    Raises:
        IndexError: if ``x`` does not contain ``" ("``.
    """
    pieces = x.split(" (")
    prefix, bracketed = pieces[0], pieces[1]
    # Strip the last character of the bracketed part, i.e. the trailing ")".
    return f"{prefix}_{bracketed[:-1]}"

# Avana

In [3]:
# Load the Avana gene-effect table and rewrite its column labels with
# format_name ("NAME (ID)" -> "NAME_ID"). The relabelling is done on the
# transposed frame and then transposed back, so the final orientation
# matches the CSV.
avana = (
    pd.read_csv("raw/depmap/Achilles_gene_effect_avana_19q4.csv", index_col=0)
    .T
    .rename(index=format_name)
    .T
)

# Store the relabelled table as HDF5.
avana.to_hdf(path_or_buf="processed/depmap/avana.hdf", key="avana")

# DRIVE

In [4]:
# Load the DRIVE DEMETER2 gene-effect table, rewrite its column labels with
# format_name (via a transpose round trip), and remove the row labelled
# "GISTT1_GASTROINTESTINAL_TRACT".
drive = (
    pd.read_csv("raw/depmap/demeter2-drive_v12-gene-effect.csv", index_col=0)
    .T
    .rename(index=format_name)
    .T
    .drop(index=["GISTT1_GASTROINTESTINAL_TRACT"])
)

# Translate every remaining row label through name_mappings.name_map.
# A label missing from the mapping raises (KeyError for a plain dict).
drive.index = drive.index.map(lambda row_label: name_mappings.name_map[row_label])

# Store the processed table as HDF5.
drive.to_hdf(
    path_or_buf="processed/depmap/demeter2-drive_v12-gene-effect.hdf",
    key="drive",
)

# Achilles

In [7]:
# Load the Achilles DEMETER2 gene-effect table and rewrite its column labels
# with format_name (via a transpose round trip).
achilles = (
    pd.read_csv("raw/depmap/demeter2-achilles_v13-gene-effect.csv", index_col=0)
    .T
    .rename(index=format_name)
    .T
)

# Translate every row label through name_mappings.name_map.
# A label missing from the mapping raises (KeyError for a plain dict).
achilles.index = achilles.index.map(lambda row_label: name_mappings.name_map[row_label])

# Store the processed table as HDF5.
achilles.to_hdf(
    path_or_buf="processed/depmap/demeter2-achilles_v13-gene-effect.hdf",
    key="achilles",
)

# Mutation classes

## Damaging

In [25]:
# Load the damaging-mutation call matrix and rewrite its column labels with
# format_name ("NAME (ID)" -> "NAME_ID"), using a transpose round trip.
damaging_muts = (
    pd.read_csv(
        "raw/depmap/depmap-mutation-calls_v11-damaging-mutation.csv",
        index_col=0,
    )
    .T
    .rename(index=format_name)
    .T
)

# Store the relabelled matrix as HDF5.
damaging_muts.to_hdf(
    path_or_buf="processed/depmap/depmap-mutation-calls_v11-damaging-mutation.hdf",
    key="damaging_muts",
)

## Hotspot

In [27]:
# Load the hotspot-mutation call matrix and rewrite its column labels with
# format_name ("NAME (ID)" -> "NAME_ID"), using a transpose round trip.
hs_muts = (
    pd.read_csv(
        "raw/depmap/depmap-mutation-calls_v11-hotspot-mutation.csv",
        index_col=0,
    )
    .T
    .rename(index=format_name)
    .T
)

# Store the relabelled matrix as HDF5.
hs_muts.to_hdf(
    path_or_buf="processed/depmap/depmap-mutation-calls_v11-hotspot-mutation.hdf",
    key="hs_muts",
)

## Other

In [28]:
# Load the "other" mutation call matrix and rewrite its column labels with
# format_name ("NAME (ID)" -> "NAME_ID"), using a transpose round trip.
other_muts = (
    pd.read_csv(
        "raw/depmap/depmap-mutation-calls_v11-other-mutation.csv",
        index_col=0,
    )
    .T
    .rename(index=format_name)
    .T
)

# Store the relabelled matrix as HDF5.
other_muts.to_hdf(
    path_or_buf="processed/depmap/depmap-mutation-calls_v11-other-mutation.hdf",
    key="other_muts",
)

# Drug responses

## Primary screen

In [37]:
# Column- and row-level metadata for the primary screen.
primary_col_meta = pd.read_csv(
    filepath_or_buffer="raw/depmap/primary-screen-public-tentative_v4-primary-merged-replicate-col-meta.csv",
)
primary_row_meta = pd.read_csv(
    filepath_or_buffer="raw/depmap/primary-screen-public-tentative_v4-primary-merged-row-meta.csv",
)

# Lookup from the row metadata: feature_id -> depmap_id.
# If a feature_id repeats, the last occurrence wins.
primary_name_map = {
    feature_id: depmap_id
    for feature_id, depmap_id in zip(
        primary_row_meta["feature_id"], primary_row_meta["depmap_id"]
    )
}

# Median and MAD tables, indexed by their first CSV column.
primary_median = pd.read_csv(
    filepath_or_buffer="raw/depmap/primary-screen-public-tentative_v4-primary-merged-median-lfcvc-cb.csv",
    index_col=0,
)
primary_mad = pd.read_csv(
    filepath_or_buffer="raw/depmap/primary-screen-public-tentative_v4-primary-merged-mad-lfcvc-cb.csv",
    index_col=0,
)

# Relabel the rows of both tables with the depmap_id of each feature_id.
# A row label absent from the lookup raises KeyError.
primary_median.index = primary_median.index.map(primary_name_map.__getitem__)
primary_mad.index = primary_mad.index.map(primary_name_map.__getitem__)

In [39]:
# Write the relabelled primary-screen median and MAD tables to HDF5.
primary_median.to_hdf(
    path_or_buf="processed/depmap/primary-screen-public-tentative_v4-primary-merged-median-lfcvc-cb.hdf",
    key="primary_median",
)
primary_mad.to_hdf(
    path_or_buf="processed/depmap/primary-screen-public-tentative_v4-primary-merged-mad-lfcvc-cb.hdf",
    key="primary_mad",
)


## Secondary screen

In [40]:
# Column- and row-level metadata for the secondary screen.
secondary_col_meta = pd.read_csv(
    filepath_or_buffer="raw/depmap/secondary-screen-public-tentative_v3-secondary-merged-replicate-col-meta.csv",
)
secondary_row_meta = pd.read_csv(
    filepath_or_buffer="raw/depmap/secondary-screen-public-tentative_v3-secondary-merged-row-meta.csv",
)

# Lookup from the row metadata: feature_id -> depmap_id.
# If a feature_id repeats, the last occurrence wins.
secondary_name_map = {
    feature_id: depmap_id
    for feature_id, depmap_id in zip(
        secondary_row_meta["feature_id"], secondary_row_meta["depmap_id"]
    )
}

# Median and MAD tables, indexed by their first CSV column.
secondary_median = pd.read_csv(
    filepath_or_buffer="raw/depmap/secondary-screen-public-tentative_v3-secondary-merged-median-lfcvc-cb.csv",
    index_col=0,
)
secondary_mad = pd.read_csv(
    filepath_or_buffer="raw/depmap/secondary-screen-public-tentative_v3-secondary-merged-mad-lfcvc-cb.csv",
    index_col=0,
)

# Relabel the rows of both tables with the depmap_id of each feature_id.
# A row label absent from the lookup raises KeyError.
secondary_median.index = secondary_median.index.map(secondary_name_map.__getitem__)
secondary_mad.index = secondary_mad.index.map(secondary_name_map.__getitem__)

  interactivity=interactivity, compiler=compiler, result=result)


In [41]:
# Write the relabelled secondary-screen median and MAD tables to HDF5.
secondary_median.to_hdf(
    path_or_buf="processed/depmap/secondary-screen-public-tentative_v3-secondary-merged-median-lfcvc-cb.hdf",
    key="secondary_median",
)
secondary_mad.to_hdf(
    path_or_buf="processed/depmap/secondary-screen-public-tentative_v3-secondary-merged-mad-lfcvc-cb.hdf",
    key="secondary_mad",
)


# Copy number

In [3]:
# Re-save the gene copy-number CSV as HDF5 without modifying its contents;
# the first CSV column becomes the index.
copynumber = pd.read_csv(
    filepath_or_buffer="raw/depmap/CCLE_gene_cn_19q4_public.csv",
    index_col=0,
)
copynumber.to_hdf(
    path_or_buf="processed/depmap/CCLE_gene_cn_19q4_public.hdf",
    key="copynumber",
)

# MSI

In [8]:
# Load the MSI annotations, using the second CSV column as the index.
is_msi = pd.read_csv("raw/depmap/CCLE_MSI.csv", index_col=1)

# Keep only rows whose CCLE call is one of the two inferred classes.
# The explicit .copy() makes the filtered frame an independent object, so
# adding the "MSI" column below is not a chained assignment on a slice of
# the original frame (which triggers pandas' SettingWithCopyWarning).
is_msi = is_msi[is_msi["CCLE.MSI.call"].isin(["inferred-MSI", "inferred-MSS"])].copy()

# Boolean flag: True for "inferred-MSI", False for "inferred-MSS".
is_msi["MSI"] = is_msi["CCLE.MSI.call"] == "inferred-MSI"

In [10]:
# Write the filtered MSI table to HDF5. The frame still holds object (string)
# columns, which PyTables pickles -- hence the performance warning in this
# cell's output.
is_msi.to_hdf(
    path_or_buf="processed/depmap/CCLE_MSI.h5",
    key="is_msi",
)

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->['CCLE_ID', 'GDSC.msi.call', 'CCLE.MSI.call']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


# Cell line metadata

In [13]:
# Alternative input file, kept for reference:
# metadata = pd.read_csv("raw/depmap/sample_info.csv",index_col=0)
metadata = pd.read_csv("raw/depmap/sample-info-19q2_v2-achiles-sample-info-full.csv",index_col=0)

# Human-readable disease label: underscores become spaces and the text is
# capitalised (first character upper-case, the rest lower-case). The
# vectorised .str methods pass missing values through as NaN, whereas a
# per-row str.replace call would raise AttributeError on a NaN float.
metadata["display_disease"] = (
    metadata["disease"]
    .str.replace("_", " ", regex=False)
    .str.capitalize()
)

# A label that is exactly one space (e.g. from a disease value of "_")
# carries no information, so it is shown as "Unknown". Missing disease
# values are given the same label so every row has a usable display string.
metadata["display_disease"] = (
    metadata["display_disease"]
    .replace(" ", "Unknown")
    .fillna("Unknown")
)

metadata.to_csv("processed/depmap/sample_info.csv")

# DepMap gene expression

In [6]:
# Re-save the expression CSV as HDF5 without modifying its contents;
# the first CSV column becomes the index.
depmap_genex = pd.read_csv(
    filepath_or_buffer="raw/depmap/CCLE_expression.csv",
    index_col=0,
)
depmap_genex.to_hdf(
    path_or_buf="processed/depmap/CCLE_expression.h5",
    key="depmap_genex",
)