In [54]:
from pathlib import Path

import numpy as np
import pandas as pd

PROJECT_ROOT = Path.cwd().parent
DATA_DIR = PROJECT_ROOT / "data"

In [58]:
res = pd.DataFrame()
for i in ["nci", "gdsc1", "gdsc2", "ctrp"]:
    flist = !ls ../data/gdsc1_data/ | grep exp
    tmp = pd.concat([pd.read_csv(f"../data/{i}_data/{l}", index_col=0) for l in flist])

    s = pd.Series(tmp.values.flatten())

    # 統計量をまとめて表示
    summary = pd.DataFrame(
        {"mean": [s.mean()], "var": [s.var()], "max": [s.max()], "min": [s.min()]}
    )
    res = pd.concat([res, summary])
res.index = ["nci", "gdsc1", "gdsc2", "ctrp"]
res

Unnamed: 0,mean,var,max,min
nci,6.001992,3.730668,15.264,1.438
gdsc1,4.869682,4.687176,14.327684,2.098777
gdsc2,4.869682,4.687176,14.327684,2.098777
ctrp,5.800779,3.931315,14.694787,2.643324


In [55]:
res = pd.DataFrame()
for i in ["nci", "gdsc1", "gdsc2", "ctrp"]:
    if i == "nci":
        tmp = pd.read_csv(f"../data/{i}_data/met.csv", index_col=0)
    else:
        flist = !ls ../data/gdsc1_data/ | grep met
        tmp = pd.concat(
            [pd.read_csv(f"../data/{i}_data/{l}", index_col=0) for l in flist]
        )

    s = pd.Series(tmp.values.flatten())

    # 統計量をまとめて表示
    summary = pd.DataFrame(
        {"mean": [s.mean()], "var": [s.var()], "max": [s.max()], "min": [s.min()]}
    )
    res = pd.concat([res, summary])
res.index = ["nci", "gdsc1", "gdsc2", "ctrp"]
res

Unnamed: 0,mean,var,max,min
nci,0.277257,0.078327,0.977,0.021
gdsc1,0.296364,0.095969,1.0,0.0
gdsc2,0.296364,0.095969,1.0,0.0
ctrp,0.391387,0.10618,1.0,0.0


In [56]:
res = pd.DataFrame()
for i in ["nci", "gdsc1", "gdsc2", "ctrp"]:
    if i == "nci":
        tmp = pd.read_csv(f"../data/{i}_data/cop.csv", index_col=0)
    elif i == "ctrp":
        flist = !ls ../data/gdsc1_data/ | grep cop
        tmp = pd.concat(
            [
                pd.read_csv(f"../data/ctrp_data/{l}", index_col=1).drop("Gene", axis=1)
                for l in flist
            ]
        )
    else:
        flist = !ls ../data/gdsc1_data/ | grep cop
        tmp = pd.concat(
            [pd.read_csv(f"../data/{i}_data/{l}", index_col=0) for l in flist]
        )

    s = pd.Series(tmp.values.flatten())

    # 統計量をまとめて表示
    summary = pd.DataFrame(
        {"mean": [s.mean()], "var": [s.var()], "max": [s.max()], "min": [s.min()]}
    )
    res = pd.concat([res, summary])
res.index = ["nci", "gdsc1", "gdsc2", "ctrp"]
res

Unnamed: 0,mean,var,max,min
nci,0.01817,0.055442,8.686,-4.794
gdsc1,0.278598,1.469246,2.906891,-10.965784
gdsc2,0.278598,1.469246,2.906891,-10.965784
ctrp,0.391387,0.10618,1.0,0.0


In [57]:
res = pd.DataFrame()
for i in ["nci", "gdsc1", "gdsc2", "ctrp"]:
    tmp = pd.read_csv(f"../data/{i}_data/mut.csv", index_col=0)
    s = pd.Series(tmp.values.flatten())

    # 統計量をまとめて表示
    summary = pd.DataFrame(
        {"mean": [s.mean()], "var": [s.var()], "max": [s.max()], "min": [s.min()]}
    )
    res = pd.concat([res, summary])
res.index = ["nci", "gdsc1", "gdsc2", "ctrp"]
res

Unnamed: 0,mean,var,max,min
nci,1.801504,106.163834,100.0,0.0
gdsc1,0.005361,0.003329,1.0,0.0
gdsc2,0.005361,0.003329,1.0,0.0
ctrp,0.016675,0.012062,1.0,0.0


# NCI

In [127]:
# データ読み込み
path = DATA_DIR / "nci_data"

# 各データセット
drugAct = pd.read_csv(path / "drugAct.csv", index_col=0).T
gene_exp_files = sorted(path.glob("gene_exp_part*.csv.gz"))
exprs = pd.concat([pd.read_csv(f, index_col=0) for f in gene_exp_files]).T
exprs = exprs[exprs.notna().sum(axis=1) != 0]
mut = pd.concat([pd.read_csv(f, index_col=0) for f in sorted(path.glob("mut*"))]).T
met = pd.concat([pd.read_csv(f, index_col=0) for f in sorted(path.glob("met*"))]).T
cop = pd.concat([pd.read_csv(f, index_col=0) for f in sorted(path.glob("cop*"))]).T

# 共通細胞株
common_cells = (
    set(drugAct.index)
    & set(exprs.index)
    & set(mut.index)
    & set(met.index)
    & set(cop.index)
)
np.save("nci_cells.npy", np.array(list(common_cells)))  # 保存


# 統計取得関数
def get_stats(name, df):
    return {
        "Dataset": name,
        "Num Cells": int(df.shape[0]),
        "Num Features": int(df.shape[1]),
        "Missing Values": int(df.isna().sum().sum()),
    }


# 各データセットの統計
stats = [
    get_stats("Gene Expression", exprs),
    get_stats("Mutation", mut),
    get_stats("Copy Number Variation", cop),
    get_stats("Methylation", met),
    get_stats("Drug Activity", drugAct),
]

# 共通細胞株数の情報（NAを使う）
stats.append(
    {
        "Dataset": "Common Cell Lines",
        "Num Cells": len(common_cells),
        "Num Features": pd.NA,
        "Missing Values": pd.NA,
    }
)

# DataFrame化し、型を明示
stats_df = pd.DataFrame(stats).astype(
    {
        "Dataset": "string",
        "Num Cells": "int",
        "Num Features": "Int64",
        "Missing Values": "Int64",
    }
)
stats_df

Unnamed: 0,Dataset,Num Cells,Num Features,Missing Values
0,Gene Expression,59,23059.0,138.0
1,Mutation,60,9307.0,0.0
2,Copy Number Variation,60,23232.0,2661.0
3,Methylation,60,17553.0,824.0
4,Drug Activity,60,977.0,44049.0
5,Common Cell Lines,59,,


In [128]:
set(drugAct.index) - set(exprs.index)

{'SF_539'}

# GDSC1

In [129]:
# データ読み込み
path = DATA_DIR / "gdsc1_data"

# 各データセット
drugAct = pd.read_csv("../../PharmacoDB2binary/data/gdsc1.csv")
drugAct = drugAct.pivot_table(index="cell_line", columns="drug", values="IC50")
drugAct.index = list(drugAct.index)
drugAct.columns = list(drugAct.columns)
conv = dict(pd.read_csv("pharmacodb_GDSC1.csv").dropna()[["pdb", "rcm"]].values)
renamed = drugAct[drugAct.index.isin(list(conv.keys()))]
renamed.index = [conv[i] for i in renamed.index]
gene_exp_files = sorted(path.glob("gene_exp_part*.csv.gz"))
exprs = pd.concat([pd.read_csv(f, index_col=0).T for f in gene_exp_files], axis=1)
exprs = exprs[exprs.notna().sum(axis=1) != 0]
mut = pd.concat([pd.read_csv(f, index_col=0) for f in sorted(path.glob("mut*"))]).T
met = pd.concat([pd.read_csv(f, index_col=0) for f in sorted(path.glob("met*"))]).T
cop = pd.concat([pd.read_csv(f, index_col=0) for f in sorted(path.glob("cop*"))]).T

# 共通細胞株
common_cells = (
    set(renamed.index)
    & set(exprs.index)
    & set(mut.index)
    & set(met.index)
    & set(cop.index)
)
np.save("gdsc1_cells.npy", np.array(list(common_cells)))  # 保存


# 統計取得関数
def get_stats(name, df):
    return {
        "Dataset": name,
        "Num Cells": int(df.shape[0]),
        "Num Features": int(df.shape[1]),
        "Missing Values": int(df.isna().sum().sum()),
    }


# 各データセットの統計
stats = [
    get_stats("Gene Expression", exprs),
    get_stats("Mutation", mut),
    get_stats("Methylation", met),
    get_stats("Copy Number Variation", cop),
    get_stats("Original Drug Activity", drugAct),
    get_stats("Renamed Drug Activity", renamed),
]

# 共通細胞株数の情報（NAを使う）
stats.append(
    {
        "Dataset": "Common Cell Lines",
        "Num Cells": len(common_cells),
        "Num Features": pd.NA,
        "Missing Values": pd.NA,
    }
)

stats.append(
    {
        "Dataset": "Duplicated Cell Lines",
        "Num Cells": sum(renamed.index.duplicated()),
        "Num Features": pd.NA,
        "Missing Values": pd.NA,
    }
)

# DataFrame化し、型を明示
stats_df = pd.DataFrame(stats).astype(
    {
        "Dataset": "string",
        "Num Cells": "int",
        "Num Features": "Int64",  # nullable integer
        "Missing Values": "Int64",
    }
)
stats_df

Unnamed: 0,Dataset,Num Cells,Num Features,Missing Values
0,Gene Expression,987,19562.0,18974.0
1,Mutation,1084,18099.0,1321227.0
2,Methylation,1084,19864.0,2174189.0
3,Copy Number Variation,1084,24502.0,3009817.0
4,Original Drug Activity,985,339.0,164907.0
5,Renamed Drug Activity,984,339.0,164803.0
6,Common Cell Lines,954,,
7,Duplicated Cell Lines,3,,


In [130]:
np.array(set(renamed.index) - common_cells)

array({'NCI-H2171', 'TALL-1', 'NCI-H128', 'HCC-33', 'NCI-H508', 'OVCA433', 'Hs-939-T', 'YMB-1-E', 'OMC-1', 'CP67-MEL', 'HT', 'NCI-H2081', 'KP-N-RT-BM-1', 'Caov-3', 'KMH-2', 'VMRC-MELG', 'BC-3', 'TMK-1', 'NCI-H460', 'MY-M12', 'KP-N-YS', 'MC-IXC', 'MHH-ES-1', 'VMRC-RCW', 'NK-92MI', 'MDA-MB-175-VII', 'NCI-H1437'},
      dtype=object)

In [131]:
set(drugAct.index) - set(conv.keys())

{'NTERA-2'}

In [132]:
renamed.index[renamed.index.duplicated()]

Index(['KMH-2', 'PC-3', 'TT'], dtype='object')

In [133]:
renamed[renamed.index.isin(renamed.index[renamed.index.duplicated()])]

Unnamed: 0,(-)-Parthenolide,(5Z)-7-Oxozeaenol,5-Fluorouracil,681640,A-443654,A-484954,A-770041,A-83-01,ACY-1215,AGI-6780,...,YK 4-279,YM201636,Z-LLNle-CHO,Z-Leu-leu-leu-al,ZG-10,ZM-447439,ZSTK474,Zibotentan,kb NB 142-70,rTRAIL
KMH-2,8.863282,8.722027,,,0.623629,,,4331.870162,4.875354,,...,1.665832,,0.836372,0.182579,,,2.483312,,8.178379,
KMH-2,,0.376778,,,,,,,3.055368,,...,1.417826,,,,3.841884,,1.837014,,2.677079,0.021962
PC-3,,5.699649,,,,,,,5.214366,,...,4.72078,5.387855,,,,,0.555285,,6.585827,119.043912
PC-3,,4.123904,,,,,,,4.137289,,...,,,,,,,0.68704,,8.570738,
TT,,4.86551,,2.147218,,,,,16.93154,,...,1.325166,4.480454,,,,,,,11.446667,
TT,,13.932949,,2.39285,,,,,32.128805,,...,42.488595,6.8281,,,,2.255354,,,7.62226,


# GDSC2

In [134]:
# データ読み込み
path = DATA_DIR / "gdsc2_data"

# 各データセット
drugAct = pd.read_csv("../../PharmacoDB2binary/data/gdsc2.csv")
drugAct = drugAct.pivot_table(index="cell_line", columns="drug", values="IC50")
drugAct.index = list(drugAct.index)
drugAct.columns = list(drugAct.columns)
conv = dict(pd.read_csv("pharmacodb_GDSC1.csv").dropna()[["pdb", "rcm"]].values)
renamed = drugAct[drugAct.index.isin(list(conv.keys()))]
renamed.index = [conv[i] for i in renamed.index]
gene_exp_files = sorted(path.glob("gene_exp_part*.csv.gz"))
exprs = pd.concat([pd.read_csv(f, index_col=0).T for f in gene_exp_files], axis=1)
exprs = exprs[exprs.notna().sum(axis=1) != 0]
mut = pd.concat([pd.read_csv(f, index_col=0) for f in sorted(path.glob("mut*"))]).T
met = pd.concat([pd.read_csv(f, index_col=0) for f in sorted(path.glob("met*"))]).T
cop = pd.concat([pd.read_csv(f, index_col=0) for f in sorted(path.glob("cop*"))]).T

# 共通細胞株
common_cells = (
    set(renamed.index)
    & set(exprs.index)
    & set(mut.index)
    & set(met.index)
    & set(cop.index)
)
np.save("gdsc2_cells.npy", np.array(list(common_cells)))  # 保存


# 統計取得関数
def get_stats(name, df):
    return {
        "Dataset": name,
        "Num Cells": int(df.shape[0]),
        "Num Features": int(df.shape[1]),
        "Missing Values": int(df.isna().sum().sum()),
    }


# 各データセットの統計
stats = [
    get_stats("Gene Expression", exprs),
    get_stats("Mutation", mut),
    get_stats("Methylation", met),
    get_stats("Copy Number Variation", cop),
    get_stats("Original Drug Activity", drugAct),
    get_stats("Renamed Drug Activity", renamed),
]

# 共通細胞株数の情報（NAを使う）
stats.append(
    {
        "Dataset": "Common Cell Lines",
        "Num Cells": len(common_cells),
        "Num Features": pd.NA,
        "Missing Values": pd.NA,
    }
)

stats.append(
    {
        "Dataset": "Duplicated Cell Lines",
        "Num Cells": sum(renamed.index.duplicated()),
        "Num Features": pd.NA,
        "Missing Values": pd.NA,
    }
)

# DataFrame化し、型を明示
stats_df = pd.DataFrame(stats).astype(
    {
        "Dataset": "string",
        "Num Cells": "int",
        "Num Features": "Int64",  # nullable integer
        "Missing Values": "Int64",
    }
)
stats_df

Unnamed: 0,Dataset,Num Cells,Num Features,Missing Values
0,Gene Expression,987,19562.0,18974.0
1,Mutation,1084,18099.0,1321227.0
2,Methylation,1084,19864.0,2174189.0
3,Copy Number Variation,1084,24502.0,3009817.0
4,Original Drug Activity,809,188.0,82382.0
5,Renamed Drug Activity,808,188.0,82263.0
6,Common Cell Lines,802,,
7,Duplicated Cell Lines,2,,


In [135]:
np.array(set(renamed.index) - common_cells)

array({'NCI-H508', 'MDA-MB-175-VII', 'KMH-2', 'MHH-ES-1'}, dtype=object)

In [136]:
set(drugAct.index) - set(conv.keys())

{'NTERA-2'}

In [137]:
renamed.index[renamed.index.duplicated()]

Index(['PC-3', 'TT'], dtype='object')

# CTRP

In [138]:
# データ読み込み
path = DATA_DIR / "ctrp_data"

# 各データセット
drugAct = pd.read_csv("../../PharmacoDB2binary/data/ctrp.csv")
drugAct = drugAct.pivot_table(index="cell_line", columns="drug", values="IC50")
drugAct.index = list(drugAct.index)
drugAct.columns = list(drugAct.columns)
conv = dict(pd.read_csv("pharmacodb_CTRP.csv").dropna()[["pdb", "rcm"]].values)
renamed = drugAct[drugAct.index.isin(list(conv.keys()))]
renamed.index = [conv[i] for i in renamed.index]
gene_exp_files = sorted(path.glob("gene_exp_part*.csv.gz"))
exprs = pd.concat([pd.read_csv(f, index_col=0).T for f in gene_exp_files], axis=1)
exprs = exprs[exprs.notna().sum(axis=1) != 0]
mut = pd.concat([pd.read_csv(f, index_col=0) for f in sorted(path.glob("mut*"))]).T
met = pd.concat([pd.read_csv(f, index_col=0) for f in sorted(path.glob("met*"))]).T
cop = pd.concat([pd.read_csv(f, index_col=0) for f in sorted(path.glob("cop*"))]).T

# 共通細胞株
common_cells = (
    set(renamed.index)
    & set(exprs.index)
    & set(mut.index)
    & set(met.index)
    & set(cop.index)
)
np.save("ctrp_cells.npy", np.array(list(common_cells)))  # 保存


# 統計取得関数
def get_stats(name, df):
    return {
        "Dataset": name,
        "Num Cells": int(df.shape[0]),
        "Num Features": int(df.shape[1]),
        "Missing Values": int(df.isna().sum().sum()),
    }


# 各データセットの統計
stats = [
    get_stats("Gene Expression", exprs),
    get_stats("Mutation", mut),
    get_stats("Methylation", met),
    get_stats("Copy Number Variation", cop),
    get_stats("Original Drug Activity", drugAct),
    get_stats("Renamed Drug Activity", renamed),
]

# 共通細胞株数の情報（NAを使う）
stats.append(
    {
        "Dataset": "Common Cell Lines",
        "Num Cells": len(common_cells),
        "Num Features": pd.NA,
        "Missing Values": pd.NA,
    }
)

stats.append(
    {
        "Dataset": "Duplicated Cell Lines",
        "Num Cells": sum(renamed.index.duplicated()),
        "Num Features": pd.NA,
        "Missing Values": pd.NA,
    }
)

# DataFrame化し、型を明示
stats_df = pd.DataFrame(stats).astype(
    {
        "Dataset": "string",
        "Num Cells": "int",
        "Num Features": "Int64",  # nullable integer
        "Missing Values": "Int64",
    }
)
stats_df

Unnamed: 0,Dataset,Num Cells,Num Features,Missing Values
0,Gene Expression,1036,19851.0,0.0
1,Mutation,1089,1667.0,320064.0
2,Methylation,1089,19880.0,5234995.0
3,Copy Number Variation,1090,23316.0,8980235.0
4,Original Drug Activity,887,496.0,222597.0
5,Renamed Drug Activity,837,496.0,209416.0
6,Common Cell Lines,820,,
7,Duplicated Cell Lines,0,,


In [139]:
np.array(
    set(drugAct.index) - set(list(conv.keys())) | (set(renamed.index) - common_cells)
)

array({'ASKA', 'BT232', 'BT428', 'RPMI-6666', 'SYO1', 'BT333', 'JHUEM7', 'BT340', 'HS888LU', 'U-CH1', 'LC1SQSF', 'HS578BST', 'SW982', 'BT504', 'BT228', 'BT271', 'HeLa', 'BT440', 'SF268', 'OE21', 'BT224', 'OVCAR-5', 'PANC1', 'BT216', 'BT286', 'BT422', 'BT187', 'DOV13', 'BT239', 'SNB75', 'YAMATO', 'TTC709', 'KRIJ', 'BT145', 'BT179', 'BT164', 'BT231', 'BT444', 'BT320', 'BT248', 'SF539', '2004', 'HSTS', 'BT416', 'BT359', 'TM87-16', 'BT12', 'COLO699', 'BT328', 'EKVX', 'BT245', 'UO31', 'BT139', 'TTTHYROID', 'BT498', 'HOP62', 'MOLT3', 'BT131', 'PSTS', 'BT172', 'BT147', 'BT112', 'BT330', 'HOP92', 'BT159', 'BT16', 'BT482'},
      dtype=object)

In [140]:
np.array(set(renamed.index) - common_cells)

array({'BT12', 'HOP62', 'MOLT3', 'RPMI-6666', 'EKVX', 'DOV13', 'JHUEM7', 'HeLa', 'SNB75', 'SF539', 'SF268', 'TTC709', 'COLO699', 'OE21', 'HOP92', 'OVCAR-5', 'UO31'},
      dtype=object)

In [413]:
tmp = pd.DataFrame(
    np.array(
        [
            [
                i,
                str(i)
                .replace("-", "")
                .replace("/", "")
                .replace(" ", "")
                .replace(".", "")
                .upper(),
            ]
            for i in mut.index
        ]
    )
)
tmp.columns = ["rcm", "naive"]

In [600]:
k = pd.DataFrame(
    np.array(
        [
            [
                i,
                str(i)
                .replace("-", "")
                .replace("/", "")
                .replace(" ", "")
                .replace(".", "")
                .upper(),
            ]
            for i in missing
        ]
    )
)
k.columns = ["pdb", "naive"]
k

Unnamed: 0,pdb,naive
0,CFPAC1,CFPAC1
1,COLO680N,COLO680N
2,DANG,DANG
3,EFO21,EFO21
4,HCC33,HCC33
5,HT29,HT29
6,ISTMES1,ISTMES1
7,JHH6,JHH6
8,JIMT1,JIMT1
9,LC1SQSF,LC1SQSF


In [620]:
for val in tmp[tmp.naive.str.contains("21".upper())]["rcm"]:
    print(val)

CHP-212
COR-L321
EFO-21
HCC2157
HCC2218
Hep3B2-1-7
KY821
LIM1215
MSTO-211H
NCC021
NCI-H2107
NCI-H211
NCI-H2110
NCI-H2122
NCI-H2126
NCI-H2135
NCI-H2141
NCI-H2170
NCI-H2171
NCI-H2172
NCI-H2196
OE21
PL-21
RVH-421
SCLC-21H
STS-0421
TOV-21G


In [457]:
tmp

Unnamed: 0,rcm,naive
0,1321N1,1321N1
1,143B,143B
2,22Rv1,22RV1
3,23132/87,2313287
4,253J,253J
...,...,...
1084,RPMI-6666,RPMI6666
1085,DOV13,DOV13
1086,COLO699,COLO699
1087,OVCAR-5,OVCAR5


In [456]:
print(
    [
        "2004",
        "ASKA",
        "BT112",
        "BT131",
        "BT139",
        "BT145",
        "BT147",
        "BT159",
        "BT16",
        "BT164",
        "BT172",
        "BT179",
        "BT187",
        "BT216",
        "BT224",
        "BT228",
        "BT231",
        "BT232",
        "BT239",
        "BT245",
        "BT248",
        "BT271",
        "BT286",
        "BT320",
        "BT328",
        "BT330",
        "BT333",
        "BT340",
        "BT359",
        "BT416",
        "BT422",
        "BT428",
        "BT440",
        "BT444",
        "BT482",
        "BT498",
        "BT504",
        "HS578BST",
        "HS888LU",
        "HSTS",
        "KRIJ",
        "PSTS",
        "SW982",
        "SYO1",
        "TM8716",
        "TTTHYROID",
        "UCH1",
        "YAMATO",
    ]
)

['2004', 'ASKA', 'BT112', 'BT131', 'BT139', 'BT145', 'BT147', 'BT159', 'BT16', 'BT164', 'BT172', 'BT179', 'BT187', 'BT216', 'BT224', 'BT228', 'BT231', 'BT232', 'BT239', 'BT245', 'BT248', 'BT271', 'BT286', 'BT320', 'BT328', 'BT330', 'BT333', 'BT340', 'BT359', 'BT416', 'BT422', 'BT428', 'BT440', 'BT444', 'BT482', 'BT498', 'BT504', 'HS578BST', 'HS888LU', 'HSTS', 'KRIJ', 'PSTS', 'SW982', 'SYO1', 'TM8716', 'TTTHYROID', 'UCH1', 'YAMATO']


In [558]:
tmp = pd.DataFrame(
    np.array(
        [
            [
                i,
                str(i)
                .replace("-", "")
                .replace("/", "")
                .replace(" ", "")
                .replace(".", "")
                .upper(),
            ]
            for i in mut.index
        ]
    )
)
tmp.columns = ["rcm", "naive"]
k = pd.DataFrame(
    np.array(
        [
            [
                i,
                str(i)
                .replace("-", "")
                .replace("/", "")
                .replace(" ", "")
                .replace(".", "")
                .upper(),
            ]
            for i in drugAct.index
        ]
    )
)
k.columns = ["pdb", "naive"]

In [559]:
j = k.merge(tmp, how="left")
j[np.any(j.isna(), axis=1)].head(50)

Unnamed: 0,pdb,naive,rcm
10,786-O,786O,
28,A3/Kawakami,A3KAWAKAMI,
29,A4/Fukuda,A4FUKUDA,
72,C32 [Human melanoma],C32[HUMANMELANOMA],
134,CS-1 [Human chondrosarcoma],CS1[HUMANCHONDROSARCOMA],
182,EB3 [Human Burkitt lymphoma],EB3[HUMANBURKITTLYMPHOMA],
222,EoL-1,EOL1,
254,HARA [Human squamous cell lung carcinoma],HARA[HUMANSQUAMOUSCELLLUNGCARCINOMA],
289,HH [Human lymphoma],HH[HUMANLYMPHOMA],
317,Hep-G2/C3A,HEPG2C3A,


In [563]:
for val in tmp[tmp.naive.str.contains("H2722".upper())]["rcm"]:
    print(val)

H2722


In [564]:
for val in k[k.naive.str.contains("H2722".upper())]["pdb"]:
    print(val)

NCI-H2722


In [493]:
# pd.concat([
#     pd.read_csv('pharmacodb_GDSC1.csv', index_col=0).dropna(),
#     k.merge(tmp, how='left')
# ]).drop_duplicates().sort_values('naive').reset_index(drop=True).to_csv('pharmacodb_GDSC1.csv')

In [524]:
o = pd.read_csv("pharmacodb_GDSC1.csv", index_col=0)
o

Unnamed: 0,pdb,naive,rcm
0,201T,201T,201T
1,22Rv1,22RV1,22RV1
2,23132/87,2313287,23132-87
3,42-MG-BA,42MGBA,42-MG-BA
4,451Lu,451LU,451Lu
...,...,...,...
1079,YH-13,YH13,YH-13
1080,YKG-1,YKG1,YKG-1
1081,YMB-1-E,YMB1E,YMB-1-E
1082,YT,YT,YT


In [512]:
o[o.pdb == "Hep 3B2.1-7"]

Unnamed: 0,pdb,naive,rcm
296,Hep 3B2.1-7,HEP3B2.17,Hep3B2-1-7
297,Hep 3B2.1-7,HEP3B217,Hep3B2-1-7


In [401]:
renamed.index[renamed.index.duplicated()]

Index(['PC-3', 'TT', 'KMH-2'], dtype='object')

In [450]:
set(drugAct.index) - set(conv.keys())

{'NTERA-2'}