# Overview
1. Normal split
   
**Import**:
   - `datasets/sensitivity/DrugSens.csv`

**Export**:
   - `datasets/sensitivity/pivot/{regr,clas}/`
   - `datasets/sensitivity/stack/{regr,clas}/`

# Method

Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedGroupKFold
import pickle as pkl
import csv
import os
import pickle as pkl

In [2]:
# Load the raw drug-sensitivity table; the first CSV column is used as the index.
raw_df = pd.read_csv(
    'datasets/sensitivity/DrugSens.csv',
    index_col=0,
)
# Seeded, shuffled 5-fold stratified group splitter.
# NOTE(review): not referenced by any cell visible in this notebook - confirm it is still needed.
splitter = StratifiedGroupKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [3]:
# Number of rows per cell line, then per drug, each printed largest-first.
# `grouped_df` / `sorted_df` end up holding the per-drug tables, as before.
for group_key in ("cell_line", "gdsc_name"):
    grouped_df = raw_df.groupby(group_key).size().reset_index(name='count')
    sorted_df = grouped_df.sort_values(by='count', ascending=False)
    print(sorted_df)

      cell_line  count
383  ACH-000651    228
317  ACH-000552    228
15   ACH-000030    228
339  ACH-000580    228
118  ACH-000219    227
..          ...    ...
340  ACH-000581    138
610  ACH-000999    105
438  ACH-000743     88
432  ACH-000725     13
409  ACH-000689     11

[676 rows x 2 columns]
          gdsc_name  count
121          MG-132    676
127         MK-2206    675
191   STAUROSPORINE    675
1    5-FLUOROURACIL    675
151       PD0325901    675
..              ...    ...
203       TRETINOIN    128
188          SL0101    128
46            BX795    128
51       CHIR-99021    128
48       CCT-018159    128

[228 rows x 2 columns]


In [4]:
# Number of rows per cancer type, printed largest-first.
rows_per_cancer_type = raw_df.groupby("cancer_type").size()
grouped_df = rows_per_cancer_type.reset_index(name='count')
sorted_df = grouped_df.sort_values(by='count', ascending=False)
print(sorted_df)

                   cancer_type  count
13                 Lung Cancer  27243
11                    Leukemia  10464
3                Breast Cancer   9046
5      Colon/Colorectal Cancer   8719
14                    Lymphoma   8609
2                 Brain Cancer   7444
21                 Skin Cancer   7111
17              Ovarian Cancer   6656
7            Esophageal Cancer   5901
18           Pancreatic Cancer   5656
8               Gastric Cancer   4609
9         Head and Neck Cancer   4244
10               Kidney Cancer   3405
1                  Bone Cancer   3366
16               Neuroblastoma   3242
0               Bladder Cancer   3240
15                     Myeloma   3123
6   Endometrial/Uterine Cancer   2900
12                Liver Cancer   2754
20                     Sarcoma   2311
4              Cervical Cancer   2299
22              Thyroid Cancer   2290
19             Prostate Cancer   1238


In [5]:
# Number of distinct cell lines observed for each cancer type.
cell_lines_by_cancer = raw_df.groupby("cancer_type")['cell_line']
grouped_df = cell_lines_by_cancer.nunique()
print(grouped_df)

cancer_type
Bladder Cancer                 16
Bone Cancer                    16
Brain Cancer                   37
Breast Cancer                  45
Cervical Cancer                11
Colon/Colorectal Cancer        42
Endometrial/Uterine Cancer     14
Esophageal Cancer              29
Gastric Cancer                 23
Head and Neck Cancer           21
Kidney Cancer                  17
Leukemia                       53
Liver Cancer                   13
Lung Cancer                   137
Lymphoma                       43
Myeloma                        15
Neuroblastoma                  17
Ovarian Cancer                 34
Pancreatic Cancer              28
Prostate Cancer                 6
Sarcoma                        12
Skin Cancer                    36
Thyroid Cancer                 11
Name: cell_line, dtype: int64


In [8]:
def save_drugsens(df_regr: pd.DataFrame, filename: str) -> None:
    """Write one data split to disk in four CSV variants.

    The frame is sorted by ``gdsc_name`` then ``cell_line`` and saved as:

    * ``datasets/sensitivity/stack/regr/<filename>``: the sorted long table
      with the original ``IC50`` values (no index column).
    * ``datasets/sensitivity/stack/clas/<filename>``: the same table with
      ``IC50`` replaced by ``1`` where ``IC50 >= -1`` and ``0`` otherwise.
    * ``datasets/sensitivity/pivot/regr/<filename>`` and
      ``datasets/sensitivity/pivot/clas/<filename>``: the corresponding
      drug x cell-line matrices (rows labelled ``drug_name``).

    Parameters
    ----------
    df_regr : pd.DataFrame
        Long-format table with at least the columns ``gdsc_name``,
        ``cell_line`` and ``IC50``. It is not modified.
    filename : str
        Output file name; ``.csv`` is appended when it is missing.

    Raises
    ------
    ValueError
        Propagated from ``DataFrame.pivot`` when a (``gdsc_name``,
        ``cell_line``) pair occurs more than once.
    """
    if not filename.endswith('.csv'):
        filename += '.csv'

    # Sort into a new frame: an in-place sort would silently reorder the
    # caller's DataFrame (e.g. the train/test splits) as a side effect.
    df_regr = df_regr.sort_values(by=['gdsc_name', 'cell_line'])
    df_regr_pivotted = df_regr.pivot(index='gdsc_name', columns='cell_line', values='IC50')
    df_regr_pivotted.columns.name = ''
    df_regr_pivotted.index.name = 'drug_name'

    df_clas = df_regr.copy()
    # Vectorised threshold. NaN compares False and therefore maps to 0,
    # exactly as the former per-row lambda did.
    df_clas["IC50"] = (df_clas["IC50"] >= -1).astype(int)
    df_clas_pivotted = df_clas.pivot(index='gdsc_name', columns='cell_line', values='IC50')
    df_clas_pivotted.columns.name = ''
    df_clas_pivotted.index.name = 'drug_name'

    # Create the target folders so the function also works on a fresh checkout.
    for out_dir in (
        'datasets/sensitivity/stack/regr/',
        'datasets/sensitivity/stack/clas/',
        'datasets/sensitivity/pivot/regr/',
        'datasets/sensitivity/pivot/clas/',
    ):
        os.makedirs(out_dir, exist_ok=True)

    df_regr.to_csv('datasets/sensitivity/stack/regr/' + filename, index=False)
    df_clas.to_csv('datasets/sensitivity/stack/clas/' + filename, index=False)

    df_regr_pivotted.to_csv('datasets/sensitivity/pivot/regr/' + filename)
    df_clas_pivotted.to_csv('datasets/sensitivity/pivot/clas/' + filename)

## Normal split

In [9]:
# Per-drug sample totals do not depend on the seed, so compute them once.
drug_totals = raw_df.groupby("gdsc_name").size()

# For a range of seeds, report how evenly a cell-line-stratified 80/20 split
# distributes each drug's samples into the test set
# (printed: seed, mean, std, max, min of the per-drug test fraction).
for seed in range(0, 2000, 54):
    train_df, test_df = train_test_split(raw_df, test_size=0.20, random_state=seed, stratify=raw_df['cell_line'])
    # Align on drug name instead of positional index: a drug missing from the
    # test split then counts as 0 rather than shifting every later row.
    test_totals = test_df.groupby("gdsc_name").size().reindex(drug_totals.index, fill_value=0)
    test_ratio = test_totals / drug_totals
    print(seed, test_ratio.mean(), test_ratio.std(), test_ratio.max(), test_ratio.min())

0 0.19948413718333877 0.017325762148525364 0.23908918406072105 0.1328125
54 0.19916546067327226 0.017832266148282833 0.2734375 0.12727272727272726
108 0.19934855586746 0.01826111579680017 0.24096385542168675 0.140625
162 0.20039337059422435 0.01935192847685036 0.2891566265060241 0.13253012048192772
216 0.19977591086669988 0.016446427448335858 0.25946969696969696 0.15625
270 0.199571644944076 0.01668869400371568 0.25 0.13855421686746988
324 0.20012954410167327 0.01706191649244701 0.2578125 0.15060240963855423
378 0.20040122682676728 0.017282731243662097 0.2734375 0.15625
432 0.19965416122188495 0.017151425329443425 0.28125 0.140625
486 0.19964943028008053 0.018072640369610905 0.25301204819277107 0.15625
540 0.19941705691958567 0.01793156440156358 0.26370757180156656 0.125
594 0.20031663722582874 0.01842285080620973 0.2734375 0.14736842105263157
648 0.19972403398915392 0.017440192683041783 0.25301204819277107 0.1484375
702 0.20022898407059278 0.018451128107192852 0.2578125 0.144578313253

In [10]:
# Final train/test split: 20% of the rows held out, stratified by cell line.
train_df, test_df = train_test_split(raw_df, test_size=0.20, random_state=50, stratify=raw_df['cell_line'])

# Sanity check: fraction of each cell line's rows that landed in the test set.
test_per_cell_line = test_df.groupby("cell_line").size()
all_counts = raw_df.groupby("cell_line").size().reset_index(name='count').sort_values(by='cell_line', ascending=True)
# Look the test counts up by cell-line name rather than by positional index,
# so a cell line absent from the test set yields 0 instead of misaligning rows.
all_counts["test_count"] = all_counts["cell_line"].map(test_per_cell_line).fillna(0).astype(int)

all_counts["ratio"] = all_counts["test_count"] / all_counts["count"]
all_counts['ratio'].mean(), all_counts['ratio'].std()

(np.float64(0.19999925434527918), np.float64(0.00231922890842857))

In [11]:
# Headline numbers for the final train/test split.
print("== Normal split summary ==")
summary_rows = [
    ("Number of drugs on train set:", train_df['gdsc_name'].nunique()),
    ("Number of drugs on test set:", test_df['gdsc_name'].nunique()),
    ("Number of cell lines on train set:", train_df['cell_line'].nunique()),
    ("Number of cell lines on test set:", test_df['cell_line'].nunique()),
    ("Number of samples on train set:", len(train_df)),
    ("Number of samples on test set:", len(test_df)),
]
for label, value in summary_rows:
    print(label, value)

== Normal split summary ==
Number of drugs on train set: 228
Number of drugs on test set: 228
Number of cell lines on train set: 676
Number of cell lines on test set: 676
Number of samples on train set: 108696
Number of samples on test set: 27174


In [12]:
# Halve the training set (stratified by cell line), then split the second half
# 80/20 into the train/validation subsamples that are exported below.
tvhyper_df, tvhyper_subsampling_df = train_test_split(train_df, test_size=0.50, random_state=50, stratify=train_df['cell_line'])
trainhyper_subsampling_df, validhyper_subsampling_df = train_test_split(tvhyper_subsampling_df, test_size=0.20, random_state=50, stratify=tvhyper_subsampling_df['cell_line'])

# Sanity check: fraction of each cell line's subsample rows that landed in validation.
valid_per_cell_line = validhyper_subsampling_df.groupby("cell_line").size()
all_counts = tvhyper_subsampling_df.groupby("cell_line").size().reset_index(name='count').sort_values(by='cell_line', ascending=True)
# Look the validation counts up by cell-line name rather than by positional index,
# so a cell line absent from the validation set yields 0 instead of misaligning rows.
all_counts["test_count"] = all_counts["cell_line"].map(valid_per_cell_line).fillna(0).astype(int)

all_counts["ratio"] = all_counts["test_count"] / all_counts["count"]
all_counts['ratio'].mean(), all_counts['ratio'].std()

(np.float64(0.20000503297255207), np.float64(0.0030310613902500756))

### Export

In [None]:
# Make sure every output folder exists before anything is written.
for out_dir in (
    'datasets/sensitivity/stack/regr/',
    'datasets/sensitivity/stack/clas/',
    'datasets/sensitivity/pivot/regr/',
    'datasets/sensitivity/pivot/clas/',
):
    os.makedirs(out_dir, exist_ok=True)

# Persist each split under its own file name.
for split_df, split_filename in (
    (train_df, 'DrugSens-Train.csv'),
    (test_df, 'DrugSens-Test.csv'),
    (trainhyper_subsampling_df, 'DrugSens-Trainhyper-Subsampling.csv'),
    (validhyper_subsampling_df, 'DrugSens-Validhyper-Subsampling.csv'),
):
    save_drugsens(split_df, split_filename)