# NCI60 exploratory analysis

In [1]:
import os
from pathlib import Path

import pandas as pd

In [2]:
# Folder holding the raw FS-Tox data files. Set the FS_TOX_RAW_DIR environment
# variable to run this notebook on another machine; the default is the path
# this notebook was originally run against.
RAW_DIR = Path(os.environ.get("FS_TOX_RAW_DIR", "/Users/sethhowes/Desktop/FS-Tox/data/raw"))

# Raw NCI60 concentration/response table. This is a large file (tens of
# millions of rows), so this cell is slow and memory-hungry.
# NOTE(review): the file has an "Unnamed: 0" column, presumably a saved index;
# consider index_col=0 once downstream cells no longer rely on it.
df = pd.read_csv(RAW_DIR / "nci60_2023.csv")

In [3]:
# Preview the first rows of the raw table to see its columns and value formats.
df.head()

Unnamed: 0,RELEASE_DATE,EXPID,PREFIX,NSC,CONCENTRATION_UNIT,LOG_HI_CONCENTRATION,CONCENTRATION,PANEL_NUMBER,CELL_NUMBER,PANEL_NAME,CELL_NAME,PANEL_CODE,COUNT_GIPRCNT,AVERAGE_GIPRCNT,STDDEV_GIPRCNT,COUNT_PTC,AVERAGE_PTC,STDDEV_PTC
0,20210223,0001MD02,S,123127,M,-4.6021,-4.6021,1,29,Non-Small Cell Lung Cancer,HOP-92,LNS,1,-45.9064,0.0,1,22.4242,0.0
1,20210223,0001MD02,S,123127,M,-4.6021,-4.6021,12,5,CNS Cancer,SNB-75,CNS,1,-65.7534,0.0,1,14.5068,0.0
2,20210223,0001MD02,S,123127,M,-4.6021,-4.6021,4,1,Colon Cancer,HT29,COL,1,-68.2635,0.0,1,6.8123,0.0
3,20210223,0001MD02,S,123127,M,-4.6021,-4.6021,4,9,Colon Cancer,SW-620,COL,1,-15.9259,0.0,1,20.047,0.0
4,20210223,0001MD02,S,123127,M,-4.6021,-4.6021,5,12,Melanoma,MDA-N,MEL,1,-78.4091,0.0,1,5.9468,0.0


In [42]:
# (rows, columns) of the raw concentration/response table.
df.shape

(23485461, 18)

CONCENTRATION/RESPONSE DATA
- RELEASE_DATE The date of this data release.
- EXPID Please see the General Comments in the source NCI60 documentation (not reproduced in this notebook).
- PREFIX The identifier of the sequence from which an NSC number was assigned. All public data are in the S series.
- NSC The numeric identifier in the S series.
- CONCENTRATION_UNIT Please see the General Comments in the source NCI60 documentation (not reproduced in this notebook).
- LOG_HI_CONCENTRATION The log10 of the highest concentration of the concentration/response data.
- CONCENTRATION The log10 of the concentration in the dilution series.
- PANEL_NUMBER Internal identifier. The combinations of panel_number and cell_number are unique cell line identifiers.
- CELL_NUMBER Internal identifier. The combinations of panel_number and cell_number are unique cell line identifiers.
- PANEL_NAME The name of the NCI cell line panel (cancer type).
- CELL_NAME The name of the NCI cell line.
- PANEL_CODE An abbreviation for the panel_name.
- COUNT_GIPRCNT Count of GIPRCNT values.
- AVERAGE_GIPRCNT Average of GIPRCNT values.
- STDDEV_GIPRCNT Standard deviation of GIPRCNT values.
- COUNT_PTC Count of PTC values.
- AVERAGE_PTC Average of PTC values.
- STDDEV_PTC Standard deviation of PTC values.

As far as I can tell from the [NCI60 documentation](https://wiki.nci.nih.gov/display/NCIDTPdata/NCI-60+Growth+Inhibition+Data), the raw values above are interpolated to derive LC50, IC50, GI50, and TGI values, and each of these endpoints is stored in its own file. I will now load those files to check whether the raw data has indeed been processed into multiple target endpoints.

In [28]:
# Folder holding the per-endpoint NCI60 files. The raw-data root can be
# overridden with the FS_TOX_RAW_DIR environment variable; the default is the
# path this notebook was originally run against.
NCI60_DIR = Path(os.environ.get("FS_TOX_RAW_DIR", "/Users/sethhowes/Desktop/FS-Tox/data/raw")) / "nci60"

# One DataFrame per endpoint file.
gi50_df = pd.read_csv(NCI60_DIR / "GI50.csv")
ic50_df = pd.read_csv(NCI60_DIR / "IC50.csv")
lc50_df = pd.read_csv(NCI60_DIR / "LC50.csv")
oneconc_df = pd.read_csv(NCI60_DIR / "ONECONC.csv")
tgi_df = pd.read_csv(NCI60_DIR / "TGI.csv")

In [29]:
# Preview the GI50 file; compare its columns with the raw table above.
gi50_df.head()

Unnamed: 0,RELEASE_DATE,EXPID,PREFIX,NSC,CONCENTRATION_UNIT,LOG_HI_CONCENTRATION,PANEL_NUMBER,CELL_NUMBER,PANEL_NAME,CELL_NAME,PANEL_CODE,COUNT,AVERAGE,STDDEV
0,20210223,0001MD02,S,123127,M,-4.6021,1,1,Non-Small Cell Lung Cancer,NCI-H23,LNS,1,-7.1391,0.0
1,20210223,0001MD02,S,123127,M,-4.6021,10,14,Melanoma,M14,MEL,1,-7.052,0.0
2,20210223,0001MD02,S,123127,M,-4.6021,12,5,CNS Cancer,SNB-75,CNS,1,-7.138,0.0
3,20210223,0001MD02,S,123127,M,-4.6021,4,2,Colon Cancer,HCC-2998,COL,1,-6.9426,0.0
4,20210223,0001MD02,S,123127,M,-4.6021,5,5,Breast Cancer,MDA-MB-231/ATCC,BRE,1,-6.4485,0.0


In [30]:
# Preview the IC50 file.
ic50_df.head()

Unnamed: 0,RELEASE_DATE,EXPID,PREFIX,NSC,CONCENTRATION_UNIT,LOG_HI_CONCENTRATION,PANEL_NUMBER,CELL_NUMBER,PANEL_NAME,CELL_NAME,PANEL_CODE,COUNT,AVERAGE,STDDEV
0,20210223,0001MD02,S,123127,M,-4.6021,10,20,Melanoma,UACC-62,MEL,1,-6.8434,0.0
1,20210223,0001MD02,S,123127,M,-4.6021,10,21,Melanoma,UACC-257,MEL,1,-6.1932,0.0
2,20210223,0001MD02,S,123127,M,-4.6021,11,1,Prostate Cancer,PC-3,PRO,1,-6.3816,0.0
3,20210223,0001MD02,S,123127,M,-4.6021,12,2,CNS Cancer,SNB-19,CNS,1,-7.0474,0.0
4,20210223,0001MD02,S,123127,M,-4.6021,4,1,Colon Cancer,HT29,COL,1,-6.4681,0.0


In [31]:
# Preview the LC50 file.
lc50_df.head()

Unnamed: 0,RELEASE_DATE,EXPID,PREFIX,NSC,CONCENTRATION_UNIT,LOG_HI_CONCENTRATION,PANEL_NUMBER,CELL_NUMBER,PANEL_NAME,CELL_NAME,PANEL_CODE,COUNT,AVERAGE,STDDEV
0,20210223,0001MD02,S,123127,M,-4.6021,10,21,Melanoma,UACC-257,MEL,1,-4.6021,0.0
1,20210223,0001MD02,S,123127,M,-4.6021,12,5,CNS Cancer,SNB-75,CNS,1,-4.8872,0.0
2,20210223,0001MD02,S,123127,M,-4.6021,5,6,Breast Cancer,HS 578T,BRE,1,-4.6021,0.0
3,20210223,0001MD02,S,123127,M,-4.6021,7,5,Leukemia,K-562,LEU,1,-4.6021,0.0
4,20210223,0001MD02,S,123127,M,-4.6021,9,15,Renal Cancer,CAKI-1,REN,1,-4.6021,0.0


In [32]:
# Preview the ONECONC file.
oneconc_df.head()

Unnamed: 0,RELEASE_DATE,EXPID,PREFIX,NSC,CONCENTRATION_UNIT,CONCENTRATION,PANEL_NUMBER,CELL_NUMBER,PANEL_NAME,CELL_NAME,PANEL_CODE,COUNT_GIPRCNT,AVERAGE_GIPRCNT,STDDEV_GIPRCNT
0,20210223,0001OM01,S,123127,M,2.5e-05,1,1,Non-Small Cell Lung Cancer,NCI-H23,LNS,1,-100.0,0.0
1,20210223,0001OM01,S,123127,M,2.5e-05,1,17,Non-Small Cell Lung Cancer,NCI-H322M,LNS,1,3.0511,0.0
2,20210223,0001OM01,S,123127,M,2.5e-05,1,4,Non-Small Cell Lung Cancer,A549/ATCC,LNS,1,0.5431,0.0
3,20210223,0001OM01,S,123127,M,2.5e-05,10,14,Melanoma,M14,MEL,1,12.5919,0.0
4,20210223,0001OM01,S,123127,M,2.5e-05,10,5,Melanoma,SK-MEL-2,MEL,1,-100.0,0.0


In [33]:
# Preview the TGI file.
tgi_df.head()

Unnamed: 0,RELEASE_DATE,EXPID,PREFIX,NSC,CONCENTRATION_UNIT,LOG_HI_CONCENTRATION,PANEL_NUMBER,CELL_NUMBER,PANEL_NAME,CELL_NAME,PANEL_CODE,COUNT,AVERAGE,STDDEV
0,20210223,0001MD02,S,123127,M,-4.6021,1,1,Non-Small Cell Lung Cancer,NCI-H23,LNS,1,-6.2479,0.0
1,20210223,0001MD02,S,123127,M,-4.6021,1,21,Non-Small Cell Lung Cancer,NCI-H460,LNS,1,-6.4512,0.0
2,20210223,0001MD02,S,123127,M,-4.6021,1,29,Non-Small Cell Lung Cancer,HOP-92,LNS,1,-5.4165,0.0
3,20210223,0001MD02,S,123127,M,-4.6021,1,8,Non-Small Cell Lung Cancer,EKVX,LNS,1,-5.1418,0.0
4,20210223,0001MD02,S,123127,M,-4.6021,10,20,Melanoma,UACC-62,MEL,1,-6.6382,0.0


In [10]:
# (rows, columns) of the GI50 table.
gi50_df.shape

(4630727, 14)

In [9]:
# (rows, columns) of the IC50 table.
ic50_df.shape

(4656639, 14)

In [36]:
# (rows, columns) of the LC50 table.
lc50_df.shape

(4603904, 14)

In [35]:
# (rows, columns) of the ONECONC table.
oneconc_df.shape

(4429862, 14)

In [37]:
# (rows, columns) of the TGI table.
tgi_df.shape

(4616477, 14)

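The five endpoint files each contain a similar number of records (roughly 4.4–4.7 million), about a fifth of the 23.5 million rows in the raw concentration/response file. I will now check the counts of records for a few EXPIDs.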

In [19]:
# Number of IC50 rows whose EXPID is 0001MD02.
ic50_df["EXPID"].eq("0001MD02").sum()

114

In [20]:
# Number of GI50 rows whose EXPID is 0001MD02.
gi50_df["EXPID"].eq("0001MD02").sum()

110

In [38]:
# Number of LC50 rows whose EXPID is 0001MD02.
lc50_df["EXPID"].eq("0001MD02").sum()

575

In [39]:
# Number of ONECONC rows whose EXPID is 0001MD02.
oneconc_df["EXPID"].eq("0001MD02").sum()

0

In [40]:
# Number of TGI rows whose EXPID is 0001MD02.
tgi_df["EXPID"].eq("0001MD02").sum()

114

In [21]:
# Number of raw concentration/response rows whose EXPID is 0001MD02.
df["EXPID"].eq("0001MD02").sum()

575

In [23]:
# Show the raw rows for EXPID 0001MD02 (pandas truncates the display).
is_target_expid = df["EXPID"].eq("0001MD02")
df.loc[is_target_expid]

Unnamed: 0,RELEASE_DATE,EXPID,PREFIX,NSC,CONCENTRATION_UNIT,LOG_HI_CONCENTRATION,CONCENTRATION,PANEL_NUMBER,CELL_NUMBER,PANEL_NAME,CELL_NAME,PANEL_CODE,COUNT_GIPRCNT,AVERAGE_GIPRCNT,STDDEV_GIPRCNT,COUNT_PTC,AVERAGE_PTC,STDDEV_PTC
0,20210223,0001MD02,S,123127,M,-4.6021,-4.6021,1,29,Non-Small Cell Lung Cancer,HOP-92,LNS,1,-45.9064,0.0,1,22.4242,0.0
1,20210223,0001MD02,S,123127,M,-4.6021,-4.6021,12,5,CNS Cancer,SNB-75,CNS,1,-65.7534,0.0,1,14.5068,0.0
2,20210223,0001MD02,S,123127,M,-4.6021,-4.6021,4,1,Colon Cancer,HT29,COL,1,-68.2635,0.0,1,6.8123,0.0
3,20210223,0001MD02,S,123127,M,-4.6021,-4.6021,4,9,Colon Cancer,SW-620,COL,1,-15.9259,0.0,1,20.0470,0.0
4,20210223,0001MD02,S,123127,M,-4.6021,-4.6021,5,12,Melanoma,MDA-N,MEL,1,-78.4091,0.0,1,5.9468,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5872501,20210223,0001MD02,S,19893,M,-2.6021,-6.6021,4,9,Colon Cancer,SW-620,COL,1,101.6686,0.0,1,101.3347,0.0
5872502,20210223,0001MD02,S,19893,M,-2.6021,-6.6021,5,12,Melanoma,MDA-N,MEL,1,130.2675,0.0,1,122.7080,0.0
5872503,20210223,0001MD02,S,19893,M,-2.6021,-6.6021,7,3,Leukemia,CCRF-CEM,LEU,1,90.9784,0.0,1,93.4307,0.0
5872504,20210223,0001MD02,S,19893,M,-2.6021,-6.6021,7,5,Leukemia,K-562,LEU,1,93.9297,0.0,1,95.3975,0.0
