In [None]:
# Cell-line annotations for CCLE characterisation, taken from
# https://www.nature.com/articles/s41467-023-41132-w#MOESM3
# CCLE 1019 has limited info on which cell lines are head and neck SCC (they are only
# labelled as upper aerodigestive tract); this table is used to call out the HNSCC lines.
# File: /content/drive/MyDrive/data/41467_2023_41132_MOESM3_ESM.xlsx
import pandas as pd  # the only other import visible in this cell comes after this first use of pd

cell_line_annots = pd.read_excel('/content/drive/MyDrive/data/41467_2023_41132_MOESM3_ESM.xlsx')
# The first data row is used as the column header and then removed from the data.
cell_line_annots.columns = cell_line_annots.iloc[0]
cell_line_annots = cell_line_annots[1:]

# .copy() gives an independent frame, so the column assignment below is not a
# chained assignment on a selection of cell_line_annots (SettingWithCopyWarning).
cell_line_annots_trunc = cell_line_annots[
    ['Cell line name', 'Primary disease', 'Disease subtype', 'Primary/metastasis']
].copy()

# The .str accessor returns NaN for non-string values, so any name that was read as a
# number would be lost. Cast non-missing names to text first; missing names stay missing.
# NOTE(review): assumes all-digit cell line names may be stored as numbers in the sheet -- confirm.
raw_names = cell_line_annots_trunc['Cell line name']
raw_names = raw_names.where(raw_names.isna(), raw_names.astype(str))

cell_line_annots_trunc['Cell line name'] = (
    raw_names
    .str.replace(r'[\[\]{}()]', '', regex=True)  # Remove brackets but keep contents
    .str.replace(r'[-/ ]', '', regex=True)  # Remove '-', '/', and spaces
    .str.upper()  # Convert to uppercase
)

# Display unique values in the 'Primary disease' column
##unique_primary_disease = cell_line_annots['Primary disease'].unique()
#print("Unique values in 'Primary disease':")
#print(unique_primary_disease)

# Display unique values in the 'Sample collection site' column
##unique_sample_collection_site = cell_line_annots['Sample collection site'].unique()
#print("\nUnique values in 'Sample collection site':")
#print(unique_sample_collection_site)

# Display unique values in the 'Disease subtype' column
##unique_disease_subtype = cell_line_annots['Disease subtype'].unique()
#print("\nUnique values in 'Disease subtype':")
#print(unique_disease_subtype)

#### use the above for options to customise cancer  type ####
# Keep annotation rows whose "Primary disease" is 'Head and Neck Cancer' and whose
# 'Disease subtype' contains 'Squamous Cell Carcinoma' (a missing subtype counts as no match).
annot_is_head_neck = cell_line_annots_trunc['Primary disease'].eq('Head and Neck Cancer')
annot_is_squamous = cell_line_annots_trunc['Disease subtype'].str.contains('Squamous Cell Carcinoma', na=False)
hnscc_ccle_annots = cell_line_annots_trunc.loc[annot_is_head_neck & annot_is_squamous]

hnscc_ccle_annots

##getting list of ccle hnscc with cisplatin ic50 values by making the cell line name columns compatible for merge function
##cisplatin ic50 values from secondary-screen-dose-response-curve-parameters.csv

import pandas as pd
# Read only the three columns used below and discard rows that have no ic50.
secdscreen = (
    pd.read_csv(
        '/content/drive/MyDrive/data/secondary-screen-dose-response-curve-parameters.csv',
        usecols=['ccle_name', 'ic50', 'name'],
    )
    .dropna(subset=['ic50'])
)

# Rows whose compound name contains 'cisplatin' (case-sensitive; missing names excluded).
# .copy() so that new columns can be added to the result later.
is_cisplatin = secdscreen['name'].str.contains('cisplatin', na=False)
cisplatin_secdscreen = secdscreen.loc[is_cisplatin].copy()

def safe_split(value):
    """Split ``str(value)`` at its first underscore.

    Always returns a two-item list ``[head, tail]``. ``head`` is the text before
    the first '_' and ``tail`` is everything after it. When the text contains no
    underscore, ``head`` is the whole text and ``tail`` is ``None``.
    """
    head, separator, tail = str(value).partition('_')
    # An empty separator means no underscore was found.
    return [head, tail if separator else None]

# Break every 'ccle_name' into a [head, tail] pair with safe_split
split_values = cisplatin_secdscreen['ccle_name'].apply(safe_split)

# head -> 'cell_line_name'; tail (text after the first '_', or None) -> 'cancer_type'
cisplatin_secdscreen['cell_line_name'] = split_values.map(lambda pair: pair[0])
cisplatin_secdscreen['cancer_type'] = split_values.map(lambda pair: pair[1])

# Give the annotation table the same key column name as the screen table
cell_line_annots_trunc.rename(columns={'Cell line name': 'cell_line_name'}, inplace=True)

# Left join: every cisplatin row is kept, with annotation columns filled where a name matches
cisplatin_secdscreen_filter = cisplatin_secdscreen.merge(
    cell_line_annots_trunc,
    on='cell_line_name',
    how='left',
)

# Distinct 'Primary disease' labels present after the merge
unique_primary_disease = cisplatin_secdscreen_filter['Primary disease'].unique()
print("Unique values in 'Primary disease':")
print(unique_primary_disease)

# Distinct 'Disease subtype' labels present after the merge
unique_disease_subtype = cisplatin_secdscreen_filter['Disease subtype'].unique()
print("\nUnique values in 'Disease subtype':")
print(unique_disease_subtype)

# Cisplatin rows annotated as Head and Neck Cancer with a subtype containing
# 'Squamous Cell Carcinoma' (a missing subtype counts as no match).
screen_is_head_neck = cisplatin_secdscreen_filter['Primary disease'].eq('Head and Neck Cancer')
screen_is_squamous = cisplatin_secdscreen_filter['Disease subtype'].str.contains('Squamous Cell Carcinoma', na=False)
secdscreen_hnscc = cisplatin_secdscreen_filter.loc[screen_is_head_neck & screen_is_squamous]

secdscreen_hnscc


In [None]:
##processing ccle 1019 rna expression data
## getting rna expression data with cisplatin ic50 values
# NOTE(review): `gtf_data` (must provide 'gene_id' and 'gene_name' columns) and
# `secdscreen_hnscc` are not created in this cell -- confirm the cells that define
# them run before this one.

import pandas as pd
file_path = "/content/drive/MyDrive/data/CCLE_RNAseq_rsem_genes_tpm_20180929.txt.gz"
chunk_list = []
# Rows read per chunk. The concatenated result does not depend on this value; the
# earlier value of 10 only multiplied the number of read/merge iterations.
chunk_size = 5000
for chunk in pd.read_csv(file_path, chunksize=chunk_size, compression='gzip', sep="\t"):
    chunk = chunk.drop(columns=["transcript_ids"], errors="ignore")
    # how="right" keeps every row of the expression chunk, matched in gtf_data or not
    merged_chunk = pd.merge(gtf_data, chunk, on="gene_id", how="right")
    # Put gene_name first so that it becomes row 0 after the transpose below
    merged_chunk = merged_chunk[["gene_name"] + [col for col in merged_chunk.columns if col != "gene_name"]]
    chunk_list.append(merged_chunk)

ccle_df = pd.concat(chunk_list, axis=0)
ccle_df = ccle_df.reset_index(drop=True)
# Transpose: the original columns (gene_name, gene_id, ..., one per sample) become rows
ccle_df = ccle_df.T
# Row 0 is the gene_name row: use its values as the column labels
ccle_df = ccle_df.rename(columns=ccle_df.iloc[0])
# The old column headers are now the index; expose them as a 'ccle_name' column.
# NOTE(review): assumes the sample column headers of the expression file are CCLE names -- confirm.
ccle_df = ccle_df.reset_index()
ccle_df = ccle_df.rename(columns={'index': 'ccle_name'})

# Keep only cell lines present in BOTH tables (expression and HNSCC cisplatin ic50).
# Previously this was an outer merge followed by a dropna() whose result was discarded,
# so every CCLE line, and the non-sample rows such as 'gene_name'/'gene_id' produced by
# the transpose, stayed in the result.
secdscreen_ccle_df = pd.merge(ccle_df, secdscreen_hnscc, on="ccle_name", how="inner")
secdscreen_ccle_df



In [None]:
#getting expression matrix to call out genes differentially expressed between sensitive vs resistant cell lines
# NOTE(review): this cell was marked "not completed yet --> key error". The cause of that
# error is not visible from this cell alone; re-test after the changes below.

# Columns of secdscreen_ccle_df that are not gene expression values.
# 'LNIC50' is listed but never read in this cell; the class below is derived from 'ic50'.
metadata_cols = ['ic50', 'name', 'cell_line_name', 'cancer_type', 'Primary disease',
                 'Disease subtype', 'Primary/metastasis', 'LNIC50']

# Label each cell line against the median ic50 (example threshold).
# Rows without an ic50 stay unlabelled (NaN): a direct comparison of NaN with the
# threshold is False, which used to place every such row in the 'resistant' class.
ic50_values = secdscreen_ccle_df['ic50']
threshold = ic50_values.median()  # Example: median as threshold
secdscreen_ccle_df['class'] = (
    ic50_values.le(threshold)
    .map({True: 'sensitive', False: 'resistant'})
    .where(ic50_values.notna())
)

# Only labelled cell lines can take part in the sensitive-vs-resistant comparison.
labelled_lines = secdscreen_ccle_df.loc[secdscreen_ccle_df['class'].notna()]

# Pick expression columns with a positional boolean mask rather than a list of labels:
# each column is taken exactly once even if a gene name occurs more than once, and the
# 'ccle_name' and 'class' columns are never mistaken for genes when the cell is re-run.
non_gene_cols = set(metadata_cols) | {'ccle_name', 'class'}
is_gene_col = [col not in non_gene_cols for col in labelled_lines.columns]
gene_expr_cols = [col for col, keep in zip(labelled_lines.columns, is_gene_col) if keep]

# Genes as rows, cell lines as columns
secdscreen_sample_expr = labelled_lines.loc[:, is_gene_col].T
secdscreen_sample_expr.columns = labelled_lines['cell_line_name']
secdscreen_sample_expr.index.names = ['gene']

# One row per cell line holding its class label, indexed like the expression columns
secdscreen_sample_data = labelled_lines[['cell_line_name', 'class']].set_index('cell_line_name')
secdscreen_sample_expr



In [None]:
#4 additional cell lines (BHY, BICR22, BICR31 and FADU) from gdsc dataframe that have available cisplatin ic50 values
#- need time to sort out this part of the code
#- one cell line from here overlaps with the above 14 cell lines