In [2]:
import numpy as np
from matplotlib import pyplot as plt
from numba import njit
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib as mpl
import pandas as pd

base_filepath = "/Users/44749/Documents/PhD_work/datasets_for_analysis"

In [2]:
#read in mutations and correct for clonality
df = pd.read_csv("C:/Users/44749/Documents/PhD_work/datasets_for_analysis/full_tx421primary_muttable_vaf_ccf_clonality.csv")

print("Table loaded")

filtered_df= df[df['PASS'] == True & (df['Timing_SC'] != "E")]

unique_tumours = filtered_df[['cruk_tumour_id']].drop_duplicates()



#translate table into dictionary

driver_mut_dict = {True:1, False:0}
all_tumour_names = []

for tumour in unique_tumours.values.tolist():
    tID = tumour[0]
    all_drivs_found = []
    clonal_appearances_of_drivers = {}
    all_tumour_names.append(tID)
    df_here = filtered_df[filtered_df['cruk_tumour_id'] == tID]
    unique_regions = df_here[['region']].drop_duplicates()
    #print(unique_regions)
    tumour_regions = [{} for m in range(len(unique_regions.values.tolist()))]
    #print(unique_regions)
    for n, region in enumerate(unique_regions.values.tolist()):
        regID = region[0]
        #print(tID, regID)
        relevant_muts = df_here[df_here['region']==regID]
        #print(len(relevant_muts))
        for index, row in relevant_muts.iterrows():
            mutID = str(row['chr']) + "." +str(row['start']) + "." + str(row['ref']) + "." + str(row['var']) + ":" + str(row['Hugo_Symbol']) + ":" + str(driver_mut_dict[row['DriverMut']])
            ccf = min(1, float(row['final_ccf'])) if row['cluster_clonality'] != 'clonal' else 1.0
            #print(tID, regID, mutID, ccf)
            #print(regID, ccf)
            if ccf > 0:
                tumour_regions[n][mutID] = ccf
                if row['DriverMut']:
                    all_drivs_found.append(row['Hugo_Symbol']) #add driver to list if it's found here, might be added more than once- record only the Hugo Symbol, NOT the specific mutation
                    #print(mutID)
                    if ccf == 1.0:
                        if row['Hugo_Symbol'] in clonal_appearances_of_drivers:
                            clonal_appearances_of_drivers[row['Hugo_Symbol']] += 1
                        else:
                            clonal_appearances_of_drivers[row['Hugo_Symbol']] = 1 #keep track of whether drivers are clonal
        # print(tumour_regions[n])
    np.save(tID+"_mutdict.npy", tumour_regions)

    #now check whether drivers are clonal or subclonal
    num_regions = len(unique_regions)
    all_drivs_found = np.unique(all_drivs_found)
    clonal_drivers = []
    subclonal_drivers = []
    
    for driv in all_drivs_found:
        if driv in clonal_appearances_of_drivers:
            print(driv, clonal_appearances_of_drivers[driv], num_regions)
            if clonal_appearances_of_drivers[driv] == num_regions: #found clonally everywhere
                clonal_drivers.append(driv)
        else:
            subclonal_drivers.append(driv)
    
    np.save(base_filepath+"/"+tID+"_driver_muts.npy", all_drivs_found)
    print(tID, "clonal drivers", clonal_drivers)
    np.save(base_filepath+"/"+tID+"_clonal_driver_muts.npy", clonal_drivers)
    np.save(base_filepath+"/"+tID+"_subclonal_driver_muts.npy", subclonal_drivers)

np.save("all_tumour_names.npy", all_tumour_names)
    

    

Table loaded
ARHGAP35 1 3
MGA 2 3
WRN 2 3
CRUK0001 clonal drivers []
EP300 1 3
MET 1 3
CRUK0002 clonal drivers []
PIK3CA 3 5
CRUK0003 clonal drivers []
SMAD4 1 4
CRUK0004 clonal drivers []
PASK 1 4
CRUK0005 clonal drivers []
FANCC 1 2
KEAP1 2 2
MAP3K1 1 2
PLCG2 2 2
PLXNB2 2 2
TP53 1 2
CRUK0006 clonal drivers ['KEAP1', 'PLCG2', 'PLXNB2']
U2AF1 2 2
CRUK0008 clonal drivers ['U2AF1']
ARHGAP35 1 4
KMT2C 3 4
CRUK0009 clonal drivers []
CHD8 2 2
CRUK0010 clonal drivers ['CHD8']
FLT4 1 3
KLF6 2 3
KRAS 2 3
NBN 4 3
CRUK0011 clonal drivers []
EGFR 2 2
CRUK0012 clonal drivers ['EGFR']
CRUK0013 clonal drivers []
KRAS 2 2
TP53 2 2
CRUK0014 clonal drivers ['KRAS', 'TP53']
BAP1 2 2
EGFR 1 2
CRUK0015 clonal drivers ['BAP1']
CBLB 2 2
FAT1 2 2
SPEN 2 2
CRUK0016 clonal drivers ['CBLB', 'FAT1', 'SPEN']


KeyboardInterrupt: 

In [3]:
tumour_names = np.load("all_tumour_names.npy", allow_pickle=True)
print(len(tumour_names))

402


In [None]:
#define a function that will do this for individual tumours

#calculate output summary statistics for a given number of samples, excluding clonal mutations
def calculate_output_stats_real_tumour(tumour_name):
    mutdict = np.load(tumour_name + "_mutdict.npy", allow_pickle=True)
    no_samples = len(mutdict)
    all_muts = []
    for dict in mutdict:
        all_muts += list(dict.keys()) #look at all mutations seen in this sample
    all_unique_mutations = np.unique(all_muts) #all mutations in all samples
    mut_matrix = np.zeros((len(all_unique_mutations), no_samples))
    for m, mut in enumerate(all_unique_mutations):
        for n, dict in enumerate(mutdict):
            if mut in dict:
                mut_matrix[m][n] = dict[mut] #zero by default
    #now proceed as with a simulated tumour, excluding clonal samples-- no read threshold
    min_ccfs = np.min(mut_matrix, axis=1)
    non_clonal_muts = np.where(min_ccfs < 1.0)[0] #not clonal everywhere
    mut_matrix = mut_matrix[non_clonal_muts] #filter out all of the clonal ones
    #now to actually calculate summary statistics!
    #0. all mutations detected subclonally (i.e. with CCF < 1.0 in at least one sample)
    no_muts = len(non_clonal_muts)
    #1. all mutations detected in 1 sample only
    appearances = np.count_nonzero(mut_matrix, axis=1)
    private = np.where(appearances == 1)[0]
    num_private = len(private)
    #2-no_samples, ... number fixed in 1, ... no_samples-1 samples
    fixed_muts = []
    for s in range(1, no_samples):
        fixed_muts.append(count_fixation_number(mut_matrix, s)) #count number fixed in s samples
    #no_samples+1: number of mutations fixed AND private
    private_ccfs = np.max(mut_matrix[private], axis=1) #the maximum CCF of a private mutation will be its CCF in that sample
    fixed_and_private = np.where(private_ccfs == 1.0)[0]
    num_fixed_private = len(fixed_and_private)
    #no_samples + 2-2*no_samples-1- the number of mutations appearing in 2, ... no_samples samples
    num_appearing = []
    for s in range(2, no_samples+1):
        num_appearing.append(len(np.where(appearances == s)[0]))
    return no_samples, [no_muts, num_private] + fixed_muts + [num_fixed_private] + num_appearing    
    

tIDs = np.load("all_tumour_names.npy", allow_pickle=True)

print(len(tIDs))

#assign tumours to a different bucket depending on the number of samples taken
stats_by_sample_number = [[] for s in range(2, 9)]
num_samples = {}
for tumour_name in tIDs:
    try:
        no_samples, stats = calculate_output_stats_real_tumour(tumour_name)
        if no_samples > 1:
            num_samples[tumour_name] = no_samples
            stats_by_sample_number[no_samples-2].append(stats)
            np.save(base_filepath+"/"+tumour_name+"_ss_4424.npy", stats)
            print(tumour_name, "found")
        else:
            print(tumour_name, "too few samples")
            
    except:
        print(tumour_name, "not found")

#print(overall_stats)

for n, stat_matrix in enumerate(stats_by_sample_number):
    np.save("cruk_"+str(n+2)+"_samples_stat.npy", stat_matrix)

np.save("num_samples_by_tumour_name.npy", num_samples)
#np.save("stat_index_by_tumour_name.npy", stat_index_dict)
#np.save("cruk_overall_stats.npy", overall_stats)



In [3]:
df = pd.read_excel("C:/Users/44749/Documents/PhD_work/datasets_for_analysis/tracerx_clinical_df.xlsx")
purity_df = pd.read_excel("C:/Users/44749/Documents/PhD_work/datasets_for_analysis/tx421_purity_ploidy.xlsx")
patient_df = pd.read_excel("C:/Users/44749/Documents/PhD_work/datasets_for_analysis/tracerx_patient_df.xlsx")

sex_codes = {'Male':0, 'Female':1}
histology_codes = {'LUAD':0, 'LUSC':1, 'Other':2}
smok_stat_codes = {'Smoker':0, 'Ex-Smoker':1, 'Never Smoked':2}

num_samples = np.load("num_samples_by_tumour_name.npy", allow_pickle=True).item()


patient_index_dict = {} #record where all patients are in this list
pcount = 0

clinical_dfs = []

unique_drivs = []
for name in list(num_samples.keys()):
    #load drivers
    drivers_here = np.load(base_filepath+"/"+name+"_driver_muts.npy", allow_pickle=True)
    #print(drivers_here)
    unique_drivs += list(drivers_here)

all_drivs = np.unique(unique_drivs)
#assign each an index
driv_index_dict = dict(zip(list(all_drivs), list(range(len(all_drivs)))))
np.save("all_driv_identities.npy", all_drivs)

num_drivs = len(all_drivs)
driver_matrix = []
    

for name in list(num_samples.keys()):
    try:
        drivers_here = np.load(base_filepath+"/"+name+"_driver_muts.npy", allow_pickle=True)
        clonal_drivers_here = np.load(base_filepath+"/"+name+"_clonal_driver_muts.npy", allow_pickle=True)
        subclonal_drivers_here = np.load(base_filepath+"/"+name+"_subclonal_driver_muts.npy", allow_pickle=True)
        #print(name, "clonal driver check", clonal_drivers_here, len(clonal_drivers_here))
        
        age = df[df['tumour_id_muttable_cruk']==name]['age']
        #print(name, age)
        #print(age.iloc[0]
        age = age.iloc[0]
    
        sex = df[df['tumour_id_muttable_cruk']==name]['clinical_sex']
        sex = str(sex.iloc[0])
        bin_sex = sex_codes[sex]
        #print(bin_sex)
        clinical_sex = bin_sex
    
        hist = str(df[df['tumour_id_muttable_cruk']==name]['histology_3'].iloc[0])
        int_hist = histology_codes[hist]
        histology = int_hist
    
        smok_stat = str(df[df['tumour_id_muttable_cruk']==name]['smoking_status_merged'].iloc[0])
        smoking_status = smok_stat_codes[smok_stat]

        pyears = df[df['tumour_id_muttable_cruk']==name]['pack_years']
        pack_years= int(pyears.iloc[0])

        #calculate average/minimum purity of all regions in the tumour
        av_pur = np.average(purity_df[purity_df['tumour_id']==name]['Purity'])
        min_pur = np.min(purity_df[purity_df['tumour_id']==name]['Purity'])

        #take disease free survival time

        dfs = patient_df[patient_df['tumour_id_muttable_cruk']==name]['dfs_time']
        cens_dfs = patient_df[patient_df['tumour_id_muttable_cruk']==name]['cens_dfs']
        dfs_any_event = patient_df[patient_df['tumour_id_muttable_cruk']==name]['dfs_time_any_event']
        cens_dfs_any_event = patient_df[patient_df['tumour_id_muttable_cruk']==name]['cens_dfs_any_event']
        dfs = float(dfs.iloc[0])
        cens_dfs = float(cens_dfs.iloc[0])
        dfs_any_event = float(dfs_any_event.iloc[0])
        cens_dfs_any_event = float(cens_dfs_any_event.iloc[0])
    
        patient_index_dict[name] = pcount
    
    
        clinical_dfs.append([age, clinical_sex, histology, smoking_status, pack_years, len(clonal_drivers_here), len(subclonal_drivers_here), av_pur, min_pur, dfs, cens_dfs, dfs_any_event, cens_dfs_any_event])
        
        driver_matrix_here = np.zeros(num_drivs, dtype=int)
        for driv in list(drivers_here):
            index = driv_index_dict[driv]
            driver_matrix_here[index] = 1 #mark the presence of all detected drivers
    
        driver_matrix.append(driver_matrix_here)

        

        

        pcount += 1
        

    except:
        print(name)

np.save("patient_index_dict.npy", patient_index_dict)
np.save("clinical_var_matrix.npy", clinical_dfs)
np.save("clinical_driver_matrix.npy", driver_matrix)



CRUK0030_Cluster2
CRUK0223_Cluster1
CRUK0223_Cluster2
CRUK0372_Cluster1
CRUK0372_Cluster2
CRUK0555_Cluster2
CRUK0586_Cluster1
CRUK0586_Cluster2
CRUK0620_Cluster1
CRUK0620_Cluster2
CRUK0704_Cluster1
CRUK0704_Cluster2
CRUK0704_Cluster3
CRUK0721_Cluster2
CRUK0881_Cluster1
CRUK0881_Cluster2


In [None]:
p