In [1]:
import pandas as pd
import numpy as np
import os




In [2]:
# Where the FPKM expression table lives on disk.
folder_path = '../../Data/sclc_ucologne_2015/unused_data'
file_name = 'data_mrna_seq_fpkm.txt'
file_path = '/'.join([folder_path, file_name])

# Read the tab-separated RNA-seq table; the first column of the file is used
# as the row index.
data = pd.read_csv(file_path, index_col=0, sep='\t')

# Preview the first rows. Apart from 'Entrez_Gene_Id', every column is one
# patient ID.
data.head()

Unnamed: 0_level_0,Entrez_Gene_Id,sclc_ucologne_2015_S00022,sclc_ucologne_2015_S00035,sclc_ucologne_2015_S00050,sclc_ucologne_2015_S00213,sclc_ucologne_2015_S00356,sclc_ucologne_2015_S00472,sclc_ucologne_2015_S00501,sclc_ucologne_2015_S00825,sclc_ucologne_2015_S00827,...,sclc_ucologne_2015_S02351,sclc_ucologne_2015_S02352,sclc_ucologne_2015_S02353,sclc_ucologne_2015_S02354,sclc_ucologne_2015_S02360,sclc_ucologne_2015_S02375,sclc_ucologne_2015_S02376,sclc_ucologne_2015_S02378,sclc_ucologne_2015_S02382,sclc_ucologne_2015_S02397
Hugo_Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,,14.0824,5.46565,3.70024,5.69252,4.90083,6.65977,4.99368,6.22679,4.15124,...,5.60387,5.33542,9.83929,6.43561,6.09864,1.86425,1.32476,2.92593,1.77078,1.9393
A1CF,,0.010387,0.005099,0.002786,0.00199,0.0,0.0,0.0,0.0,0.002411,...,0.0,0.010367,0.008859,0.02677,0.0,0.0,0.012848,0.010893,0.004062,0.020594
A1CF,,0.000402,0.000422,0.000231,0.000331,0.0,0.0,0.0,0.0,0.0002,...,0.0,0.000395,0.000481,0.000502,0.0,0.0,0.000394,0.000417,0.000335,0.000395
A1CF,,0.0,0.0,0.0,2e-06,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.019422
A1CF,,0.0,0.0,0.0,2e-06,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Step 1: remove genes whose expression is zero in every patient sample.
#
# `data` is indexed by Hugo_Symbol, so its first column is 'Entrez_Gene_Id'
# and every remaining column is a sample. The sample columns are selected by
# name rather than by position: the earlier positional slice `iloc[:, 2:]`
# skipped the first sample as well, so a gene expressed only in that sample
# was wrongly discarded.
sample_values = data.drop(columns='Entrez_Gene_Id')
is_all_zero = sample_values.eq(0).all(axis=1)
df_filtered = data.loc[~is_all_zero]

# Step 2: collapse duplicated gene symbols by averaging their FPKM values.
# Grouping on the 'Hugo_Symbol' index level and resetting the index turns the
# symbol into an ordinary first column of the result.
df_filtered = df_filtered.groupby('Hugo_Symbol').mean().reset_index()

# Report the table dimensions before and after filtering/de-duplication.
original_shape = data.shape
filtered_shape = df_filtered.shape

original_shape, filtered_shape


((31640, 82), (18692, 83))

In [4]:
# Columns 0 and 1 of df_filtered are 'Hugo_Symbol' and 'Entrez_Gene_Id';
# everything from the third column onward is per-sample FPKM.
fpkm_values = df_filtered.iloc[:, 2:]

# Per-sample FPKM totals (one value per sample column).
sum_fpkm_per_sample = fpkm_values.sum()

# FPKM -> TPM: scale each sample so that its values sum to one million,
# i.e. TPM = FPKM / sum(FPKM of that sample) * 10^6.
df_tpm = df_filtered.copy()
df_tpm.iloc[:, 2:] = (fpkm_values / sum_fpkm_per_sample) * 10**6

df_tpm.head()

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,sclc_ucologne_2015_S00022,sclc_ucologne_2015_S00035,sclc_ucologne_2015_S00050,sclc_ucologne_2015_S00213,sclc_ucologne_2015_S00356,sclc_ucologne_2015_S00472,sclc_ucologne_2015_S00501,sclc_ucologne_2015_S00825,...,sclc_ucologne_2015_S02351,sclc_ucologne_2015_S02352,sclc_ucologne_2015_S02353,sclc_ucologne_2015_S02354,sclc_ucologne_2015_S02360,sclc_ucologne_2015_S02375,sclc_ucologne_2015_S02376,sclc_ucologne_2015_S02378,sclc_ucologne_2015_S02382,sclc_ucologne_2015_S02397
0,A1BG,,49.841299,15.273086,14.565863,17.514792,17.393426,25.719279,16.251096,19.708647,...,22.752054,24.284115,57.793733,30.590474,20.987337,4.758543,3.586691,10.495289,7.68113,7.475808
1,A1CF,,0.009546,0.003857,0.002969,0.001788,0.0,0.0,0.0,0.0,...,0.0,0.012246,0.013715,0.032408,0.0,0.0,0.008963,0.010142,0.004768,0.038945
2,A2LD1,,2.319991,1.953781,27.163376,1.837371,2.598952,4.515009,5.277865,4.604362,...,9.079738,5.421795,6.572453,4.581608,11.848325,2.395508,4.724555,2.982294,12.265743,4.752917
3,A2M,,99.542444,110.845385,179.958226,265.717774,446.910492,201.975527,334.034463,885.358605,...,133.800935,359.908678,282.70752,272.904387,105.083973,146.639685,110.043109,590.766883,318.268637,299.248969
4,A2ML1,,0.0,0.0,0.0,0.0,0.0,0.02981,0.0,0.0,...,0.0,0.078313,0.0,0.0,0.0,0.0,0.0,0.036759,0.0,0.0


In [5]:
# Remove the 'Entrez_Gene_Id' column and relabel the gene-symbol column as
# 'PATIENT_ID'. The following cells transpose the table and promote this
# column to the header row, so that label ends up as the name of the
# column axis.
df_tpm = (
    df_tpm
    .drop(columns='Entrez_Gene_Id')
    .rename(columns={'Hugo_Symbol': 'PATIENT_ID'})
)

In [6]:
# Flip the table so that each row is a patient sample and each column is one
# gene entry (the 'PATIENT_ID' row still carries the gene symbols here).
df_tpm = df_tpm.transpose()
df_tpm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18682,18683,18684,18685,18686,18687,18688,18689,18690,18691
PATIENT_ID,A1BG,A1CF,A2LD1,A2M,A2ML1,A4GALT,A4GNT,AAAS,AACS,AADAC,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
sclc_ucologne_2015_S00022,49.841299,0.009546,2.319991,99.542444,0.0,22.210035,0.107257,46.724696,319.758849,0.0,...,29.22103,130.85972,5.270138,9.826973,46.252081,2.238622,32.125702,14.753678,22.122403,73.809888
sclc_ucologne_2015_S00035,15.273086,0.003857,1.953781,110.845385,0.0,27.477135,0.088028,25.159499,99.545763,0.245483,...,30.892954,208.697196,5.149757,11.938863,12.83583,3.255952,19.19287,39.743821,6.103813,36.550729
sclc_ucologne_2015_S00050,14.565863,0.002969,27.163376,179.958226,0.0,15.833247,0.033027,29.036401,115.091591,0.0,...,70.625289,82.100384,6.213945,13.321429,35.216536,3.642674,42.220943,37.605044,17.160505,37.411764
sclc_ucologne_2015_S00213,17.514792,0.001788,1.837371,265.717774,0.0,15.509113,0.040869,21.387291,35.80051,0.547173,...,26.895888,71.968483,8.997664,24.542807,15.999187,1.208275,21.599901,36.54079,11.072079,47.033629


In [7]:
# Promote the first row (labelled 'PATIENT_ID', holding the gene symbols) to
# be the column header.
df_tpm.columns = df_tpm.iloc[0]

# Remove that label row from the data. The frame was transposed while it
# still contained the string gene-symbol column, which leaves every value
# stored as a generic Python object; cast back to float so the expression
# values are numeric again for downstream computation.
df_tpm = df_tpm.drop('PATIENT_ID').astype(float)

# Show the resulting gene-symbol header, then preview the patient rows.
print(df_tpm.columns)
df_tpm.head()

Index(['A1BG', 'A1CF', 'A2LD1', 'A2M', 'A2ML1', 'A4GALT', 'A4GNT', 'AAAS',
       'AACS', 'AADAC',
       ...
       'ZWILCH', 'ZWINT', 'ZXDA', 'ZXDB', 'ZXDC', 'ZYG11A', 'ZYG11B', 'ZYX',
       'ZZEF1', 'ZZZ3'],
      dtype='object', name='PATIENT_ID', length=18692)


PATIENT_ID,A1BG,A1CF,A2LD1,A2M,A2ML1,A4GALT,A4GNT,AAAS,AACS,AADAC,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
sclc_ucologne_2015_S00022,49.841299,0.009546,2.319991,99.542444,0.0,22.210035,0.107257,46.724696,319.758849,0.0,...,29.22103,130.85972,5.270138,9.826973,46.252081,2.238622,32.125702,14.753678,22.122403,73.809888
sclc_ucologne_2015_S00035,15.273086,0.003857,1.953781,110.845385,0.0,27.477135,0.088028,25.159499,99.545763,0.245483,...,30.892954,208.697196,5.149757,11.938863,12.83583,3.255952,19.19287,39.743821,6.103813,36.550729
sclc_ucologne_2015_S00050,14.565863,0.002969,27.163376,179.958226,0.0,15.833247,0.033027,29.036401,115.091591,0.0,...,70.625289,82.100384,6.213945,13.321429,35.216536,3.642674,42.220943,37.605044,17.160505,37.411764
sclc_ucologne_2015_S00213,17.514792,0.001788,1.837371,265.717774,0.0,15.509113,0.040869,21.387291,35.80051,0.547173,...,26.895888,71.968483,8.997664,24.542807,15.999187,1.208275,21.599901,36.54079,11.072079,47.033629
sclc_ucologne_2015_S00356,17.393426,0.0,2.598952,446.910492,0.0,7.808751,0.0,49.961838,45.578319,4.338144,...,75.27487,102.291016,7.584379,13.0649,24.19211,1.89158,26.275205,63.737706,16.234581,38.681041


In [8]:
# Save the full TPM matrix to a CSV file (currently disabled)
#df_tpm.to_csv('../../Data/sclc_ucologne_2015/data_mrna_seq_tpm.csv', index=True)

In [9]:
# Restrict the expression matrix to a fixed panel of 20 genes.
# Column selection raises a KeyError if any listed gene is absent.
selected_genes = [
    'TP53', 'RB1', 'TTN', 'RYR2', 'LRP1B',
    'MUC16', 'ZFHX4', 'USH2A', 'CSMD3', 'NAV3',
    'PCDH15', 'COL11A1', 'CSMD1', 'SYNE1', 'EYS',
    'MUC17', 'ANKRD30B', 'FAM135B', 'FSIP2', 'TMEM132D',
]
df_tpm = df_tpm.loc[:, selected_genes]
df_tpm.head()

PATIENT_ID,TP53,RB1,TTN,RYR2,LRP1B,MUC16,ZFHX4,USH2A,CSMD3,NAV3,PCDH15,COL11A1,CSMD1,SYNE1,EYS,MUC17,ANKRD30B,FAM135B,FSIP2,TMEM132D
sclc_ucologne_2015_S00022,15.77225,9.717008,0.045309,0.096664,0.047844,0.009974,19.107979,0.560155,0.006984,1.150887,0.001291,0.307634,0.215127,1.121043,0.197401,0.006134,0.058454,2.581555,2.778639,34.590621
sclc_ucologne_2015_S00035,8.229926,16.855989,0.102478,0.117015,0.085796,0.016076,1.583238,0.10857,0.262291,0.307544,0.004182,2.610052,3.225018,1.032223,5.438464,0.0,0.0,0.04136,0.043687,0.012527
sclc_ucologne_2015_S00050,11.35166,9.211642,0.051734,1.076788,0.758041,0.007456,0.934395,0.351121,0.050721,5.948195,0.017009,1.057358,3.183812,4.717105,0.465458,0.030543,0.628858,1.755718,0.012975,107.023807
sclc_ucologne_2015_S00213,2.868334,32.789853,0.028835,34.800547,0.039783,0.014907,0.972539,0.015912,0.0,0.448217,0.001795,8.01005,0.036811,1.102283,0.047909,0.0,0.0,0.00959,0.074788,3.34686
sclc_ucologne_2015_S00356,20.423414,66.005034,0.141325,0.331012,0.021003,0.0063,11.174626,0.050298,0.084654,4.321179,0.019437,0.602912,1.073575,2.239903,0.046164,0.0,0.0,0.394423,1.336221,5.193365


In [None]:
# Write the current df_tpm (patients as rows, the row index included as the
# first CSV column) to disk.
# NOTE(review): the row index appears to be unnamed at this point
# ('PATIENT_ID' is the name of the column axis), so the header of the first
# CSV column is presumably written blank -- confirm whether
# index_label='PATIENT_ID' is wanted.
small_tpm_csv_path = '../../Data/sclc_ucologne_2015/data_mrna_seq_tpm_small.csv'
df_tpm.to_csv(small_tpm_csv_path, index=True)

In [None]:
# smaller data set for testing
#df_tpm.iloc[:1000, :].to_csv('../../Data/sclc_ucologne_2015/data_mrna_seq_tpm_small.tsv', index=False)


In [None]:
# Load the per-patient clinical table; it is merged with the expression data
# in the next cell. The file is tab-separated despite its .csv extension, and
# its first column is used as the row index.
#
# The path is given relative to the notebook, like every other data path in
# this notebook, instead of an absolute path into one user's home directory,
# so the notebook also runs on other machines and checkouts.
clinical_data = pd.read_csv(
    '../../Data/sclc_ucologne_2015/data_clinical_patient.csv',
    sep='\t',
    index_col=0,
)
clinical_data.head()

Unnamed: 0_level_0,gender,ethnicity,race,year_of_diagnosis,year_of_birth,overall_survival,vital_status,disease_specific_survival,primary_site
PATIENT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
sclc_ucologne_2015_S00022,Male,,,47.0,1968.0,38.0,1:DECEASED,38.0,Bronchus and lung
sclc_ucologne_2015_S00035,Female,,,65.0,1950.0,12.0,1:DECEASED,12.0,Bronchus and lung
sclc_ucologne_2015_S00050,Male,,,47.0,1968.0,42.0,1:DECEASED,42.0,Bronchus and lung
sclc_ucologne_2015_S00213,Male,,,65.0,1950.0,13.0,1:DECEASED,13.0,Bronchus and lung
sclc_ucologne_2015_S00356,Female,,,54.0,1961.0,33.0,1:DECEASED,33.0,Bronchus and lung


In [81]:
# Combine expression and clinical annotations by matching the row indexes of
# both tables. This is an inner join, so only row labels present in both
# tables are kept.
merged_data = df_tpm.merge(
    clinical_data,
    how='inner',
    left_index=True,
    right_index=True,
)
merged_data.head()

Unnamed: 0,A1BG,A1CF,A2LD1,A2M,A2ML1,A4GALT,A4GNT,AAAS,AACS,AADAC,...,ZZZ3,gender,ethnicity,race,year_of_diagnosis,year_of_birth,overall_survival,vital_status,disease_specific_survival,primary_site
sclc_ucologne_2015_S00022,49.841299,0.009546,2.319991,99.542444,0.0,22.210035,0.107257,46.724696,319.758849,0.0,...,73.809888,Male,,,47.0,1968.0,38.0,1:DECEASED,38.0,Bronchus and lung
sclc_ucologne_2015_S00035,15.273086,0.003857,1.953781,110.845385,0.0,27.477135,0.088028,25.159499,99.545763,0.245483,...,36.550729,Female,,,65.0,1950.0,12.0,1:DECEASED,12.0,Bronchus and lung
sclc_ucologne_2015_S00050,14.565863,0.002969,27.163376,179.958226,0.0,15.833247,0.033027,29.036401,115.091591,0.0,...,37.411764,Male,,,47.0,1968.0,42.0,1:DECEASED,42.0,Bronchus and lung
sclc_ucologne_2015_S00213,17.514792,0.001788,1.837371,265.717774,0.0,15.509113,0.040869,21.387291,35.80051,0.547173,...,47.033629,Male,,,65.0,1950.0,13.0,1:DECEASED,13.0,Bronchus and lung
sclc_ucologne_2015_S00356,17.393426,0.0,2.598952,446.910492,0.0,7.808751,0.0,49.961838,45.578319,4.338144,...,38.681041,Female,,,54.0,1961.0,33.0,1:DECEASED,33.0,Bronchus and lung
