In [1]:
import pandas as pd
import numpy as np
import os




In [2]:
# Location of the raw FPKM expression table (a tab-separated text file).
folder_path = '../../Data/sclc_ucologne_2015/unused_data'
file_name = 'data_mrna_seq_fpkm.txt'
file_path = f'{folder_path}/{file_name}'

# Read the table, using its first column as the row index.
data = pd.read_csv(file_path, sep='\t', index_col=0)

# Preview the first rows; apart from 'Entrez_Gene_Id', every column is one
# patient sample ID.
data.head()

Unnamed: 0_level_0,Entrez_Gene_Id,sclc_ucologne_2015_S00022,sclc_ucologne_2015_S00035,sclc_ucologne_2015_S00050,sclc_ucologne_2015_S00213,sclc_ucologne_2015_S00356,sclc_ucologne_2015_S00472,sclc_ucologne_2015_S00501,sclc_ucologne_2015_S00825,sclc_ucologne_2015_S00827,...,sclc_ucologne_2015_S02351,sclc_ucologne_2015_S02352,sclc_ucologne_2015_S02353,sclc_ucologne_2015_S02354,sclc_ucologne_2015_S02360,sclc_ucologne_2015_S02375,sclc_ucologne_2015_S02376,sclc_ucologne_2015_S02378,sclc_ucologne_2015_S02382,sclc_ucologne_2015_S02397
Hugo_Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,,14.0824,5.46565,3.70024,5.69252,4.90083,6.65977,4.99368,6.22679,4.15124,...,5.60387,5.33542,9.83929,6.43561,6.09864,1.86425,1.32476,2.92593,1.77078,1.9393
A1CF,,0.010387,0.005099,0.002786,0.00199,0.0,0.0,0.0,0.0,0.002411,...,0.0,0.010367,0.008859,0.02677,0.0,0.0,0.012848,0.010893,0.004062,0.020594
A1CF,,0.000402,0.000422,0.000231,0.000331,0.0,0.0,0.0,0.0,0.0002,...,0.0,0.000395,0.000481,0.000502,0.0,0.0,0.000394,0.000417,0.000335,0.000395
A1CF,,0.0,0.0,0.0,2e-06,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.019422
A1CF,,0.0,0.0,0.0,2e-06,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Step 1: drop genes whose expression is zero in every patient sample.
# `data` is indexed by Hugo_Symbol, so its columns are 'Entrez_Gene_Id' followed
# by one column per sample. The sample columns are selected by name rather than
# by position: a positional slice starting at 2 skips the first sample column
# as well as 'Entrez_Gene_Id', which would leave that sample out of the
# all-zero check and wrongly discard genes expressed only in it.
sample_values = data.drop(columns='Entrez_Gene_Id')
is_all_zero = sample_values.eq(0).all(axis=1)
df_filtered = data.loc[~is_all_zero]

# Step 2: collapse duplicated gene symbols by averaging their FPKM values.
# Grouping on the index level 'Hugo_Symbol' and resetting the index turns the
# symbol into a regular first column, ahead of 'Entrez_Gene_Id' and the samples.
df_filtered = df_filtered.groupby('Hugo_Symbol').mean().reset_index()

# Report (rows, columns) before and after filtering / de-duplication.
original_shape = data.shape
filtered_shape = df_filtered.shape

original_shape, filtered_shape


((31640, 82), (18692, 83))

In [4]:
# Per-sample FPKM totals. Columns 0 and 1 of df_filtered are 'Hugo_Symbol' and
# 'Entrez_Gene_Id', so the sample columns begin at position 2.
sum_fpkm_per_sample = df_filtered.iloc[:, 2:].sum()

# FPKM -> TPM: rescale every sample so that its values sum to one million,
# i.e. TPM = FPKM / (total FPKM of that sample) * 10^6.
df_tpm = df_filtered.copy()
fpkm_fraction = df_filtered.iloc[:, 2:].div(sum_fpkm_per_sample, axis='columns')
df_tpm.iloc[:, 2:] = fpkm_fraction * 10**6

df_tpm.head()

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,sclc_ucologne_2015_S00022,sclc_ucologne_2015_S00035,sclc_ucologne_2015_S00050,sclc_ucologne_2015_S00213,sclc_ucologne_2015_S00356,sclc_ucologne_2015_S00472,sclc_ucologne_2015_S00501,sclc_ucologne_2015_S00825,...,sclc_ucologne_2015_S02351,sclc_ucologne_2015_S02352,sclc_ucologne_2015_S02353,sclc_ucologne_2015_S02354,sclc_ucologne_2015_S02360,sclc_ucologne_2015_S02375,sclc_ucologne_2015_S02376,sclc_ucologne_2015_S02378,sclc_ucologne_2015_S02382,sclc_ucologne_2015_S02397
0,A1BG,,49.841299,15.273086,14.565863,17.514792,17.393426,25.719279,16.251096,19.708647,...,22.752054,24.284115,57.793733,30.590474,20.987337,4.758543,3.586691,10.495289,7.68113,7.475808
1,A1CF,,0.009546,0.003857,0.002969,0.001788,0.0,0.0,0.0,0.0,...,0.0,0.012246,0.013715,0.032408,0.0,0.0,0.008963,0.010142,0.004768,0.038945
2,A2LD1,,2.319991,1.953781,27.163376,1.837371,2.598952,4.515009,5.277865,4.604362,...,9.079738,5.421795,6.572453,4.581608,11.848325,2.395508,4.724555,2.982294,12.265743,4.752917
3,A2M,,99.542444,110.845385,179.958226,265.717774,446.910492,201.975527,334.034463,885.358605,...,133.800935,359.908678,282.70752,272.904387,105.083973,146.639685,110.043109,590.766883,318.268637,299.248969
4,A2ML1,,0.0,0.0,0.0,0.0,0.0,0.02981,0.0,0.0,...,0.0,0.078313,0.0,0.0,0.0,0.0,0.0,0.036759,0.0,0.0


In [5]:
# Remove the 'Entrez_Gene_Id' column and expose the gene symbol column
# ('Hugo_Symbol') under the name 'gene_id'.
df_tpm = (
    df_tpm
    .drop(columns='Entrez_Gene_Id')
    .rename(columns={'Hugo_Symbol': 'gene_id'})
)

In [6]:
# Write the TPM table to disk without the positional row index.
# NOTE(review): the target has a .tsv extension, but to_csv is called without
# `sep`, so the file is comma-delimited. Confirm what downstream readers
# expect before changing the delimiter or the file name.
tpm_output_path = '../../Data/sclc_ucologne_2015/data_mrna_seq_tpm.tsv'
df_tpm.to_csv(tpm_output_path, index=False)

In [None]:
# smaller data set for testing
#df_tpm.iloc[:1000, :].to_csv('../../Data/sclc_ucologne_2015/data_mrna_seq_tpm_small.tsv', index=False)
