In [1]:
import os 
import numpy as np
import pandas as pd

This notebook shows how the data are preprocessed.

We convert the values to float type, filter out some clinical and histology features, and keep only those that are needed in the dataset.

Some additional preprocessing is done in the actual pipeline.

## Preparing Genetic data

In [7]:
# Raw genetic table: semicolon-separated, first column used as the row index.
# NOTE(review): this is an absolute, machine-specific path; other cells in this
# notebook read their inputs from '../Data/' — consider doing the same here.
file_path = 'C://Users//inigo//Desktop//AAU//Cursos//4th semester//P10//Data//GeneticData.csv'

try:
    df = pd.read_csv(file_path, sep = ";", index_col = 0)
except FileNotFoundError:
    print("File not found. Please check the file path.")  # Handle file not found error
    # Re-raise so later cells cannot silently run on a missing or stale `df`.
    raise
else:
    print("CSV file successfully read.")  # Indicate successful reading

# `df` now holds the raw genetic table used by the following cells

CSV file successfully read.


In [8]:
# Show the raw genetic table as read from GeneticData.csv (before any conversion).
df

Unnamed: 0_level_0,X00936b9285d6b8665ae9122993fb8e91,X105622fadc33f23755ac2df823110aca,Xe44f39747a8e84b02b4cb24659312144,X293dd1284496215e9a0eca9f17a98e7e,X01ed7190ce00862696edbf047b542045,Xf4a6a5a1450a8448882e1c0aebc0eea5,Xf7ee4a331a8fd70edc240dff978c170d,Xcddbd4d01bc7513298a5135e97cfdb6b,Xa57451efa2822becb256706fe939d078,Xff8fbe2fe76c82d429d4fa1d315f3b93,...,Xa7d2bd80340b4019986ea8d8c3638c75,Xbc7a029dc7ce4b19a5a2b78a7b2fc6ae,X13b911b56a330efd2660b67b1cf98466,X9388bd484d0e404c6732573b4228d2a2,Xc3d410d70dd7359baa40126494fb6765,X50772aa64efb859960b20f8801cd6f58,X91bcd3067a1a7954692d836515e04869,Xc7439a06ffa32b313b0ec1b987b992a2,X21a6043653d187f8bbead475d2f49791,Xa021f5de25a2ffa059870f059a65d075
HUGO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,291,961,275,217,144,152,211,215,226,241,...,001,129,276,113,08,088,133,174,18,212
A1BG-AS1,241,071,248,069,183,001,216,131,221,209,...,001,001,181,046,001,016,116,056,155,105
A1CF,471,599,146,001,235,001,001,001,153,001,...,487,332,278,448,513,416,001,12,314,341
A2M,1078,112,1118,1089,1011,818,1174,865,977,1049,...,891,915,995,837,752,1006,996,896,1025,105
A2M-AS1,202,045,2,157,233,293,244,12,186,236,...,188,14,258,189,271,169,164,064,169,116
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A,001,132,001,043,001,345,001,005,149,041,...,419,254,229,001,34,33,001,021,001,062
ZYG11B,412,436,431,416,445,376,455,328,42,4,...,425,367,4,361,38,457,406,434,439,396
ZYX,823,766,911,866,828,866,909,913,826,844,...,789,825,897,907,753,823,817,793,907,906
ZZEF1,545,539,578,604,556,538,611,511,583,599,...,567,496,577,497,557,551,581,51,592,54


In [9]:
# Swap rows and columns, turn decimal commas into decimal points, and parse
# every cell as float — all in one chained expression.
df_transposed = (
    df.T
    .replace(',', '.', regex=True)
    .astype(float)
)
df_transposed

HUGO,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A3GALT2,A4GALT,A4GNT,AAAS,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
X00936b9285d6b8665ae9122993fb8e91,2.91,2.41,4.71,10.78,2.02,0.01,0.01,7.18,0.01,6.22,...,3.18,3.96,3.82,3.83,2.86,0.01,4.12,8.23,5.45,5.26
X105622fadc33f23755ac2df823110aca,9.61,0.71,5.99,11.20,0.45,0.01,0.70,3.93,0.01,5.67,...,2.62,2.59,3.64,4.10,2.86,1.32,4.36,7.66,5.39,5.03
Xe44f39747a8e84b02b4cb24659312144,2.75,2.48,1.46,11.18,2.00,0.01,1.37,4.95,0.01,6.32,...,3.07,2.89,4.18,3.87,3.59,0.01,4.31,9.11,5.78,5.26
X293dd1284496215e9a0eca9f17a98e7e,2.17,0.69,0.01,10.89,1.57,0.01,0.49,5.14,1.62,6.36,...,3.04,3.18,4.40,3.93,3.92,0.43,4.16,8.66,6.04,5.13
X01ed7190ce00862696edbf047b542045,1.44,1.83,2.35,10.11,2.33,0.01,0.27,6.05,0.01,6.49,...,2.43,2.51,4.19,3.89,3.34,0.01,4.45,8.28,5.56,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X50772aa64efb859960b20f8801cd6f58,0.88,0.16,4.16,10.06,1.69,0.01,0.01,6.18,0.01,5.91,...,3.06,2.84,4.19,3.68,3.26,3.30,4.57,8.23,5.51,5.66
X91bcd3067a1a7954692d836515e04869,1.33,1.16,0.01,9.96,1.64,0.01,1.35,5.57,0.01,6.14,...,4.06,4.12,3.38,3.47,3.60,0.01,4.06,8.17,5.81,5.26
Xc7439a06ffa32b313b0ec1b987b992a2,1.74,0.56,1.20,8.96,0.64,0.01,0.01,5.98,0.01,6.16,...,3.01,3.77,4.44,4.30,2.92,0.21,4.34,7.93,5.10,5.02
X21a6043653d187f8bbead475d2f49791,1.80,1.55,3.14,10.25,1.69,0.01,3.23,6.59,0.01,6.18,...,3.82,4.24,4.14,3.52,3.06,0.01,4.39,9.07,5.92,5.93


##### Saving

In [10]:
# Write the transposed float table to the working directory, keeping the index
# as the first CSV column.
df_transposed.to_csv('output_GeneticData.csv', index=True)

##### Loading

In [11]:
# Reload the genetic table saved by this notebook (comma-separated, in the
# current working directory); the first column is used as the row index.
file_path = 'output_GeneticData.csv'

try:
    df_genetic = pd.read_csv(file_path, sep = ",", index_col = 0)
except FileNotFoundError:
    print("File not found. Please check the file path.")  # Handle file not found error
    # Re-raise so later cells cannot silently run on a missing or stale `df_genetic`.
    raise
else:
    print("CSV file successfully read.")  # Indicate successful reading

# `df_genetic` now holds the saved genetic table for further analysis

CSV file successfully read.


In [12]:
# Show the genetic table reloaded from 'output_GeneticData.csv'.
df_genetic

Unnamed: 0,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A3GALT2,A4GALT,A4GNT,AAAS,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
X00936b9285d6b8665ae9122993fb8e91,2.91,2.41,4.71,10.78,2.02,0.01,0.01,7.18,0.01,6.22,...,3.18,3.96,3.82,3.83,2.86,0.01,4.12,8.23,5.45,5.26
X105622fadc33f23755ac2df823110aca,9.61,0.71,5.99,11.20,0.45,0.01,0.70,3.93,0.01,5.67,...,2.62,2.59,3.64,4.10,2.86,1.32,4.36,7.66,5.39,5.03
Xe44f39747a8e84b02b4cb24659312144,2.75,2.48,1.46,11.18,2.00,0.01,1.37,4.95,0.01,6.32,...,3.07,2.89,4.18,3.87,3.59,0.01,4.31,9.11,5.78,5.26
X293dd1284496215e9a0eca9f17a98e7e,2.17,0.69,0.01,10.89,1.57,0.01,0.49,5.14,1.62,6.36,...,3.04,3.18,4.40,3.93,3.92,0.43,4.16,8.66,6.04,5.13
X01ed7190ce00862696edbf047b542045,1.44,1.83,2.35,10.11,2.33,0.01,0.27,6.05,0.01,6.49,...,2.43,2.51,4.19,3.89,3.34,0.01,4.45,8.28,5.56,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X50772aa64efb859960b20f8801cd6f58,0.88,0.16,4.16,10.06,1.69,0.01,0.01,6.18,0.01,5.91,...,3.06,2.84,4.19,3.68,3.26,3.30,4.57,8.23,5.51,5.66
X91bcd3067a1a7954692d836515e04869,1.33,1.16,0.01,9.96,1.64,0.01,1.35,5.57,0.01,6.14,...,4.06,4.12,3.38,3.47,3.60,0.01,4.06,8.17,5.81,5.26
Xc7439a06ffa32b313b0ec1b987b992a2,1.74,0.56,1.20,8.96,0.64,0.01,0.01,5.98,0.01,6.16,...,3.01,3.77,4.44,4.30,2.92,0.21,4.34,7.93,5.10,5.02
X21a6043653d187f8bbead475d2f49791,1.80,1.55,3.14,10.25,1.69,0.01,3.23,6.59,0.01,6.18,...,3.82,4.24,4.14,3.52,3.06,0.01,4.39,9.07,5.92,5.93


------------------------

## Preparing clinical data

In [32]:
# Raw clinical table: semicolon-separated, first column used as the row index.
# Note that this rebinds `df`, which previously held the raw genetic table.
# NOTE(review): this is an absolute, machine-specific path; other cells in this
# notebook read their inputs from '../Data/' — consider doing the same here.
file_path = 'C://Users//inigo//Desktop//AAU//Cursos//4th semester//P10//Data//ClinicalData.csv'

try:
    df = pd.read_csv(file_path, sep = ";", index_col = 0)
except FileNotFoundError:
    print("File not found. Please check the file path.")  # Handle file not found error
    # Re-raise: otherwise `df` would still be the genetic table from an earlier cell.
    raise
else:
    print("CSV file successfully read.")  # Indicate successful reading

# `df` now holds the raw clinical table used by the following cells

CSV file successfully read.


In [33]:
# Show the raw clinical table as read from ClinicalData.csv (before any conversion).
df

Unnamed: 0_level_0,PFS_P,PFS_P_CNSR,AGE,SEX,TRT01P,PDL1FL,TCGA_cluster
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
X00936b9285d6b8665ae9122993fb8e91,41724846,0.0,52,F,Avelumab+Axitinib,N,m1
X105622fadc33f23755ac2df823110aca,1659137577,1.0,78,M,Sunitinib,N,m1
Xeb5718fc91df508b1eb43b77df50a828,8706365503,0.0,52,F,Avelumab+Axitinib,,
Xe44f39747a8e84b02b4cb24659312144,1110472279,0.0,61,M,Sunitinib,Y,m2
X293dd1284496215e9a0eca9f17a98e7e,1402874743,1.0,55,M,Sunitinib,Y,m2
...,...,...,...,...,...,...,...
Xc7439a06ffa32b313b0ec1b987b992a2,650513347,1.0,43,M,Avelumab+Axitinib,N,m1
Xec760e642aeb6c4b26c912213d84f20b,6866529774,1.0,65,F,Avelumab+Axitinib,N,
X21a6043653d187f8bbead475d2f49791,5683778234,1.0,79,M,Sunitinib,N,m3
Xa021f5de25a2ffa059870f059a65d075,4106776181,1.0,73,F,Avelumab+Axitinib,Y,m4


In [34]:
# PFS_P is read as text with decimal commas: swap them for decimal points, then
# parse as float.
df['PFS_P'] = df['PFS_P'].replace(',', '.', regex=True).astype(float)

# Cast to the nullable integer dtype so missing entries stay <NA>. The previous
# replace({0.0: 0, 1.0: 1}) was a no-op on a float column (0.0 == 0) and has been
# removed.
df['PFS_P_CNSR'] = df['PFS_P_CNSR'].astype(pd.Int64Dtype())

# Encode the 'N'/'Y' flag as 0/1, again with the nullable integer dtype so that
# missing entries are preserved.
df['PDL1FL'] = df['PDL1FL'].replace({'N': 0, 'Y': 1}).astype(pd.Int64Dtype())

df

Unnamed: 0_level_0,PFS_P,PFS_P_CNSR,AGE,SEX,TRT01P,PDL1FL,TCGA_cluster
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
X00936b9285d6b8665ae9122993fb8e91,4.172485,0,52,F,Avelumab+Axitinib,0,m1
X105622fadc33f23755ac2df823110aca,16.591376,1,78,M,Sunitinib,0,m1
Xeb5718fc91df508b1eb43b77df50a828,8.706366,0,52,F,Avelumab+Axitinib,,
Xe44f39747a8e84b02b4cb24659312144,11.104723,0,61,M,Sunitinib,1,m2
X293dd1284496215e9a0eca9f17a98e7e,14.028747,1,55,M,Sunitinib,1,m2
...,...,...,...,...,...,...,...
Xc7439a06ffa32b313b0ec1b987b992a2,6.505133,1,43,M,Avelumab+Axitinib,0,m1
Xec760e642aeb6c4b26c912213d84f20b,6.866530,1,65,F,Avelumab+Axitinib,0,
X21a6043653d187f8bbead475d2f49791,5.683778,1,79,M,Sunitinib,0,m3
Xa021f5de25a2ffa059870f059a65d075,4.106776,1,73,F,Avelumab+Axitinib,1,m4


In [35]:
# Keep only the three columns carried forward from the clinical table.
df_nocli = df[['PFS_P', 'PFS_P_CNSR', 'TRT01P']]

index_gen = list(df_genetic.index)
index_cli = list(df_nocli.index)
# IDs present in both tables, in genetic-table order. Membership is tested
# against a set so the scan is linear instead of list-in-list quadratic.
cli_lookup = set(index_cli)
inner_join = [x for x in index_gen if x in cli_lookup]

# Rows of the clinical subset whose ID also appears in the genetic table.
df_clinical = df_nocli[df_nocli.index.isin(inner_join)]
# NOTE(review): 'output_ClinicalData.csv' is written again further down, after
# the histology columns are merged in, so this intermediate file is overwritten.
df_clinical.to_csv('output_ClinicalData.csv', index=True)

df_clinical

Unnamed: 0_level_0,PFS_P,PFS_P_CNSR,TRT01P
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
X00936b9285d6b8665ae9122993fb8e91,4.172485,0,Avelumab+Axitinib
X105622fadc33f23755ac2df823110aca,16.591376,1,Sunitinib
Xe44f39747a8e84b02b4cb24659312144,11.104723,0,Sunitinib
X293dd1284496215e9a0eca9f17a98e7e,14.028747,1,Sunitinib
X01ed7190ce00862696edbf047b542045,12.418891,0,Sunitinib
...,...,...,...
X50772aa64efb859960b20f8801cd6f58,4.271047,0,Sunitinib
X91bcd3067a1a7954692d836515e04869,2.496920,0,Sunitinib
Xc7439a06ffa32b313b0ec1b987b992a2,6.505133,1,Avelumab+Axitinib
X21a6043653d187f8bbead475d2f49791,5.683778,1,Sunitinib


#### Adding MATH scores and Histology data

In [36]:
# MATH scores: semicolon-separated file in the sibling '../Data' directory;
# the first column is used as the row index.
file_path = '../Data/MATH_scores.csv'

try:
    df_math = pd.read_csv(file_path, sep = ";", index_col = 0)
except FileNotFoundError:
    print("File not found. Please check the file path.")  # Handle file not found error
    # Re-raise so later cells cannot silently run on a missing or stale `df_math`.
    raise
else:
    print("CSV file successfully read.")  # Indicate successful reading

# `df_math` now holds the MATH scores for further analysis

CSV file successfully read.


In [37]:
# Histology features: semicolon-separated file in the sibling '../Data'
# directory; the first column is used as the row index.
file_path = '../Data/Histology_data.csv'

try:
    df_hist = pd.read_csv(file_path, sep = ";", index_col = 0)
except FileNotFoundError:
    print("File not found. Please check the file path.")  # Handle file not found error
    # Re-raise so later cells cannot silently run on a missing or stale `df_hist`.
    raise
else:
    print("CSV file successfully read.")  # Indicate successful reading

# `df_hist` now holds the histology table for further analysis

CSV file successfully read.


In [38]:
# Decimal commas -> decimal points, then parse as float. Series.replace with
# regex=True (the same form used for PFS_P) leaves an already-numeric column
# untouched, so this cell can be re-run safely; the .str accessor would raise
# once the column is float.
df_math['MATH'] = df_math['MATH'].replace(',', '.', regex=True).astype(float)
df_math

Unnamed: 0_level_0,MATH
ANONID,Unnamed: 1_level_1
X00936b9285d6b8665ae9122993fb8e91,17.928391
X105622fadc33f23755ac2df823110aca,16.122090
Xe44f39747a8e84b02b4cb24659312144,23.616637
X293dd1284496215e9a0eca9f17a98e7e,24.817435
X01ed7190ce00862696edbf047b542045,19.303863
...,...
X91bcd3067a1a7954692d836515e04869,27.837848
Xc7439a06ffa32b313b0ec1b987b992a2,26.606826
X21a6043653d187f8bbead475d2f49791,17.627516
Xa021f5de25a2ffa059870f059a65d075,16.266771


In [42]:
# Preview only: inner join of MATH scores and clinical rows on their indexes.
# The result is displayed but not stored in a variable.
df_math.merge(df_clinical, how = 'inner', left_index = True, right_index = True)


Unnamed: 0,MATH,PFS_P,PFS_P_CNSR,TRT01P
X00936b9285d6b8665ae9122993fb8e91,17.928391,4.172485,0,Avelumab+Axitinib
X105622fadc33f23755ac2df823110aca,16.122090,16.591376,1,Sunitinib
Xe44f39747a8e84b02b4cb24659312144,23.616637,11.104723,0,Sunitinib
X293dd1284496215e9a0eca9f17a98e7e,24.817435,14.028747,1,Sunitinib
X01ed7190ce00862696edbf047b542045,19.303863,12.418891,0,Sunitinib
...,...,...,...,...
X50772aa64efb859960b20f8801cd6f58,15.672304,4.271047,0,Sunitinib
X91bcd3067a1a7954692d836515e04869,27.837848,2.496920,0,Sunitinib
Xc7439a06ffa32b313b0ec1b987b992a2,26.606826,6.505133,1,Avelumab+Axitinib
X21a6043653d187f8bbead475d2f49791,17.627516,5.683778,1,Sunitinib


In [43]:
# MATH scores are not merged in here: an inner merge with df_math makes us lose
# around 30/40 patients.
# Attach the histology features, keeping only IDs present in both tables.
clinical_df = pd.merge(df_clinical, df_hist, left_index=True, right_index=True, how='inner')

# CD8 columns with over 50% missing values. They are dropped outright, so they
# are no longer converted from comma decimals to float first — that work was
# discarded immediately and could raise on a non-string column.
colsToRemove = ['CD8_INVASIVE_MARGIN_SURFACE_AREA', 'CD8_POSITIVE_CELLS_TUMOR_CENTER', 'CD8_POSITIVE_CELLS_INVASIVE_MARGIN', 'CD8_POSITIVE_CELLS_TOTAL_AREA']
clinical_df = clinical_df.drop(columns = colsToRemove)

# Rows with missing values in the remaining columns are kept (no dropna).
# NOTE(review): an earlier comment described keeping CD8_POSITIVE_CELLS_TUMOR_CENTER
# (5% missing) and dropping its incomplete rows; the code drops that column instead.
# Confirm which behaviour is intended.
clinical_df

Unnamed: 0_level_0,PFS_P,PFS_P_CNSR,TRT01P,HE_TUMOR_CELL_CONTENT_IN_TUMOR_AREA,PD-L1_TOTAL_IMMUNE_CELLS_PER_TUMOR_AREA
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
X00936b9285d6b8665ae9122993fb8e91,4.172485,0,Avelumab+Axitinib,70.0,0.0
X105622fadc33f23755ac2df823110aca,16.591376,1,Sunitinib,85.0,1.0
Xe44f39747a8e84b02b4cb24659312144,11.104723,0,Sunitinib,80.0,5.0
X293dd1284496215e9a0eca9f17a98e7e,14.028747,1,Sunitinib,60.0,5.0
X01ed7190ce00862696edbf047b542045,12.418891,0,Sunitinib,80.0,2.0
...,...,...,...,...,...
X50772aa64efb859960b20f8801cd6f58,4.271047,0,Sunitinib,75.0,1.0
X91bcd3067a1a7954692d836515e04869,2.496920,0,Sunitinib,50.0,1.0
Xc7439a06ffa32b313b0ec1b987b992a2,6.505133,1,Avelumab+Axitinib,80.0,1.0
X21a6043653d187f8bbead475d2f49791,5.683778,1,Sunitinib,70.0,0.0


##### Saving

In [44]:
# Write the clinical + histology table to the working directory. This uses the
# same file name as the earlier clinical-only save, so that file is overwritten.
clinical_df.to_csv('output_ClinicalData.csv', index=True)

##### Loading

In [58]:
# Load the clinical table (comma-separated, in the current working directory);
# the first column is used as the row index.
# NOTE(review): within this notebook, 'output_ClinicalDataWithHistology.csv' is
# only written by a later cell, so on a fresh run this reads a file left over
# from a previous session (or fails). The cell just above saves to
# 'output_ClinicalData.csv' instead — confirm which file is meant here.
file_path = 'output_ClinicalDataWithHistology.csv'

try:
    df_clinical = pd.read_csv(file_path, sep = ",", index_col = 0)
except FileNotFoundError:
    print("File not found. Please check the file path.")  # Handle file not found error
    # Re-raise so later cells cannot silently run on a stale `df_clinical`.
    raise
else:
    print("CSV file successfully read.")  # Indicate successful reading

# `df_clinical` now holds the loaded clinical table for further analysis

CSV file successfully read.


In [59]:
# Show the clinical table loaded from 'output_ClinicalDataWithHistology.csv'.
df_clinical

Unnamed: 0,MATH,PFS_P,PFS_P_CNSR,AGE,SEX,TRT01P,PDL1FL,TCGA_cluster,HE_TUMOR_CELL_CONTENT_IN_TUMOR_AREA,CD8_POSITIVE_CELLS_TUMOR_CENTER,PD-L1_TOTAL_IMMUNE_CELLS_PER_TUMOR_AREA,CD8_POSITIVE_CELLS_TOTAL_AREA
X00936b9285d6b8665ae9122993fb8e91,17.928391,4.172485,0,52,F,Avelumab+Axitinib,0,m1,70.0,0.08,0.0,0.1931
X105622fadc33f23755ac2df823110aca,16.122090,16.591376,1,78,M,Sunitinib,0,m1,85.0,0.12,1.0,0.1214
Xe44f39747a8e84b02b4cb24659312144,23.616637,11.104723,0,61,M,Sunitinib,1,m2,80.0,0.92,5.0,0.9203
X293dd1284496215e9a0eca9f17a98e7e,24.817435,14.028747,1,55,M,Sunitinib,1,m2,60.0,3.16,5.0,3.1635
X01ed7190ce00862696edbf047b542045,19.303863,12.418891,0,68,M,Sunitinib,1,m1,80.0,1.98,2.0,2.0708
...,...,...,...,...,...,...,...,...,...,...,...,...
Xc3d410d70dd7359baa40126494fb6765,24.552611,9.790554,1,49,M,Sunitinib,0,m1,75.0,1.01,0.0,1.0089
X50772aa64efb859960b20f8801cd6f58,15.672304,4.271047,0,65,M,Sunitinib,0,m1,75.0,1.10,1.0,1.1775
X91bcd3067a1a7954692d836515e04869,27.837848,2.496920,0,57,M,Sunitinib,1,m3,50.0,4.03,1.0,3.9642
Xc7439a06ffa32b313b0ec1b987b992a2,26.606826,6.505133,1,43,M,Avelumab+Axitinib,0,m1,80.0,0.14,1.0,0.1417


---------------------

In [60]:
index_gen = list(df_genetic.index)
index_cli = list(df_clinical.index)

# IDs present in both tables, in genetic-table order. Membership is tested
# against a set so the scan is linear instead of list-in-list quadratic.
cli_lookup = set(index_cli)
inner_join = [x for x in index_gen if x in cli_lookup]

# Clinical IDs with no genetic counterpart. Genetic-only IDs are not counted here.
shared_lookup = set(inner_join)
left_over = [x for x in index_cli if x not in shared_lookup]

print(len(inner_join), "are in both genetic and clinical table while", len(left_over), "are not")

650 are in both genetic and clinical table while 0 are not


In [61]:
# Restrict both tables to the IDs collected in `inner_join`.
keep_genetic = df_genetic.index.isin(inner_join)
keep_clinical = df_clinical.index.isin(inner_join)
df_genetic = df_genetic.loc[keep_genetic]
df_clinical = df_clinical.loc[keep_clinical]

In [62]:
# Write the two filtered tables to the working directory, keeping the index as
# the first CSV column in each file.
df_genetic.to_csv('output_GeneticDataWithHistology.csv', index=True)
df_clinical.to_csv('output_ClinicalDataWithHistology.csv', index=True)

--------------------

In [45]:
# Reload the clinical table saved by this notebook (comma-separated, in the
# current working directory); the first column is used as the row index.
file_path = 'output_ClinicalData.csv'

try:
    df_clinical = pd.read_csv(file_path, sep = ",", index_col = 0)
except FileNotFoundError:
    print("File not found. Please check the file path.")  # Handle file not found error
    # Re-raise so later cells cannot silently run on a stale `df_clinical`.
    raise
else:
    print("CSV file successfully read.")  # Indicate successful reading

# `df_clinical` now holds the saved clinical table for further analysis

CSV file successfully read.


In [47]:
# Reload the genetic table saved by this notebook (comma-separated, in the
# current working directory); the first column is used as the row index.
file_path = 'output_GeneticData.csv'

try:
    df_genetic = pd.read_csv(file_path, sep = ",", index_col = 0)
except FileNotFoundError:
    print("File not found. Please check the file path.")  # Handle file not found error
    # Re-raise so later cells cannot silently run on a stale `df_genetic`.
    raise
else:
    print("CSV file successfully read.")  # Indicate successful reading

# `df_genetic` now holds the saved genetic table for further analysis

CSV file successfully read.


In [46]:
# Show the clinical table reloaded from 'output_ClinicalData.csv'.
df_clinical

Unnamed: 0_level_0,PFS_P,PFS_P_CNSR,TRT01P,HE_TUMOR_CELL_CONTENT_IN_TUMOR_AREA,PD-L1_TOTAL_IMMUNE_CELLS_PER_TUMOR_AREA
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
X00936b9285d6b8665ae9122993fb8e91,4.172485,0,Avelumab+Axitinib,70.0,0.0
X105622fadc33f23755ac2df823110aca,16.591376,1,Sunitinib,85.0,1.0
Xe44f39747a8e84b02b4cb24659312144,11.104723,0,Sunitinib,80.0,5.0
X293dd1284496215e9a0eca9f17a98e7e,14.028747,1,Sunitinib,60.0,5.0
X01ed7190ce00862696edbf047b542045,12.418891,0,Sunitinib,80.0,2.0
...,...,...,...,...,...
X50772aa64efb859960b20f8801cd6f58,4.271047,0,Sunitinib,75.0,1.0
X91bcd3067a1a7954692d836515e04869,2.496920,0,Sunitinib,50.0,1.0
Xc7439a06ffa32b313b0ec1b987b992a2,6.505133,1,Avelumab+Axitinib,80.0,1.0
X21a6043653d187f8bbead475d2f49791,5.683778,1,Sunitinib,70.0,0.0


In [48]:
# Show the genetic table reloaded from 'output_GeneticData.csv'.
df_genetic

Unnamed: 0,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A3GALT2,A4GALT,A4GNT,AAAS,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
X00936b9285d6b8665ae9122993fb8e91,2.91,2.41,4.71,10.78,2.02,0.01,0.01,7.18,0.01,6.22,...,3.18,3.96,3.82,3.83,2.86,0.01,4.12,8.23,5.45,5.26
X105622fadc33f23755ac2df823110aca,9.61,0.71,5.99,11.20,0.45,0.01,0.70,3.93,0.01,5.67,...,2.62,2.59,3.64,4.10,2.86,1.32,4.36,7.66,5.39,5.03
Xe44f39747a8e84b02b4cb24659312144,2.75,2.48,1.46,11.18,2.00,0.01,1.37,4.95,0.01,6.32,...,3.07,2.89,4.18,3.87,3.59,0.01,4.31,9.11,5.78,5.26
X293dd1284496215e9a0eca9f17a98e7e,2.17,0.69,0.01,10.89,1.57,0.01,0.49,5.14,1.62,6.36,...,3.04,3.18,4.40,3.93,3.92,0.43,4.16,8.66,6.04,5.13
X01ed7190ce00862696edbf047b542045,1.44,1.83,2.35,10.11,2.33,0.01,0.27,6.05,0.01,6.49,...,2.43,2.51,4.19,3.89,3.34,0.01,4.45,8.28,5.56,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X50772aa64efb859960b20f8801cd6f58,0.88,0.16,4.16,10.06,1.69,0.01,0.01,6.18,0.01,5.91,...,3.06,2.84,4.19,3.68,3.26,3.30,4.57,8.23,5.51,5.66
X91bcd3067a1a7954692d836515e04869,1.33,1.16,0.01,9.96,1.64,0.01,1.35,5.57,0.01,6.14,...,4.06,4.12,3.38,3.47,3.60,0.01,4.06,8.17,5.81,5.26
Xc7439a06ffa32b313b0ec1b987b992a2,1.74,0.56,1.20,8.96,0.64,0.01,0.01,5.98,0.01,6.16,...,3.01,3.77,4.44,4.30,2.92,0.21,4.34,7.93,5.10,5.02
X21a6043653d187f8bbead475d2f49791,1.80,1.55,3.14,10.25,1.69,0.01,3.23,6.59,0.01,6.18,...,3.82,4.24,4.14,3.52,3.06,0.01,4.39,9.07,5.92,5.93
