In [1]:
import pandas as pd
import os

# Directory containing the data files
directory = '/Users/ethan/Downloads/tcga_data'

# Initialize dictionaries to store DataFrames for clinical and mutation files
clinical_data = {}
mutation_data = {}

# Loop through each file in the directory
for filename in os.listdir(directory):
    cancer_type = filename.split('_')[0]
    file_path = os.path.join(directory, filename)

    try:
        if filename.endswith('_clinical_patient.txt'):
            # Read clinical patient file, skipping comment lines
            clinical_data[cancer_type] = pd.read_csv(file_path, sep='\t', comment='#', header=0, engine='python')
        elif filename.endswith('_mutations.txt'):
            if cancer_type == 'chol':
                # Skip the first two metadata lines in the chol_mutations.txt file
                mutation_data[cancer_type] = pd.read_csv(file_path, sep='\t', skiprows=2, engine='python')
            else:
                # Read other mutations files normally
                mutation_data[cancer_type] = pd.read_csv(file_path, sep='\t', header=0, engine='python')
    except pd.errors.ParserError as e:
        print(f"Error parsing {filename}: {e}")

# Example: Access the DataFrame for 'chol' cancer type
chol_clinical_df = clinical_data.get('chol')
chol_mutation_df = mutation_data.get('chol')

# Display the first few rows to verify
print(chol_clinical_df.head())
print(chol_mutation_df.head())


     PATIENT_ID                      OTHER_PATIENT_ID  \
0  TCGA-3X-AAV9  41b97b11-acaa-4fbc-b3b0-0abc1bcac13b   
1  TCGA-3X-AAVA  fe57b639-db7a-460f-adfe-552f1e034e46   
2  TCGA-3X-AAVB  3824cd6d-c85c-4b21-819a-932d1afef976   
3  TCGA-3X-AAVC  ff4131c9-537c-46cc-b495-9af60d431f5e   
4  TCGA-3X-AAVE  dcc87c59-fd8c-4511-9983-96d5b9784cb0   

                PRIMARY_SITE_PATIENT                  DISEASE_TYPE  \
0  Liver and intrahepatic bile ducts  Adenomas and Adenocarcinomas   
1  Liver and intrahepatic bile ducts  Adenomas and Adenocarcinomas   
2  Liver and intrahepatic bile ducts  Adenomas and Adenocarcinomas   
3                        Gallbladder  Adenomas and Adenocarcinomas   
4  Liver and intrahepatic bile ducts  Adenomas and Adenocarcinomas   

         PROJECT_NAME PROJECT_ID     SEX   RACE               ETHNICITY  \
0  Cholangiocarcinoma  TCGA-CHOL    Male  ASIAN  NOT HISPANIC OR LATINO   
1  Cholangiocarcinoma  TCGA-CHOL  Female  WHITE  NOT HISPANIC OR LATINO   
2  Cholangi

In [86]:
clinical_data['chol'].columns

Index(['PATIENT_ID', 'OTHER_PATIENT_ID', 'PRIMARY_SITE_PATIENT',
       'DISEASE_TYPE', 'PROJECT_NAME', 'PROJECT_ID', 'SEX', 'RACE',
       'ETHNICITY', 'VITAL_STATUS', 'YEAR_OF_DEATH', 'PRIMARY_DIAGNOSIS',
       'YEAR_OF_DIAGNOSIS', 'PATH_M_STAGE', 'BIOPSY_SITE',
       'AJCC_STAGING_EDITION', 'ICD_10', 'AGE', 'PATH_STAGE', 'MORPHOLOGY',
       'PATH_T_STAGE', 'PRIOR_TREATMENT', 'PATH_N_STAGE', 'PRIOR_MALIGNANCY',
       'PROJECT_STATE', 'OS_STATUS', 'OS_MONTHS', 'DFS_STATUS', 'DFS_MONTHS'],
      dtype='object')

In [87]:
mutation_data.get('lihc')['t_ref_count']

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
         ..
53772   NaN
53773   NaN
53774   NaN
53775   NaN
53776   NaN
Name: t_ref_count, Length: 53777, dtype: float64

In [88]:
mutation_data['chol']['Tumor_Sample_Barcode']

0       TCGA-3X-AAV9-01A
1       TCGA-3X-AAV9-01A
2       TCGA-3X-AAV9-01A
3       TCGA-3X-AAV9-01A
4       TCGA-3X-AAV9-01A
              ...       
3759    TCGA-ZU-A8S4-01A
3760    TCGA-ZU-A8S4-01A
3761    TCGA-ZU-A8S4-01A
3762    TCGA-ZU-A8S4-01A
3763    TCGA-ZU-A8S4-01A
Name: Tumor_Sample_Barcode, Length: 3764, dtype: object

In [89]:
clinical_data['chol']['PATIENT_ID']

0     TCGA-3X-AAV9
1     TCGA-3X-AAVA
2     TCGA-3X-AAVB
3     TCGA-3X-AAVC
4     TCGA-3X-AAVE
5     TCGA-4G-AAZF
6     TCGA-4G-AAZG
7     TCGA-4G-AAZO
8     TCGA-4G-AAZR
9     TCGA-4G-AAZT
10    TCGA-5A-A8ZF
11    TCGA-5A-A8ZG
12    TCGA-W5-AA2G
13    TCGA-W5-AA2H
14    TCGA-W5-AA2I
15    TCGA-W5-AA2J
16    TCGA-W5-AA2K
17    TCGA-W5-AA2M
18    TCGA-W5-AA2O
19    TCGA-W5-AA2Q
20    TCGA-W5-AA2R
21    TCGA-W5-AA2T
22    TCGA-W5-AA2U
23    TCGA-W5-AA2W
24    TCGA-W5-AA2X
25    TCGA-W5-AA2Z
26    TCGA-W5-AA30
27    TCGA-W5-AA31
28    TCGA-W5-AA33
29    TCGA-W5-AA34
30    TCGA-W5-AA36
31    TCGA-W5-AA38
32    TCGA-W5-AA39
33    TCGA-W6-AA0S
34    TCGA-W6-AA0T
35    TCGA-W7-A93N
36    TCGA-W7-A93O
37    TCGA-W7-A93P
38    TCGA-WD-A7RX
39    TCGA-YR-A95A
40    TCGA-ZD-A8I3
41    TCGA-ZH-A8Y1
42    TCGA-ZH-A8Y2
43    TCGA-ZH-A8Y3
44    TCGA-ZH-A8Y4
45    TCGA-ZH-A8Y5
46    TCGA-ZH-A8Y6
47    TCGA-ZH-A8Y7
48    TCGA-ZH-A8Y8
49    TCGA-ZK-AAYZ
50    TCGA-ZU-A8S4
Name: PATIENT_ID, dtype: object

In [90]:
mutation_data['chol'][['Hugo_Symbol', 'Entrez_Gene_Id', 'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2', 'Reference_Allele', 'Variant_Type', 'Start_Position', 'End_Position', 'Chromosome', 'Variant_Classification', 'Match_Norm_Seq_Allele1',
'Match_Norm_Seq_Allele2']]

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Tumor_Seq_Allele1,Tumor_Seq_Allele2,Reference_Allele,Variant_Type,Start_Position,End_Position,Chromosome,Variant_Classification,Match_Norm_Seq_Allele1,Match_Norm_Seq_Allele2
0,DOCK7,85440,C,T,C,SNP,62552814,62552814,1,Missense_Mutation,,
1,LRRC71,149499,C,T,C,SNP,156932006,156932006,1,Missense_Mutation,,
2,PTPN14,5784,G,A,G,SNP,214383971,214383971,1,Silent,,
3,OR2T12,127064,G,A,G,SNP,248295007,248295007,1,Missense_Mutation,,
4,PARGP1,728407,T,G,T,SNP,45892481,45892481,10,RNA,,
...,...,...,...,...,...,...,...,...,...,...,...,...
3759,PI15,51050,C,A,C,SNP,74825486,74825486,8,Silent,,
3760,KIAA2026,158358,G,A,G,SNP,6007401,6007401,9,Silent,,
3761,RABGAP1,23637,G,C,G,SNP,122990131,122990131,9,Missense_Mutation,,
3762,RABGAP1,23637,C,T,C,SNP,122990132,122990132,9,Missense_Mutation,,


In [2]:
import pandas as pd
import os

# Directory containing the data files
directory = '/Users/ethan/Downloads/tcga_data'

# Reference index mapping cancer acronyms to project names
disease_labels = {
    "sarc": "Sarcoma",
    "kirp": "Kidney Renal Papillary Cell Carcinoma",
    "esca": "Esophageal Carcinoma",
    "hnsc": "Head and Neck Squamous Cell Carcinoma",
    "chol": "Cholangiocarcinoma",
    "prad": "Prostate Adenocarcinoma",
    "ucec": "Uterine Corpus Endometrial Carcinoma",
    "kich": "Kidney Chromophobe",
    "laml": "Acute Myeloid Leukemia",
    "ov": "Ovarian Serous Cystadenocarcinoma",
    "uvm": "Uveal Melanoma",
    "lgg": "Brain Lower Grade Glioma",
    "brca": "Breast Invasive Carcinoma",
    "lihc": "Liver Hepatocellular Carcinoma",
    "luad": "Lung Adenocarcinoma"
}

# Initialize dictionaries to store DataFrames for clinical and mutation files
clinical_data = {}
mutation_data = {}

# Loop through each file in the directory
for filename in os.listdir(directory):
    cancer_type = filename.split('_')[0]
    file_path = os.path.join(directory, filename)

    try:
        if filename.endswith('_clinical_patient.txt'):
            # Read clinical patient file, skipping comment lines
            df = pd.read_csv(file_path, sep='\t', comment='#', header=0, engine='python')
            # Add the Disease_Label column based on the mapping
            df['Disease_Label'] = disease_labels.get(cancer_type, "Unknown")
            clinical_data[cancer_type] = df
        elif filename.endswith('_mutations.txt'):
            if cancer_type == 'chol':
                mutation_data[cancer_type] = pd.read_csv(file_path, sep='\t', skiprows=2, engine='python')
            else:
                mutation_data[cancer_type] = pd.read_csv(file_path, sep='\t', header=0, engine='python')
    except pd.errors.ParserError as e:
        print(f"Error parsing {filename}: {e}")

# Combine all clinical DataFrames into one and save as CSV
all_clinical_df = pd.concat(clinical_data.values(), ignore_index=True)
all_clinical_df.to_csv('/Users/ethan/Downloads/all_clinical_data_with_labels.csv', index=False)

# Combine all mutation DataFrames into one and save as CSV
all_mutation_df = pd.concat(mutation_data.values(), ignore_index=True)
all_mutation_df.to_csv('/Users/ethan/Downloads/all_mutation_data.csv', index=False)

print("All clinical data with disease labels and mutation data saved as CSV files.")



All clinical data with disease labels and mutation data saved as CSV files.


In [92]:
all_mutation_df

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Consequence,Variant_Classification,...,dbSNP,n_ref_count_val,tumor_vaf_val,n_alt_count_val,normal_vaf_val,dbSNP_Strength,Confidence,NVarRat,TVarRat,dbSNP_PopFreq
0,ABCA10,10349.0,genome.wustl.edu,GRCh37,17,67218779,67218780,+,frameshift_variant,Frame_Shift_Ins,...,,,,,,,,,,
1,ABCC8,6833.0,genome.wustl.edu,GRCh37,11,17418754,17418754,+,missense_variant,Missense_Mutation,...,,,,,,,,,,
2,ACACA,31.0,genome.wustl.edu,GRCh37,17,35518913,35518913,+,missense_variant,Missense_Mutation,...,,,,,,,,,,
3,ADAR,103.0,genome.wustl.edu,GRCh37,1,154562755,154562755,+,missense_variant,Missense_Mutation,...,,,,,,,,,,
4,ADAR,103.0,genome.wustl.edu,GRCh37,1,154569329,154569329,+,missense_variant,Missense_Mutation,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
706834,TMEM170B,100113407.0,hgsc.bcm.edu,GRCh37,6,11575790,11575797,+,"coding_sequence_variant,3_prime_UTR_variant",Frame_Shift_Del,...,,,,,,none,PASS,0,0.138889,.
706835,CSMD2,114784.0,hgsc.bcm.edu,GRCh37,1,34008360,34008360,+,"synonymous_variant,NMD_transcript_variant",Silent,...,,,,,,alt,PASS,.,.,.
706836,CSMD2,114784.0,hgsc.bcm.edu,GRCh37,1,33985175,33985175,+,"synonymous_variant,NMD_transcript_variant",Silent,...,,,,,,none,PASS,.,.,.
706837,CTDNEP1,23399.0,hgsc.bcm.edu,GRCh37,17,7147902,7147902,+,"synonymous_variant,NMD_transcript_variant",Silent,...,,,,,,none,PASS,.,.,.


In [93]:
all_mutation_df['Tumor_Sample_Barcode']

0         TCGA-JY-A6FG-01
1         TCGA-JY-A6FG-01
2         TCGA-JY-A6FG-01
3         TCGA-JY-A6FG-01
4         TCGA-JY-A6FG-01
               ...       
706834    TCGA-5P-A9KC-01
706835    TCGA-WN-AB4C-01
706836    TCGA-A4-8630-01
706837    TCGA-PJ-A5Z8-01
706838    TCGA-BQ-5881-01
Name: Tumor_Sample_Barcode, Length: 706839, dtype: object

In [94]:
all_clinical_df

Unnamed: 0,OTHER_PATIENT_ID,PATIENT_ID,FORM_COMPLETION_DATE,PROSPECTIVE_COLLECTION,RETROSPECTIVE_COLLECTION,SEX,MENOPAUSE_STATUS,RACE,ETHNICITY,HISTORY_OTHER_MALIGNANCY,...,PRIMARY_DIAGNOSIS,YEAR_OF_DIAGNOSIS,BIOPSY_SITE,PATH_STAGE,MORPHOLOGY,PRIOR_TREATMENT,PRIOR_MALIGNANCY,PROJECT_STATE,SARCOMATOID_FEATURES,SARCOMATOID_PERCENT_OF_TUMOR
0,6E7D5EC6-A469-467C-B748-237353C23416,TCGA-3C-AAAU,1/13/14,NO,YES,Female,Pre (<6 months since LMP AND no prior bilatera...,WHITE,NOT HISPANIC OR LATINO,No,...,,,,,,,,,,
1,55262FCB-1B01-4480-B322-36570430C917,TCGA-3C-AALI,7/28/14,NO,YES,Female,Post (prior bilateral ovariectomy OR >12 mo si...,BLACK OR AFRICAN AMERICAN,NOT HISPANIC OR LATINO,No,...,,,,,,,,,,
2,427D0648-3F77-4FFC-B52C-89855426D647,TCGA-3C-AALJ,7/28/14,NO,YES,Female,Post (prior bilateral ovariectomy OR >12 mo si...,BLACK OR AFRICAN AMERICAN,NOT HISPANIC OR LATINO,No,...,,,,,,,,,,
3,C31900A4-5DCD-4022-97AC-638E86E889E4,TCGA-3C-AALK,7/28/14,NO,YES,Female,[Not Available],BLACK OR AFRICAN AMERICAN,NOT HISPANIC OR LATINO,No,...,,,,,,,,,,
4,6623FC5E-00BE-4476-967A-CBD55F676EA6,TCGA-4H-AAAK,11/13/14,YES,NO,Female,Post (prior bilateral ovariectomy OR >12 mo si...,WHITE,NOT HISPANIC OR LATINO,No,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5850,DFBE9B61-1F13-4BD1-8F87-364B186C69B5,TCGA-UW-A7GP,12/17/13,NO,YES,Male,,WHITE,[Not Available],No,...,,,,,,,,,[Not Available],[Not Available]
5851,A86C2490-37DA-46A3-88E2-0B0CE21D067F,TCGA-UW-A7GR,12/17/13,NO,YES,Female,,WHITE,[Not Available],"Yes, History of Prior Malignancy",...,,,,,,,,,[Not Available],[Not Available]
5852,F300EFC6-A125-468A-BB2F-FBAAC71FCCA0,TCGA-UW-A7GU,12/17/13,NO,YES,Male,,BLACK OR AFRICAN AMERICAN,[Not Available],No,...,,,,,,,,,[Not Available],[Not Available]
5853,D0A8CFDC-C025-4DE9-94AD-F5160809FF8F,TCGA-UW-A7GX,12/17/13,YES,NO,Male,,BLACK OR AFRICAN AMERICAN,NOT HISPANIC OR LATINO,No,...,,,,,,,,,NO,[Not Available]


In [3]:
import pandas as pd

# Extract patient identifier (first 12 characters) from Tumor_Sample_Barcode in mutation data
all_mutation_df['PATIENT_ID'] = all_mutation_df['Tumor_Sample_Barcode'].str[:12]

# Specify the columns to keep from mutation data
mutation_columns = [
    "Hugo_Symbol", "Entrez_Gene_Id", "Chromosome", "Start_Position",
    "End_Position", "Strand", "Consequence", "Variant_Classification",
    "Variant_Type", "Reference_Allele", "Tumor_Seq_Allele1",
    "Tumor_Seq_Allele2", "Mutation_Status", "t_ref_count", "t_alt_count",
    "n_ref_count", "n_alt_count", "PATIENT_ID"
]

# Specify the columns to keep from clinical data
clinical_columns = ["PATIENT_ID", "RACE", "SEX", 'Disease_Label']

# Filter the datasets to keep only the specified columns
all_mutation_df = all_mutation_df[mutation_columns]
all_clinical_df = all_clinical_df[clinical_columns]

# Perform the merge (one-to-many join) on the PATIENT_ID column
merged_df = pd.merge(all_mutation_df, all_clinical_df, on="PATIENT_ID", how="inner")

# Save the resulting merged dataset to a CSV file
merged_df.to_csv('/Users/ethan/Downloads/merged_clinical_mutation_data.csv', index=False)

print("Merged dataset saved as 'merged_clinical_mutation_data.csv'.")


  all_mutation_df['PATIENT_ID'] = all_mutation_df['Tumor_Sample_Barcode'].str[:12]


Merged dataset saved as 'merged_clinical_mutation_data.csv'.


In [96]:
na_counts = merged_df.isnull().sum()

# Display the results
for column, count in na_counts.items():
    print(f"Column '{column}': {count} NA/null values")

Column 'Hugo_Symbol': 0 NA/null values
Column 'Entrez_Gene_Id': 15970 NA/null values
Column 'Chromosome': 0 NA/null values
Column 'Start_Position': 0 NA/null values
Column 'End_Position': 0 NA/null values
Column 'Strand': 42 NA/null values
Column 'Consequence': 118 NA/null values
Column 'Variant_Classification': 0 NA/null values
Column 'Variant_Type': 0 NA/null values
Column 'Reference_Allele': 0 NA/null values
Column 'Tumor_Seq_Allele1': 0 NA/null values
Column 'Tumor_Seq_Allele2': 0 NA/null values
Column 'Mutation_Status': 0 NA/null values
Column 't_ref_count': 103680 NA/null values
Column 't_alt_count': 103680 NA/null values
Column 'n_ref_count': 441493 NA/null values
Column 'n_alt_count': 441493 NA/null values
Column 'PATIENT_ID': 0 NA/null values
Column 'RACE': 218 NA/null values
Column 'SEX': 175 NA/null values
Column 'Disease_Label': 0 NA/null values


In [4]:
# Replace NA values with empty strings in specific columns
columns_to_fill = ['t_ref_count', 't_alt_count', 'n_ref_count', 'n_alt_count']
merged_df[columns_to_fill] = merged_df[columns_to_fill].fillna('')

# Drop rows with NA values in the remaining columns
cleaned_df = merged_df.dropna()

# Save the cleaned dataset to a CSV file
cleaned_df.to_csv('/Users/ethan/Downloads/cleaned_BERT_gene_data.csv', index=False)

print("Cleaned dataset saved as 'cleaned_BERT_gene_data.csv'.")


Cleaned dataset saved as 'cleaned_BERT_gene_data.csv'.


In [5]:
cleaned_df['Disease_Label'].value_counts()

Uterine Corpus Endometrial Carcinoma     182949
Head and Neck Squamous Cell Carcinoma    119607
Breast Invasive Carcinoma                 86641
Lung Adenocarcinoma                       67939
Esophageal Carcinoma                      56961
Liver Hepatocellular Carcinoma            53722
Prostate Adenocarcinoma                   39740
Ovarian Serous Cystadenocarcinoma         20057
Kidney Renal Papillary Cell Carcinoma     18863
Sarcoma                                   18533
Brain Lower Grade Glioma                   9876
Kidney Chromophobe                         7543
Cholangiocarcinoma                         3546
Acute Myeloid Leukemia                     2475
Uveal Melanoma                             2156
Name: Disease_Label, dtype: int64

In [102]:
cleaned_df.columns

Index(['Hugo_Symbol', 'Entrez_Gene_Id', 'Chromosome', 'Start_Position',
       'End_Position', 'Strand', 'Consequence', 'Variant_Classification',
       'Variant_Type', 'Reference_Allele', 'Tumor_Seq_Allele1',
       'Tumor_Seq_Allele2', 'Mutation_Status', 't_ref_count', 't_alt_count',
       'n_ref_count', 'n_alt_count', 'PATIENT_ID', 'RACE', 'SEX',
       'Disease_Label'],
      dtype='object')

In [6]:
# Define the columns to concatenate
columns_to_concatenate = [
    'Hugo_Symbol', 'Entrez_Gene_Id', 'Chromosome', 'Start_Position',
    'End_Position', 'Strand', 'Consequence', 'Variant_Classification',
    'Variant_Type', 'Reference_Allele', 'Tumor_Seq_Allele1',
    'Tumor_Seq_Allele2', 'Mutation_Status', 't_ref_count', 't_alt_count',
    'n_ref_count', 'n_alt_count', 'RACE', 'SEX'
]

# Concatenate specified columns into one field for each row
cleaned_df['Combined'] = cleaned_df[columns_to_concatenate].astype(str).agg(' '.join, axis=1)

# Group by PATIENT_ID and aggregate rows into one by concatenating 'Combined' values
patient_level_df = (
    cleaned_df.groupby(['PATIENT_ID'], as_index=False)
    .agg({'Combined': ' '.join, 'Disease_Label': 'first'})  # Use 'first' to keep the same Disease_Label for the patient
)

# Set PATIENT_ID as the index
patient_level_df = patient_level_df.set_index('PATIENT_ID')

# Save the resulting dataset
patient_level_df.to_csv('/Users/ethan/Downloads/patient_level_data.csv')

print("Processed patient-level dataset saved as 'patient_level_data.csv'.")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['Combined'] = cleaned_df[columns_to_concatenate].astype(str).agg(' '.join, axis=1)


Processed patient-level dataset saved as 'patient_level_data.csv'.


In [7]:
patient_level_df

Unnamed: 0_level_0,Combined,Disease_Label
PATIENT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
TCGA-04-1331,BAI2 576.0 1 32202221 32202221 + missense_vari...,Ovarian Serous Cystadenocarcinoma
TCGA-04-1332,SERPINF1 5176.0 17 1674326 1674326 + missense_...,Ovarian Serous Cystadenocarcinoma
TCGA-04-1336,CPXM1 56265.0 20 2778850 2778850 + missense_va...,Ovarian Serous Cystadenocarcinoma
TCGA-04-1337,ABCA3 21.0 16 2336768 2336768 + missense_varia...,Ovarian Serous Cystadenocarcinoma
TCGA-04-1338,A2M 2.0 12 9230420 9230420 + missense_variant ...,Ovarian Serous Cystadenocarcinoma
...,...,...
TCGA-ZS-A9CD,ARID1A 8289.0 1 27106649 27106649 + missense_v...,Liver Hepatocellular Carcinoma
TCGA-ZS-A9CE,TCEA3 6920.0 1 23720511 23720511 + missense_va...,Liver Hepatocellular Carcinoma
TCGA-ZS-A9CF,PTCHD2 57540.0 1 11596435 11596435 + missense_...,Liver Hepatocellular Carcinoma
TCGA-ZS-A9CG,HSPG2 3339.0 1 22178363 22178363 + synonymous_...,Liver Hepatocellular Carcinoma


In [111]:
# Calculate the length of each value in the 'Combined' column
combined_lengths = patient_level_df['Combined'].str.len()

# Find the maximum length
max_length = combined_lengths.mean()

print(f"The maximum length of the values in the 'Combined' column is: {max_length}")


The maximum length of the values in the 'Combined' column is: 17888.33025099075
