In [None]:
import pandas as pd
import xlrd
import numpy as np

#imports required modules for data analysis

In [None]:
PFI = pd.read_excel('/Users/Jacob/Downloads/42003_2023_5459_MOESM14_ESM.xlsx', sheet_name='Fig. 2b-f')
#reads the 'Fig. 2b-f' sheet of the workbook; per the original author it contains PFI status and PFI time
#later cells rely on its "patient_id" and "cancer_type" columns
clinical = pd.read_csv("/Users/Jacob/Downloads/clinical.cohort.2024-03-06/clinical.tsv", delimiter= "\t")
#reads the tab-separated clinical table; later cells use its "case_submitter_id" and "days_to_birth" columns
df = pd.read_csv('/Users/Jacob/Downloads/EB++AdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.xena', delimiter="\t")
#reads the tab-separated expression matrix (described by the original author as log normalized values)
#NOTE(review): the file name says RNASeqV2 geneExp, while this notebook calls the data "epigenetic" - confirm the wording
#NOTE(review): all three paths are absolute and specific to one machine; they must be edited before running elsewhere


In [None]:
def sample_name_adjust(df):
    """Return a copy of df with column names cut to 12 characters and rows indexed by "sample".

    Input: dataframe whose column labels are strings (sample names) and which
    contains a column named "sample".
    Output: a new dataframe in which
      * every column label is the first 12 characters of the original label.
        Per the original author, this strips the trailing tag from each sample
        name so the labels match the patient IDs used in the metadata file and
        can be merged with it later.
      * when several original labels collapse to the same truncated label, only
        the first (left-most) of those columns is kept.
      * the index is the "sample" column (per the original author, the names of
        the genes examined). The column itself is also kept.

    The input dataframe is left untouched; all changes are made on a copy.

    Raises KeyError if no column is named "sample" after truncation.
    """
    # Compute the truncated labels without assigning them to df, so the
    # caller's frame is not modified as a side effect.
    short_names = pd.Index([name[:12] for name in df.columns])

    # True for the first column carrying each truncated label, False for repeats.
    first_occurrence = ~short_names.duplicated()

    adjusted = df.loc[:, first_occurrence].copy()
    adjusted.columns = short_names[first_occurrence]

    # "sample" is shorter than 12 characters, so truncation leaves it unchanged.
    adjusted.index = adjusted["sample"]
    return adjusted

df = sample_name_adjust(df)
#rebinds df to the adjusted frame: 12-character column names, repeated names dropped, rows indexed by the "sample" column

In [None]:
clinical.index = clinical["case_submitter_id"]
clinical = clinical.drop_duplicates(subset= "case_submitter_id")
#indexes the clinical dataframe by patient ID, then keeps only the first row for each patient ID

age = clinical["days_to_birth"]
#extracts the days_to_birth column as a Series indexed by patient ID; this is the age measure used below

PFI.index = PFI["patient_id"]
#indexes the PFI dataframe by patient ID so that it lines up with age when the two are concatenated

In [None]:
def prep_FINAL(PFI, age):
    """Combine PFI data with age and express age as non-negative days from birth.

    Input:
      PFI - dataframe of PFIs, indexed by patient ID, with a "cancer_type" column.
      age - Series named "days_to_birth", indexed by patient ID. Values may be
            numbers or numeric strings (typically negative), with the
            placeholder string "'--" marking a missing value.
    Output: a new dataframe holding the PFI columns plus a final integer column
      "days_from_birth" (the absolute value of days_to_birth). The
      "days_to_birth" column itself is removed. Rows are dropped when they have
      no cancer_type (including patients present only in age) or no valid
      days_to_birth.

    Raises ValueError if a remaining days_to_birth value is not numeric.
    """
    # Align the two inputs on their index; patients missing from either side get NaN.
    merged = pd.concat([PFI, age], axis=1)

    # Drop samples without a cancer type. List form of subset works on older pandas too.
    merged = merged.dropna(subset=["cancer_type"])

    # Replace the non-standard missing-value placeholder with NaN in every column.
    merged = merged.replace({"'--": np.nan})

    # Drop samples without a valid days_to_birth value.
    merged = merged.dropna(subset=["days_to_birth"])

    # Convert numerically and take the absolute value. Slicing off the first
    # character would fail on numeric input and would corrupt values that have
    # no leading minus sign.
    days_from_birth = (
        pd.to_numeric(merged["days_to_birth"])
        .abs()
        .astype("int64")
        .rename("days_from_birth")
    )

    # Swap days_to_birth for days_from_birth, keeping the new column last.
    FINAL = merged.drop("days_to_birth", axis=1)
    FINAL = pd.concat([FINAL, days_from_birth], axis=1)

    return FINAL

FINAL = prep_FINAL(PFI, age)
#builds FINAL: PFI rows that have a cancer_type and a valid days_to_birth, with age stored as non-negative days_from_birth

In [None]:

def division(list, data):
    """
    Label every sample as "Older" or "Younger" relative to age-percentile cutoffs.

    Input: list = list of cutoffs (percentiles in decimal form, e.g. 0.25) relative to
    the age distribution, used to divide each sample into the older/younger category.
    data = dataframe containing the samples to divide; must have a "days_from_birth" column.
    Returns: a new dataframe with the original columns plus one label column per cutoff
    supplied in list. A sample is "Older" when its days_from_birth is strictly greater
    than the cutoff and "Younger" otherwise. The input dataframe is not modified; if
    list is empty, data is returned unchanged.

    Column names are "Age" followed by str(percentile) with its first two characters
    removed, so 0.25 gives "Age25" and 0.75 gives "Age75", but 0.50 gives "Age5"
    because str(0.50) is "0.5". This naming is kept so existing outputs do not change.

    The cutoffs are computed from data itself, not from any global dataframe.
    The first parameter shadows the built-in list; the name is kept for compatibility.
    """
    ages = data["days_from_birth"]
    label_columns = []

    for percentile in list:
        # Percentile cutoff expressed in days.
        cutoff = ages.quantile(percentile)

        # Vectorised comparison: True becomes "Older", False becomes "Younger".
        labels = ages.gt(cutoff).map({True: "Older", False: "Younger"})

        # Column title is "Age" + the digits after "0." in the percentile.
        label_columns.append(labels.rename("Age" + str(percentile)[2:]))

    if not label_columns:
        return data

    # Append all label columns at once instead of concatenating inside the loop.
    return pd.concat([data, *label_columns], axis=1)


cutoffs = [0.25, 0.50, 0.75]
#age percentiles (as fractions) used to split samples into "Older"/"Younger"
FINAL = division(cutoffs, FINAL)
#runs division on "FINAL" data, adding one label column per cutoff: "Age25", "Age5" (str(0.50) is "0.5") and "Age75"
#NOTE: this cell rebinds FINAL, so re-running it without first re-running the prep_FINAL cell adds the label columns a second time

In [None]:
#displays the labelled dataframe as the cell output
FINAL

In [None]:
FINAL.to_csv("/Users/Jacob/Desktop/OldYoung.csv")
#writes FINAL, including its index, to a CSV file; the path is absolute and specific to one machine