In [1]:
import pandas as pd

In [2]:
DATA_DIRECTORY = "~/Box/Goecks Precision Oncology Analytics/Projects/Pathway Prediction ML/Xy_matrices/"
GENES = ["TP53", "CDKN2A", "PIK3CA"]

# Read X, y matrics for genes.
X_dfs = [pd.read_csv('%s/%s/X_matrix.tsv' % (DATA_DIRECTORY, gene), delimiter="\t", header=0) for gene in GENES]
y_dfs = [pd.read_csv('%s/%s/y_matrix.tsv' % (DATA_DIRECTORY, gene), delimiter="\t", header=0) for gene in GENES]

In [3]:
# Sanity checks.
y_dfs[-1].head()

Unnamed: 0,SAMPLE_BARCODE,PIK3CA_snv,PIK3CA_gain,Class,PATIENT_BARCODE,DISEASE,SUBTYPE,Train_test_stratification
0,TCGA-02-0047-01,1,0,1,TCGA-02-0047,GBM,IDHwt,GBM1
1,TCGA-02-0055-01,0,0,0,TCGA-02-0055,GBM,IDHwt,GBM0
2,TCGA-02-2483-01,0,0,0,TCGA-02-2483,GBM,IDHmut-non-codel,GBM0
3,TCGA-02-2485-01,1,0,1,TCGA-02-2485,GBM,IDHwt,GBM1
4,TCGA-02-2486-01,0,0,0,TCGA-02-2486,GBM,IDHwt,GBM0


In [4]:
# Get columns common to X dataframes. These are the genes shared across the datasets.
shared_cols = X_dfs[0].columns
for X_df in X_dfs[1:]:
    shared_cols = list(set(shared_cols).intersection(X_df.columns))

# Get samples common to X dataframes. These are the smaples shared across the datasets.
shared_samples = y_dfs[0]['SAMPLE_BARCODE']
for y_df in y_dfs[1:]:
    shared_samples = set(shared_samples).intersection(y_df['SAMPLE_BARCODE'])

print( len(shared_cols), len(shared_samples) )

7434 3548


In [5]:
# Create X_df shared intersection/subset by filtering on genes (columns) and samples. Conveniently, this includes the SAMPLE_BARCODE column.
# This subsetting only has to be done once and can be done on any X_df as it's the intersection of all.
X_df_subset = X_dfs[0]
X_df_subset = X_df_subset[X_df_subset['SAMPLE_BARCODE'].isin(shared_samples)]
X_df_subset = X_df_subset[shared_cols]
X_df_subset.shape

(3548, 7434)

In [6]:
# Add in y_df columns to X_df_subset to create a final merged dataframe.

merged_df = X_df_subset
for i, y_df in enumerate(y_dfs):
    merged_df = pd.merge(merged_df, y_df, how="inner", on=['SAMPLE_BARCODE'])
    merged_df = merged_df.rename(columns={'Class': '%s_Class' % GENES[i]})

merged_df.head()

Unnamed: 0,LUC7L,RABL2A,RASAL2,PIP5KL1,ANK2,CD276,STAG3,YTHDC2,PIM1,EPB41L5,...,DISEASE_y,SUBTYPE_y,Train_test_stratification_y,PIK3CA_snv,PIK3CA_gain,PIK3CA_Class,PATIENT_BARCODE,DISEASE,SUBTYPE,Train_test_stratification
0,5.17928,4.184908,2.93699,1.400732,6.225519,7.149271,4.083632,3.156297,3.763293,4.308488,...,GBM,IDHwt,GBM1,1,0,1,TCGA-02-0047,GBM,IDHwt,GBM1
1,4.960511,3.649229,2.741623,0.633487,3.991664,7.256426,2.071908,2.712321,5.840672,2.396991,...,GBM,IDHwt,GBM0,0,0,0,TCGA-02-0055,GBM,IDHwt,GBM0
2,4.865581,3.107788,2.116438,2.384534,3.877625,7.11003,3.283607,2.354879,4.279469,3.289845,...,GBM,IDHmut-non-codel,GBM1,0,0,0,TCGA-02-2483,GBM,IDHmut-non-codel,GBM0
3,6.198651,4.328006,2.296484,3.079808,5.740784,6.92503,3.34723,3.601701,5.515626,4.112717,...,GBM,IDHwt,GBM0,1,0,1,TCGA-02-2485,GBM,IDHwt,GBM1
4,4.261574,4.744347,1.435818,0.578575,6.044545,6.245318,2.853208,2.230125,4.287014,3.116334,...,GBM,IDHwt,GBM1,0,0,0,TCGA-02-2486,GBM,IDHwt,GBM0


In [13]:
merged_df.filter(regex=(".*_Class|SAMPLE_BARCODE"))

Unnamed: 0,SAMPLE_BARCODE,TP53_Class,CDKN2A_Class,PIK3CA_Class
0,TCGA-02-0047-01,0,1,1
1,TCGA-02-0055-01,1,0,0
2,TCGA-02-2483-01,1,1,0
3,TCGA-02-2485-01,1,0,1
4,TCGA-02-2486-01,0,1,0
...,...,...,...,...
3543,TCGA-ZF-AA58-01,1,0,0
3544,TCGA-ZF-AA5H-01,1,0,0
3545,TCGA-ZF-AA5N-01,0,0,0
3546,TCGA-ZF-AA5P-01,1,0,1


In [14]:
# Create final Y combined matrix.
y_combined_df = merged_df.filter(regex=(".*_Class|SAMPLE_BARCODE"))

print( X_df_subset.shape )
print( y_combined_df.shape )

# Write final matrices to files.
X_df_subset.to_csv('X_matrix_subset.tsv', sep="\t", index=False)
y_combined_df.to_csv('y_matrix.tsv', sep='\t', index=False)

(3548, 7434)
(3548, 4)
