In [1]:
import pandas as pd

Load data

In [2]:
# Read the two tab-separated TPM tables (the file names mark them as tumor and
# normal); the first column of each file becomes the row index.
df1 = pd.read_csv(
    'GSM1536837_06_01_15_TCGA_24.tumor_Rsubread_TPM.txt',
    sep='\t',
    index_col=0,
)
df2 = pd.read_csv(
    'GSM1697009_06_01_15_TCGA_24.normal_Rsubread_TPM.txt',
    sep='\t',
    index_col=0,
)

# Put the columns of both tables side by side, aligned on the row index;
# sort=False asks pandas not to sort the row labels while aligning.
df = pd.concat([df1, df2], axis=1, sort=False)

Manage duplicate samples

In [3]:
# get sample names: the full column labels and their 16-character prefixes;
# columns that share a prefix are treated as duplicates of one sample
samples_full = df.columns.tolist()
samples = [x[:16] for x in samples_full]


def _choose_among_duplicates(positions, names):
    """Choose which of several duplicate columns to keep.

    Parameters
    ----------
    positions : list of int
        Column positions (indices into ``names``) that share one prefix.
    names : list of str
        Full column labels, indexed by column position.

    Returns
    -------
    list of int
        The column positions to keep, always taken from ``positions``
        (never positions relative to the group itself).
    """
    # check 1: analytes: prefer H > R > T
    # The analyte letter is read from index 19 of the full label. The slice
    # yields '' for labels shorter than 20 characters instead of raising
    # IndexError, so such labels simply fall through to check 2.
    # NOTE(review): when two or more H columns exist, the search moves on to R
    # and then to check 2 over the whole group, so a non-H column can win.
    # This mirrors the previous decision order -- confirm it is intended.
    for letter in ('H', 'R'):
        matches = [k for k in positions if names[k][19:20] == letter]
        if len(matches) == 1:
            # found match
            return matches

    # check 2: highest lexicographical sort value
    top = max(names[k] for k in positions)
    return [k for k in positions if names[k] == top]


# group the column positions by prefix in a single pass
groups = {}
for i, prefix in enumerate(samples):
    groups.setdefault(prefix, []).append(i)

# decide which samples to keep
keep = []
for i, prefix in enumerate(samples):
    others = groups[prefix]

    # handle every group exactly once, when its first column is reached, so
    # that `keep` follows the column order and never lists a position twice
    if others[0] != i:
        continue

    # if duplicate samples
    if len(others) > 1:
        keep.extend(_choose_among_duplicates(others, samples_full))

    # if no duplicate samples
    else:
        keep.append(i)

In [4]:
# Keep only the columns selected in `keep`, in the order they were selected;
# positions are translated back to full column labels for the lookup.
kept_labels = []
for position in keep:
    kept_labels.append(samples_full[position])
df = df[kept_labels]

# Shorten every remaining column label to its first 16 characters.
df.columns = [label[:16] for label in df.columns.tolist()]

Export

In [11]:
# Name the row index 'Gene Symbol'; to_csv writes the index name as the header
# of the index column.
df.index.name = 'Gene Symbol'
# Write the table as tab-separated text to TCGA.txt in the parent of the
# current working directory (an existing file at that path is overwritten).
df.to_csv('../TCGA.txt', sep='\t')