In [None]:
import os

import numpy as np
import pandas as pd

# 1. Demo

## 1.1 Read Data

In [None]:
# Input tables for the demo section (tab-separated text files).
fn_a = "KIRC/KIRC-Ca.txt"
# Second table, read into df_b by a later cell. The path follows the
# "<name>/<name>-CaS.txt" pattern used by read_ca_file -- TODO confirm.
fn_b = "KIRC/KIRC-CaS.txt"

In [None]:
# Load the first table; its 'Tags' column becomes the row index.
df_a = pd.read_csv(
    fn_a,
    sep='\t',
    index_col='Tags',
)
df_a.head()

In [None]:
# Load the second table; its 'Tags' column becomes the row index.
df_b = pd.read_csv(
    fn_b,
    sep='\t',
    index_col='Tags',
)
df_b.head()

## 1.2 Process Data

In [None]:
# matrix transpose: swap the rows and columns of both tables.
# The names are rebound to the transposed frames, so running this cell a
# second time flips the tables back to their original orientation.
df_a = df_a.transpose()
df_b = df_b.transpose()

In [None]:
# check average values
def summary(df):
    """Print the smallest and the largest per-row mean of ``df``.

    Nothing is returned; the range is only reported on stdout.
    """
    per_row_mean = df.mean(axis=1)
    lowest, highest = per_row_mean.min(), per_row_mean.max()
    print('row averages is between {} and {}'.format(lowest, highest))

# filter rows (according to row average threshold)
def filter_avg(df, thres=1.0):
    """Return the rows of ``df`` whose mean is at least ``thres``.

    The shape of the frame is printed before and after filtering.
    The input frame itself is left unchanged.
    """
    print("before filtering: {}".format(df.shape))
    keep_mask = df.mean(axis=1).ge(thres)  # inclusive comparison: mean >= thres
    good_avg = df.loc[keep_mask]
    print("after filtering: {}".format(good_avg.shape))
    return good_avg

In [None]:
# Report the row-average range of each table, df_a first.
for _frame in (df_a, df_b):
    summary(_frame)

In [None]:
# labeling tumour samples: every row of df_a gets Label = 1
df_a = df_a.assign(Label=1)
df_a.head()

In [None]:
# labeling normal samples: every row of df_b gets Label = 0
df_b = df_b.assign(Label=0)
df_b.head()

In [None]:
# merge: stack the two labelled tables on top of each other (df_a rows first)
df_merged = pd.concat([df_a, df_b], axis=0)
df_merged.shape

# 2. Process all datasets

In [None]:
# Dataset codes; read_ca_file uses each one as both the directory name and
# the file-name prefix.
cas = [
    "BLCA",
    "BRCA",
    "GBM",
    "HNSC",
    "KIRC",
    "LUAD",
    "LUSC",
    "UCEC",
]

In [None]:
def read_ca_file(ca_name):
    """Load the two tab-separated tables stored for ``ca_name``.

    The files are ``<ca_name>/<ca_name>-Ca.txt`` and
    ``<ca_name>/<ca_name>-CaS.txt``, resolved relative to the current
    working directory. Both are indexed by their 'Tags' column.

    Returns:
        tuple: ``(df_ca, df_cas)`` -- the "-Ca" table followed by the
        "-CaS" table.
    """
    ca_fi, cas_fi = (
        pattern.format(ca_name, ca_name)
        for pattern in ("{}/{}-Ca.txt", "{}/{}-CaS.txt")
    )
    # Read in the same order as the return value: "-Ca" first, then "-CaS".
    df_ca, df_cas = (
        pd.read_csv(path, sep='\t', index_col='Tags')
        for path in (ca_fi, cas_fi)
    )
    return df_ca, df_cas

In [None]:
def process(df_a, df_b):
    """Transpose, filter, label and stack two tables.

    Each input is transposed and passed through ``filter_avg`` with its
    default threshold. The result from ``df_a`` gets a constant ``Label``
    column of 1 and the result from ``df_b`` a ``Label`` of 0. The two
    labelled frames are concatenated row-wise (``df_a`` rows first), the
    combined shape is printed, and the combined frame is returned.

    Nothing is written to disk here; saving is left to the caller.
    """
    # transpose both inputs before any filtering output is printed
    flipped_a, flipped_b = df_a.T, df_b.T

    # filter (df_a first, so its before/after messages are printed first)
    kept_a = filter_avg(flipped_a)
    kept_b = filter_avg(flipped_b)

    # label + merge
    labelled = [kept_a.assign(Label=1), kept_b.assign(Label=0)]
    df_merged = pd.concat(labelled)
    print('Total samples: {}'.format(df_merged.shape))
    return df_merged

In [None]:
# Build the labelled table for every entry in `cas` and export it.
out_dir = './OUT'
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output directory exists before the first write.
os.makedirs(out_dir, exist_ok=True)

for ca_name in cas:
    print('Doing: ', ca_name)
    df_ca, df_cas = read_ca_file(ca_name)
    df_m = process(df_ca, df_cas)
    # output: tab-separated values, written without the row index and
    # without a header row.
    # NOTE(review): the index and column names are dropped here -- confirm
    # that whatever reads these files does not need them.
    df_m.to_csv('{}/{}-P.csv'.format(out_dir, ca_name), index=False, header=False, sep='\t')