# Tumor vs Normal Expression

For cancers with a significant number of control samples, the difference in miRNA expression between tumor and control samples can be determined.

## Formatting for DESeq2

First, we can look at the number of control samples in each project and keep only the projects with more than 10 control samples. Some control samples have no corresponding tumor sample, so those are excluded from our analysis.

In [None]:
import os
import pandas as pd

def deseq2_format(summary_file, output_dir, min_samp=10,
                  mir_dir="../TCGA Data Processing/TCGA Summary/miRNA-seq"):
    """
    Write DESeq2 input files (read counts and column data) for every project
    that has enough paired tumor/control miRNA-seq samples.

        summary_file: TCGA project summary CSV (made in 1.5); projects are the
            index and the "miRNA Normal Samples" column holds the control count
        output_dir: directory to save files to (created if it does not exist)
        min_samp (default=10): threshold on paired samples; a project is kept
            only when it has MORE than min_samp control samples in the summary
            and MORE than min_samp controls with a matching tumor sample
        mir_dir: directory holding the "<project> miRNA-seq Raw Reads.csv"
            files; defaults to the location used by the rest of the notebook

    For each kept project two files are written to output_dir:
        "<project> tumor and control miRNA-seq Read Counts.csv"
            control columns followed by tumor columns, in matching patient order
        "<project> tumor and control miRNA-seq Column Data.csv"
            one row per sample with its "Patient" and "Type" (Control/Tumor)
    """
    skip_msg = "{} or less paired samples for {}. Skipping..."

    # Read the summary the caller asked for instead of a fixed path.
    sum_df = pd.read_csv(summary_file, header=0, index_col=0)
    df = sum_df[sum_df["miRNA Normal Samples"] > min_samp]
    project_list = df.index.tolist()

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    for project in project_list:
        print("Processing {}".format(project))
        mir_file = os.path.join(mir_dir, "{} miRNA-seq Raw Reads.csv".format(project))
        mir = pd.read_csv(mir_file, header=0, index_col=0)

        # find paired tumor and normal samples
        # (a real list, so indexing below works on both Python 2 and 3)
        control_samps = [c for c in mir.columns.tolist() if "_11" in c or "_10" in c]
        if not control_samps:
            # No control columns at all: nothing to pair.
            print(skip_msg.format(min_samp, project))
            continue

        # The first control sample decides the naming scheme for the project:
        # "_11" controls pair with "_01" (solid tumors),
        # otherwise "_10" controls pair with "_03" (blood cancers).
        solid = "_11" in control_samps[0]
        tumor_suffix = "_01" if solid else "_03"
        control_suffix = "_11" if solid else "_10"

        patients = [c.split("_")[0] for c in control_samps]
        tumor_samps = ["{}{}".format(p, tumor_suffix) for p in patients]
        tumor_samps_exist = [t for t in tumor_samps if t in mir.columns]

        if len(tumor_samps_exist) <= min_samp:
            print(skip_msg.format(min_samp, project))
            continue

        if len(tumor_samps) != len(tumor_samps_exist):
            print("Not all {} normal samples have corresponding tumor samples: {}/{}".format(
                project, len(tumor_samps_exist), len(tumor_samps)))
            # Restrict everything to the patients that have both sample types.
            patients = [t.split("_")[0] for t in tumor_samps_exist]
            control_samps = ["{}{}".format(p, control_suffix) for p in patients]
            tumor_samps = tumor_samps_exist

        # read counts: control columns first, then tumor columns
        out_df = mir[control_samps].join(mir[tumor_samps])
        out_file = os.path.join(
            output_dir, "{} tumor and control miRNA-seq Read Counts.csv".format(project))
        out_df.to_csv(out_file)

        # output column data
        col_df = pd.DataFrame(index=control_samps + tumor_samps,
                              data={"Patient": patients * 2,
                                    "Type": ["Control"] * len(patients) + ["Tumor"] * len(patients)})
        col_file = os.path.join(
            output_dir, "{} tumor and control miRNA-seq Column Data.csv".format(project))
        col_df.to_csv(col_file)