# TCGA Project File Summary

The function below creates a summary table that lists, for each TCGA project, the number of files of each type. The sample type codes used to group the samples are described <a href="https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/sample-type-codes">here</a>.

In [None]:
import os
import pandas as pd
from collections import Counter

# Projects summarized when the caller does not pass an explicit list.
_DEFAULT_PROJECTS = (
    'CESC', 'UCEC', 'OV', 'THCA', 'STAD', 'CHOL', 'UCS', 'DLBC', 'UVM', 'MESO',
    'ACC', 'KICH', 'THYM', 'TGCT', 'READ', 'BRCA', 'GBM', 'LUAD', 'KIRC', 'HNSC',
    'LGG', 'LUSC', 'PRAD', 'SKCM', 'COAD', 'BLCA', 'LIHC', 'KIRP', 'SARC',
    'LAML', 'ESCA', 'PAAD', 'PCPG',
)

# (column suffix, sample type codes whose counts are summed into that column).
# The code meanings are listed in the GDC "sample type codes" table.
_SAMPLE_TYPE_GROUPS = (
    ("Primary Tumor", ("01", "03", "05")),
    ("Recurrent Tumor", ("02", "04")),
    ("Normal Samples", ("11", "10")),
    ("Metastasis Samples", ("06", "07")),
)


def _summary_columns():
    """Return the summary column names in output order."""
    columns = ["Clinical Data"]
    for prefix in ("mRNA", "miRNA"):
        columns.append("{} Samples".format(prefix))
        columns.extend("{} {}".format(prefix, label) for label, _ in _SAMPLE_TYPE_GROUPS)
    return columns


def _count_sample_types(csv_path, prefix):
    """Return the sample counts of one expression table, keyed by summary column."""
    table = pd.read_csv(csv_path, header=0, index_col=0)
    # The sample type code is taken from the second "_"-separated field of
    # each column name; a column name without "_" raises IndexError.
    type_counts = Counter(col.split("_")[1] for col in table.columns)

    counts = {"{} Samples".format(prefix): table.shape[1]}
    for label, codes in _SAMPLE_TYPE_GROUPS:
        # Counter returns 0 for codes that never occur, so absent codes are fine.
        counts["{} {}".format(prefix, label)] = sum(type_counts[code] for code in codes)
    return counts


def make_summary(clin_dir, mrna_dir, mirna_dir, projects=None,
                 out_file="Project Summary.csv"):
    """
    Build a per-project summary of clinical, mRNA-seq and miRNA-seq data.

    For every project three CSV files are read:
        "<clin_dir>/<project> Clinical Data Summary.csv"
        "<mrna_dir>/<project> mRNA-seq Reads.csv"
        "<mirna_dir>/<project> miRNA-seq RPM Reads.csv"
    The clinical count is the number of data rows of the clinical file. The
    sample counts are the number of columns (after the index column) of the
    expression files, broken down by the sample type code found in each
    column name.

    Parameters:
        clin_dir: directory with clinical summary files
        mrna_dir: directory with mRNA-seq summary files
        mirna_dir: directory with miRNA-seq summary files
        projects: iterable of project names to summarize; defaults to the
            built-in list of 33 project codes
        out_file: path of the CSV file the summary is written to

    Returns:
        The summary as a pandas DataFrame, one row per project (the same
        table that is written to `out_file`).

    Raises:
        FileNotFoundError: if one of the expected CSV files is missing.
        IndexError: if an expression column name contains no "_".
    """
    project_names = list(_DEFAULT_PROJECTS if projects is None else projects)

    rows = []
    for pro in project_names:
        clin_path = os.path.join(clin_dir, "{} Clinical Data Summary.csv".format(pro))
        df_clin = pd.read_csv(clin_path, header=0, index_col=0)

        row = {"Clinical Data": df_clin.shape[0]}
        row.update(_count_sample_types(
            os.path.join(mrna_dir, "{} mRNA-seq Reads.csv".format(pro)), "mRNA"))
        row.update(_count_sample_types(
            os.path.join(mirna_dir, "{} miRNA-seq RPM Reads.csv".format(pro)), "miRNA"))
        rows.append(row)

    # Build the frame once instead of enlarging it row by row.
    summary = pd.DataFrame(rows, index=project_names, columns=_summary_columns())
    summary.to_csv(out_file)
    return summary