In [None]:
import pandas as pd

# TCGA project codes whose DESeq2 tumor-vs-normal result tables are summarized.
project_list = ['UCEC', 'THCA', 'STAD', 'KICH', 'BRCA', 'LUAD', 'KIRC',
                'HNSC', 'LUSC', 'PRAD', 'BLCA', 'LIHC', 'KIRP', 'ESCA']

# Forward slashes work on every OS (including Windows) and avoid the invalid
# escape sequences ("\D", "\{", "\m") that backslash separators produce inside
# ordinary string literals.
deseq2_dir = "10_TCGA_PanCancer/DESeq2"

# Row label in the DESeq2 tables -> name used in the summary column headers.
mirna_labels = {"MIMAT0000757": "miR-151a-3p", "MIMAT0004697": "miR-151a-5p"}

# Collect one record per project, then build the frame once instead of
# enlarging it row by row.
summary_rows = {}
for proj in project_list:
    f_name = "{}/{}_TumorvsNormal_DESeq2_Results.csv".format(deseq2_dir, proj)
    df = pd.read_csv(f_name, index_col=0, header=0)
    row = {}
    for accession, label in mirna_labels.items():
        row[label + " log2FC"] = df.at[accession, "log2FoldChange"]
        row[label + " padj"] = df.at[accession, "padj"]
    summary_rows[proj] = row

df_res = pd.DataFrame.from_dict(
    summary_rows,
    orient="index",
    columns=["miR-151a-3p log2FC", "miR-151a-3p padj",
             "miR-151a-5p log2FC", "miR-151a-5p padj"],
)
df_res.to_csv("{}/miR-151a Results Summary.csv".format(deseq2_dir))

In [None]:
%matplotlib inline

import math
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def make_151_3p_plot(out_file, figsize=False):
    """Box-plot log2 normalized read counts of MIMAT0000757 (miR-151a-3p) per TCGA project.

    For every project in the hard-coded project list, the DESeq2 normalized
    counts CSV is read, the MIMAT0000757 row is extracted, and each sample is
    labelled "Normal" or "Tumor" from its sample ID. All projects are drawn in
    one seaborn box plot (hue = sample type), annotated with a bar and
    asterisks for selected projects, and saved to ``out_file``.

    Parameters
    ----------
    out_file : str or path-like
        Destination passed to ``savefig`` (saved at 400 dpi, tight bbox).
    figsize : tuple or False, optional
        ``(width, height)`` forwarded to ``plt.subplots``. Any falsy value
        (the default ``False``) uses matplotlib's default figure size.

    Returns
    -------
    None
    """
    project_list = ['UCEC', 'THCA', 'STAD', 'KICH', 'BRCA', 'LUAD', 'KIRC',
                    'HNSC', 'LUSC', 'PRAD', 'BLCA', 'LIHC', 'KIRP', 'ESCA']
    log_col = r"log$_2$(Read Counts)"

    # Gather one long-form frame per project and concatenate once.
    # (DataFrame.append was removed in pandas 2.0; a single concat is also
    # linear instead of quadratic and keeps numeric dtypes.)
    per_project = []
    for proj in project_list:
        norm_file = "10_TCGA_PanCancer/DESeq2/{}_TumorvsNormal_DESeq2_Normalized_Counts.csv".format(proj)
        df = pd.read_csv(norm_file, header=0, index_col=0)
        # Selecting a single row yields a Series indexed by the sample columns;
        # reset_index turns it into a two-column frame (sample id, count).
        df_3p = df.loc["MIMAT0000757"].reset_index()
        df_3p.columns = ["Sample ID", "Normalized Read Counts"]
        # Patient id is everything before the first underscore of the sample id.
        df_3p["Patient"] = df_3p["Sample ID"].apply(lambda x: x.split("_")[0])
        # Sample ids containing "_11" or "_10" are treated as normal tissue
        # (presumably TCGA sample-type codes -- TODO confirm against the ID scheme).
        df_3p["Type"] = df_3p["Sample ID"].apply(
            lambda x: "Normal" if "_11" in x or "_10" in x else "Tumor")
        df_3p["TCGA Project"] = proj
        per_project.append(df_3p)

    # ignore_index gives every sample a unique row label across projects.
    df_all = pd.concat(per_project, ignore_index=True)

    # Zero counts are plotted as 0 rather than -inf (note: this is the same
    # plotted value as a count of exactly 1).
    df_all[log_col] = df_all["Normalized Read Counts"].apply(
        lambda y: 0 if y == 0 else math.log2(y))

    sns.set(color_codes=True, style="ticks")

    # A falsy figsize falls back to matplotlib's default size.
    fig, ax = plt.subplots(1, 1, figsize=figsize if figsize else None)

    # Explicit `order` ties box positions to project_list, which the star
    # annotations below rely on when computing x coordinates.
    ax = sns.boxplot(x="TCGA Project", y=log_col, hue="Type", data=df_all,
                     order=project_list, ax=ax, fliersize=3)
    sns.despine(offset=10)
    ax.set_title("miR-151a-3p")
    ax.set_xticklabels(project_list, rotation=90)
    ax.legend(bbox_to_anchor=(1.05, 0.5), loc=2)

    color = "0.15"  # default seaborn "black" color
    star_displacement = 0.16
    # (project, number of asterisks) -- hard-coded annotations; presumably
    # significance levels taken from the DESeq2 results (TODO confirm thresholds).
    stars_list = [('THCA', 3), ('STAD', 3), ('KICH', 3), ('BRCA', 2),
                  ('KIRC', 3), ('HNSC', 1), ('LIHC', 3), ('KIRP', 3)]
    for proj, stars in stars_list:
        ind = project_list.index(proj)
        # Bar spans 0.6 x-units centred on the project's category position.
        x1 = ind - 0.3
        x2 = x1 + 0.6
        # Place the bar slightly above the highest value of this project.
        y = df_all.loc[df_all["TCGA Project"] == proj, log_col].max() + 0.2
        ax.plot([x1, x2], [y, y], lw=1.5, c=color)
        ax.text((x1 + x2) / 2.0, y - star_displacement, "*" * stars,
                ha="center", va="bottom", color=color, fontsize=14)

    fig.savefig(out_file, bbox_inches="tight", dpi=400)