In [30]:
%matplotlib inline

%env http_proxy=http://proxy-default:3128
%env https_proxy=http://proxy-default:3128

import re
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import warnings
from scipy import signal, stats
from statsmodels.nonparametric.smoothers_lowess import lowess
#from sklearn.decomposition import PCA

import geneinfo as gi
gi.email('kaspermunch@birc.au.dk')

# Make inline plots vector graphics instead of raster graphics
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('retina', 'png')
#set_matplotlib_formats('pdf', 'svg')

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib.patches import Rectangle
from matplotlib.lines import Line2D

#matplotlib.rcParams['figure.figsize'] = (20.0, 10.0)

import seaborn as sns
sns.set() # sets seaborn default "prettyness:
sns.set_style("ticks")

# scale down size of default plots
sns.set_context("paper")
import matplotlib as mpl
scale = 0.8
d = dict([(k, v*scale) for (k, v) in sns.plotting_context('paper').items()])
d['figure.figsize'] = [5.4, 3.5]
mpl.rcParams.update(d)


def abline(slope, intercept, ax=None):
    "Add a straight line through the plot"
    if ax is None:
        ax = plt.gca()
    x_vals = np.array(ax.get_xlim())
    y_vals = intercept + slope * x_vals
    ax.plot(x_vals, y_vals, '--', color='grey')
    
def add_lowess(x, y, ax=None, color=None, is_sorted=True, frac=0.005, it=0, **kwargs):
    "Add a lowess curve to the plot"
    if ax is None:
        ax = plt.gca() 
    filtered = lowess(y, x, is_sorted=is_sorted, frac=frac, it=it, **kwargs)
    ax.plot(filtered[:,0], filtered[:,1])

def add_band(x_low, x_high, y_low=None, y_high=None, ax=None, color='gray', linewidth=0, alpha=0.5, zorder=0, **kwargs):
    "Plot a gray block on x interval"
    if ax is None:
        ax = plt.gca()
    if y_low is None:
        y_low, _ = ax.get_ylim()
    if y_high is None:
        _, y_high = ax.get_ylim()
    g = ax.add_patch(Rectangle((x_low, y_low), x_high-x_low, y_high-y_low, 
                 facecolor=color,
                 linewidth=linewidth,
                 alpha=alpha,
                 zorder=zorder,
                 **kwargs))

# def stairs(df, start='start', end='end', pos='pos', endtrim=0):
#     "Turn a df with start, end into one with pos to plot as stairs"
#     df1 = df.copy(deep=True)
#     df2 = df.copy(deep=True)
#     df1[pos] = df1[start]
#     df2[pos] = df2[end] - endtrim
#     return pd.concat([df1, df2]).sort_values([start, end])

def stairs(df, start='start', end='end', pos='pos', endtrim=0):
    "Turn a df with start, end into one with pos to plot as stairs"
    df1 = df.copy(deep=True)
    df2 = df.copy(deep=True)
    df1[pos] = df1[start]
    df2[pos] = df2[end] - endtrim
    df3 = pd.concat([df1, df2]).sort_values([start, end])

    df3 = df3.reset_index()
    nans = ~((df3.index.values == 0) | (df3.index.values % 2) | (df3.shift().pos == df3.pos))
    df_lst = []
    for i, gr in df3.groupby(nans.cumsum()):#.groups.items():
        df_lst.append(gr)
        df_lst.append(pd.DataFrame(dict(chrom=[np.nan], start=[np.nan], end=[np.nan], dummy=[np.nan], pos=[np.nan])))
    return pd.concat(df_lst)

# My own paired palette replacing the last brown pair with violets
sns.color_palette('Paired').as_hex()
Paired = sns.color_palette(['#a6cee3', '#1f78b4', '#b2df8a', '#33a02c', '#fb9a99', '#e31a1c',
                            '#fdbf6f', '#ff7f00', '#cab2d6','#6a3d9a', '#e585cf', '#ad009d'])
#sns.palplot(Paired)
Infographics = sns.color_palette(['#e8615d', '#f49436', '#2d9de5', '#3bbdbd', '#634792'])
#sns.palplot(Infographics)

class left:
    def __rlshift__(self, df):
        "Left align columns of data frame: df << left()"
        left_aligned_df = df.style.set_properties(**{'text-align': 'left'})
        left_aligned_df = left_aligned_df.set_table_styles(
        [dict(selector = 'th', props=[('text-align', 'left')])])
        display(left_aligned_df)

env: http_proxy=http://proxy-default:3128
env: https_proxy=http://proxy-default:3128


In [16]:
df_list = []
for summary_file_name in glob.glob('../steps/summary/*.txt'):
    df_list.append(pd.read_csv(summary_file_name, names=['name', 'alt_model', 'p_val', 'omega']))
codeml_summary = pd.concat(df_list).reset_index(drop=True)
codeml_summary.head()

Unnamed: 0,name,alt_model,p_val,omega
0,PCSK1N,2,1.0,
1,PCSK1N,8,0.993523,
2,GPR82,2,0.783045,
3,GPR82,8,0.668773,
4,SYAP1,2,0.014439,3.98689


In [40]:
with open('../results/discarded_genes.txt') as f:
    for l in f:
        if l.startswith("SKIPPED"):
            _, chrom, gene = l.split()
            if chrom == 'chrX':
                print(gene)
                
# USP9X
# RLIM
# MSN
# CNKSR2
# ZCCHC13
# NUDT11
# AFF2

ARSD
FHL1
ZNF275
SLC25A43
LUZP4
PIN4
USP9X
TIMM8A
MAP7D3
RLIM
MSN
KRBOX4
NONO
GCNA
CNKSR2
MAGEC1
PABIR3
PAGE5
GAB3
CYBB
CLDN2
MAGEC3
PASD1
ZMAT1
MECP2
VCX3A
REPS2
HSFX1
RBMXL3
BEND2
MAP3K15
FANCB
VCX
FAM9A
SRPK3
DDX53
CXCR3
ZCCHC13
MAGEB5
TRMT2B
GAGE2A
NUDT11
SPANXD
ARMCX4
FLNA
ZNF81
RPS4X
FAM47C
SPANXC
MAGEE1
SPANXN1
SPANXA2
NAP1L6P
SPANXN5
VCX3B
RTL8B
HAUS7
CXorf49B
GAGE10
GAGE12E
ZNF630
CT47A10
GAGE12H
DCAF8L1
CT47A9
CT47A12
SPANXB1
CT47A7
CT47A4
CT47A3
GAGE12B
GAGE12C
PAGE2B
CT47A2
CCNQ
CT45A2
CT45A7
F8A2
GAGE2E
F8A3
CT45A8
CT45A6
ENSG00000278646
ENSG00000283737
AFF2
CD99
PPP2R3B
DHRSX
ASMTL
IL3RA
CSF2RA
CRLF2
ZBED1


In [34]:
signif_codeml = (codeml_summary
                 .loc[codeml_summary.p_val < 0.025 / 1]
                 .reset_index(drop=True)
                 .sort_values(['name', 'p_val'])
                 .drop_duplicates(subset=['name'])
                 .sort_values('p_val')
                )
signif_codeml << left()

Unnamed: 0,name,alt_model,p_val,omega
29,MAGEB6,8,0.0,5.60352
67,SSX3,8,0.0,7.67227
110,SCML1,8,0.0,3.75482
181,FAM9B,2,0.0,8.24793
52,TFDP3,8,0.0,4.34811
92,MAGEA2,8,0.0,3.45651
33,ESX1,8,0.0,4.4122
120,ZNF280C,8,0.0,3.07318
19,EZHIP,8,0.0,3.35951
150,SSX7,8,0.0,4.47375
