Skip to content

Commit

Permalink
allele counter fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
jeffhsu3 committed Apr 18, 2015
1 parent a08bc47 commit 125234e
Show file tree
Hide file tree
Showing 6 changed files with 242 additions and 347 deletions.
278 changes: 191 additions & 87 deletions genda/AEI/AEI.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@
from scipy.stats import ttest_ind
from textwrap import wrap

from collections import defaultdict

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

from genda import calculate_minor_allele_frequency, calculate_ld
from genda.plotting import (should_not_plot, add_gene_bounderies,
add_snp_arrow)
add_snp_arrow, plot_transcript)
from genda.eQTL import plot_eQTL


Expand Down Expand Up @@ -43,7 +45,8 @@ def single_snp_aei_test2(geno, allelic_ratio, num_threshold=2):
# Het_combined must have a higher ratio than homo_combined
if len(het_combined) < num_threshold or len(homo_combined) < num_threshold:
return(1)
#elif het_combined.mean() < homo_combined.mean():return(1)
elif (het_combined.mean() > homo_combined.mean()):
return(1)
else:
return(ttest_ind(het_combined, homo_combined, equal_var=False)[1])

Expand All @@ -58,7 +61,8 @@ def dosage_round(geno, threshold = 0.5):


class AEI(object):
def __init__(self, aei_dataframe, geno, snp_annot, gene_name):
def __init__(self, aei_dataframe, geno, snp_annot, gene_name,
maf_threshold = 0.05):
"""
Arguments
---------
Expand All @@ -70,27 +74,48 @@ def __init__(self, aei_dataframe, geno, snp_annot, gene_name):
snp_annot - dataframe of SNP annotations
"""
self.aei = aei_dataframe
self.geno = geno
# :TODO calculate MAF of the snps in self.geno
# :TODO assert self.geno and self.snp_annot are same shape
self.snp_annot = snp_annot
try:
self.sample_ids = [i[0] for i in self.aei.columns][::4]
except AttributeError:
self.sample_ids = [i[0] for i in self.aei.index][::4]


ids = (pd.Index(self.sample_ids)
.intersection(self.geno.columns))
self.ids = ids
.intersection(geno.columns))
idx = pd.IndexSlice
try:
self.aei.sort_index(axis=1, inplace=True)
self.aei = self.aei.loc[:, idx[ids, :]]
self.aei.sort_index(axis=1, inplace=True)
self.geno = geno.ix[:,
self.aei.columns.get_level_values(0)[::4]]
except TypeError:
# Case where aei_dataframe is just a series
self.aei = self.aei.sort_index()
self.aei = self.aei.loc[idx[ids, :]]
self.aei = self.aei.sort_index()
self.geno = geno.ix[:,
self.aei.index.get_level_values(0)[::4]]
self.ids = self.geno.columns
self.maf = calculate_minor_allele_frequency(self.geno.ix[:, ids])
# Restrict to > 5%
self.geno = self.geno.ix[(self.maf >= maf_threshold) &\
(self.maf <= 1 - maf_threshold), :]
self.snp_annot = snp_annot.ix[self.geno.index, :]
self.maf = self.maf[self.geno.index]
self.gene_name = gene_name
self.aei = self.aei.ix[self.geno.index.intersection(self.aei.index),:]
# SNPs of interest
#self.ld = calculate_ld(self.geno.ix[:,0], snps_interest)
# :TODO Need to merge aei ids into here


def calc_aei(self, num_threshold=5, count_threshold = 30,
single_snp=None):
single_snp=None, outlier_thresh=-4.32):
"""
:TODO split this up into two functions 1 for series and one for
dataframe
Arguments
---------
eQTL - eQTL pvalues
Expand All @@ -102,108 +127,99 @@ def calc_aei(self, num_threshold=5, count_threshold = 30,
"""
base_pair_to_index = {'A':0, 'C': 1, 'G': 2, 'T': 3}
ids = self.ids
# Focusing only on single-nucleotide polymorphisms
# Focusing only on single-nucleotide polymorphisms as indicator
if type(self.aei) == pd.DataFrame:
not_indel = [i for i in self.aei.index if\
len(self.snp_annot.ix[i, 'a1']) == 1]
(len(self.snp_annot.ix[i, 'a1']) == 1)\
& len(self.snp_annot.ix[i, 'a0']) == 1]
else:
not_indel = [self.aei.name]
hets = dosage_round(self.geno.ix[not_indel, :])[ids]
pvalues_out = np.ones((self.geno.shape[0], len(not_indel)),
dtype=np.double)
geno_t = dosage_round(self.geno)
hets = geno_t.ix[not_indel, :]
m_size = (len(ids), len(not_indel))
aei_ratios = pd.DataFrame(
np.zeros(m_size, dtype=np.uint32),
index=pd.Index(ids),
index=self.geno.columns,
columns=pd.Index(not_indel))
overall_counts = pd.DataFrame(
np.zeros((len(not_indel), 2), dtype=np.uint32),
index=pd.Index(not_indel))
# 1 = False. True for outliers_m means not outliers
outliers_m = pd.DataFrame(np.ones(m_size,
dtype=bool),
index=pd.Index(ids),
index=self.geno.columns,
columns=pd.Index(not_indel))
# sufficient hets is really het counts
sufficient_hets = pd.Series(
data=np.repeat(0, len(not_indel)),
index=pd.Index(not_indel),
dtype=np.int32)
hets_dict = {}

#REF = [base_pair_to_index[i] for i in self.snp_annot.ix[not_indel, 'a0']]
#ALT = [base_pair_to_index[i] for i in self.snp_annot.ix[not_indel, 'a1']]
def _run_df(hetr):
#hi = hetr[(hetr == 1.0) & np.logical_not(pd.isnull(hetr))]
return((hetr == 1.0) & np.logical_not(pd.isnull(hetr)))

#:TODO deal with aei being class series

hets = hets.apply(_run_df, axis=1)
#print(hets)
sufficient_hets = hets.sum(axis=1)
hets = hets.ix[sufficient_hets >= num_threshold, :]
aei_t = self.aei.ix[hets.index,:]
#self.aei.apply(calc_pvalues, axis=1, args=(hets))
pvalues_out = np.ones((self.geno.shape[0], aei_t.shape[0]),
dtype=np.double)
pvalues_out = pd.DataFrame(pvalues_out, index=self.geno.index,
columns=pd.Index(not_indel))
# :TODO make this into a function
for i, j in enumerate(not_indel):
columns=aei_t.index)
aei_ratios = pd.DataFrame(
np.zeros((len(ids), len(aei_t.index)), dtype=np.uint32),
index=self.geno.columns,
columns=aei_t.index)

c = 0

for i, row in hets.iterrows():
try:
REF = base_pair_to_index[self.snp_annot.ix[j, 'a0']]
ALT = base_pair_to_index[self.snp_annot.ix[j, 'a1']]
REF = base_pair_to_index[self.snp_annot.ix[i, 'a0']]
ALT = base_pair_to_index[self.snp_annot.ix[i, 'a1']]
except KeyError:
continue
# hi = hets for the current snp
hi = hets.ix[i,:]
hi = hi[np.logical_and(hi == 1.0,
np.logical_not(pd.isnull(hi)))]
sufficient_hets[j] = len(hi)
# Need to filter based on individuals counts
if len(hi) >= num_threshold:
refi = [(k, REF) for k in hi.index]
alti = [(k, ALT) for k in hi.index]
try:
ref = np.asarray(self.aei.ix[j, refi].values,
dtype=np.float64)
alt = np.asarray(self.aei.ix[j, alti].values,
dtype=np.float64)
except pd.core.indexing.IndexingError:
# For cases where self.aei is a single row or series
ref = np.asarray(self.aei[refi].values, dtype=np.float64)
alt = np.asarray(self.aei[alti].values, dtype=np.float64)
# Filter on counts
s = ref + alt
s = s >= count_threshold
overall_counts.ix[i, :] = [np.nansum(ref), np.nansum(alt)]
allelic_ratio = alt/ref
aei_ratios.ix[hi.index, i] = pd.Series(allelic_ratio,
index=hi.index)
allelic_ratio = pd.Series(allelic_ratio,
index=hi.index)

# Flip the ratio
allelic_ratio[allelic_ratio > 1] =\
1/allelic_ratio[allelic_ratio > 1]
# Need to do something better for outliers
outliers = (np.log2(allelic_ratio) < -3.5) &\
np.logical_not(np.isinf(allelic_ratio))
outliers = np.logical_and(outliers, s)
outliers_m.ix[hi.index, i] = outliers
allelic_ratio = allelic_ratio[np.logical_not(outliers)]
if not single_snp:
geno_t = self.geno.ix[:, allelic_ratio.index]
geno_t = dosage_round(geno_t)
pvalues_out.iloc[:,i] = (geno_t.
apply(single_snp_aei_test2, axis=1,
args=(allelic_ratio,)))
else:
geno_t = self.geno.ix[single_snp, hi.index]
geno_t = dosage_round(geno_t[np.logical_not(outliers)])
pvalues_out.ix[single_snp, i] =\
single_snp_aei_test2(geno_t, allelic_ratio)
hets_dict[j] = hi.index
else:
outliers = np.repeat(False, len(hi))
pvalues_out.iloc[:, i] = np.repeat(np.nan, self.geno.shape[0])
# hi = hets index
hi = list(row[row].index)
idx = pd.IndexSlice
ref = np.asarray(aei_t.loc[i, idx[hi, REF]].values,
dtype=np.float64)
alt = np.asarray(aei_t.loc[i, idx[hi, ALT]].values,
dtype=np.float64)
s = ref + alt
s = s >= count_threshold
allelic_ratio = alt/ref
allelic_ratio = pd.Series(allelic_ratio,
index=hi)
aei_ratios.ix[hi, c] = allelic_ratio
allelic_ratio[allelic_ratio > 1] =\
1/allelic_ratio[allelic_ratio > 1]

outliers = (np.log2(allelic_ratio) < outlier_thresh) |\
np.isinf(np.log2(allelic_ratio))
outliers = np.logical_or(outliers, np.logical_not(s))
outliers_m.ix[hi, i] = outliers
allelic_ratio = allelic_ratio[np.logical_not(outliers)]
geno_i = geno_t.ix[:, allelic_ratio.index]
pvalues_out.iloc[:, c] = (geno_i.
apply(single_snp_aei_test2, axis=1,
args=(allelic_ratio,)))
hets_dict[i] = hi
c += 1
self.pvalues = pvalues_out
self.hets_dict = hets_dict
self.sufficient_hets = sufficient_hets
overall_counts['Nhets'] = sufficient_hets
self.overall_counts = overall_counts
# This never actually gets used, but there is currently an error if you
# of the local variable being unbound
self.outliers = outliers_m
self.ratios = aei_ratios
# Outliers need to be fixed only set on the most recent one atm
self.outliers = outliers_m



def aei_barplot(self, csnp, tsnp, ax=None, gene_name=None, title=True):
""" AEI barplot
Expand Down Expand Up @@ -241,10 +257,7 @@ def aei_barplot(self, csnp, tsnp, ax=None, gene_name=None, title=True):
allelic_ratio = self.ratios.ix[self.hets_dict[tsnp], tsnp]
allelic_ratio_i = np.argsort(allelic_ratio.values)
allelic_ratio = np.log2(allelic_ratio.iloc[allelic_ratio_i])
outliers = np.logical_not(np.logical_or(
allelic_ratio < -3.0 ,
allelic_ratio > 3.0
))
outliers = np.logical_not(self.outliers.ix[self.hets_dict[tsnp], tsnp])
color_geno = []
color = color[allelic_ratio_i][outliers]
for i in color:
Expand Down Expand Up @@ -272,8 +285,99 @@ def aei_within_individual(self, ax=None):
self.aei


def barplot_across_genes(self, samples, other_samples,
ax=None, bar_width = 0.35):
if ax:
pass
else:
fig, ax = plt.subplots(nrows=2 , ncols=1,
sharey=False, sharex=True,
subplot_kw=dict(axisbg='#FFFFFF'))
np.zeros((2, self.ratios.shape[1]), dtype=np.float64)
ar_samp = defaultdict(list)
ar_osamp = defaultdict(list)
for i in self.ratios.columns:
ar_samp[i] = []
ar_osamp[i] = []
for i in samples:
ar = []
for j in self.ratios.columns:
if i in self.hets_dict[j]:
c_ar = self.ratios.ix[i, j]
if c_ar <= 1:
c_ar = 1/c_ar
else: pass
if self.outliers.ix[i, j]:
pass
else:
ar_samp[j].append(c_ar)
else: pass
for i in other_samples:
for j in self.ratios.columns:
if i in self.hets_dict[j]:
c_ar = self.ratios.ix[i, j]
if c_ar <= 1:
c_ar = 1/c_ar
else: pass
if self.outliers.ix[i, j]:
pass
else:
ar_osamp[j].append(c_ar)
else: pass
smean = []
sstd = []
for snps, ratios in ar_samp.iteritems():
mdat = np.ma.masked_array(ratios, np.isnan(ratios))
smean.append(np.mean(mdat))
sstd.append(np.std(mdat))
osm = []
sosm = []
for snps, ratios in ar_osamp.iteritems():
mdat = np.ma.masked_array(ratios,np.isnan(ratios))
osm.append(np.mean(mdat))
sosm.append(np.std(mdat))
ind = np.arange(len(ar_samp.keys()))
rects1 = ax.bar(ind, smean, bar_width, color='r', yerr=sstd)
rects2 = ax.bar(ind + bar_width, osm, bar_width, color='y', yerr=sosm)
if ax:
return(ax)
else: return(fig)


def plot_aei_within_individual(self, samples,
other_samples=None, ax=None, exon_scale=1,
intron_scale=20):
if ax:
pass
else:
fig, ax = plt.subplots(nrows=1 , ncols=1,
sharey=False, sharex=True,
subplot_kw=dict(axisbg='#FFFFFF'))
counter = 0
for i in samples:
pos = []
ar = []
for j in self.ratios.columns:
if i in self.hets_dict[j]:
c_ar = self.ratios.ix[i, j]
if c_ar <= 1:
c_ar = 1/c_ar
else: pass
if self.outliers.ix[i, j]:
pass
else:
pos.append(self.snp_annot.ix[j, 'pos'])
ar.append(c_ar)
else: pass
scatter = ax.plot(pos, ar, '-o')
counter += 1
if ax:
return(ax)
else: return(fig)



def aei_plot_single(self, tsnp, ax=None, focus_snp=None):
def aei_plot_single(self, cissnp, ax=None, focus_snp=None):
"""
Arguments
---------
Expand All @@ -289,7 +393,7 @@ def aei_plot_single(self, tsnp, ax=None, focus_snp=None):
fig, ax = plt.subplots(nrows=1 , ncols=1,
sharey=False, sharex=True,
subplot_kw=dict(axisbg='#FFFFFF'))
adj_pvalue = -1*np.log10(self.pvalues.loc[:, tsnp])
adj_pvalue = -1*np.log10(self.pvalues.loc[:, cissnp])
if focus_snp:
print('focus snp')
snp = focus_snp
Expand Down
4 changes: 2 additions & 2 deletions genda/plotting/plotting_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,8 @@ def plot_transcript(transcript_id, paths, ax, y=0, height=2.,
for patch in ps:
ax.add_patch(patch)
# hlines doesn't in this function but outside it?
draw_arrows(xmin, xmax, y+height/2.0, ax)
ax.hlines(y+height/2.0, xmin, xmax, colors='darkgray', lw=2)
#draw_arrows(xmin, xmax, y+height/2.0, ax)
#ax.hlines(y+height/2.0, xmin, xmax, colors='darkgray', lw=2)
return(ax)


Expand Down

0 comments on commit 125234e

Please sign in to comment.