In [126]:
import numpy as np
import matplotlib.pyplot as plt
import json
from collections import Counter


import sys
sys.path.append('../')
sys.path.append('../phase')
from phase.input_output import PhaseData
from phase.qc import OutlierDetector

In [127]:
# Dataset directory passed to PhaseData; the alternative dataset is kept
# commented out for quick switching.
data_dir = '../../DATA/ssc.hg38'
#data_dir = '../../DATA/platinum_sim'

# Phase name passed to PhaseData alongside data_dir. None is the active
# choice; other previously used values are kept commented out.
phase_name = None
#phase_name = 'X'
#phase_name = 'noDEL'

# Hex RGBA palette; the plotting cells pick entries from it by index.
colors = [
    '#4db6acff',
    '#ef6c00ff',
    '#b3a77dff',
    '#695d46ff',
    '#ce93d8ff',
    '#009668ff',
]

In [128]:
# Open the phasing results for the configured data_dir / phase_name, then
# load the sibpair records and the crossover records, printing how many of
# each were returned.
phase_data = PhaseData(data_dir, phase_name)
sibpairs = phase_data.get_sibpairs()
print('sibpairs', len(sibpairs))

crossovers = phase_data.get_crossovers()
print('crossovers', len(crossovers))

sibpairs 1932
crossovers 263558


In [129]:
# Per-crossover arrays, each in the same order as `crossovers`.

# Width of every crossover interval: end_pos minus start_pos.
crossover_lengths = np.array([event['end_pos'] - event['start_pos'] for event in crossovers])

# The 'is_mat' and 'is_hts' fields of every crossover; later cells combine
# them as masks over crossover_lengths.
is_mat = np.array([event['is_mat'] for event in crossovers])
is_hts = np.array([event['is_hts'] for event in crossovers])

In [133]:
# Every array below is restricted to sibpairs flagged 'is_fully_phased',
# so select that subset once and read each field from it.
fully_phased = [pair for pair in sibpairs if pair['is_fully_phased']]

# NOTE(review): the output stored with this cell records
# KeyError: 'maternal_crossovers' (possibly from an earlier revision of the
# cell). Confirm that every fully phased sibpair record carries the
# crossover-count keys before trusting the cells that follow.
mat_crossovers = np.array([pair['maternal_crossovers'] for pair in fully_phased])
pat_crossovers = np.array([pair['paternal_crossovers'] for pair in fully_phased])

# Boolean masks aligned with mat_crossovers / pat_crossovers.
is_ibd_outlier = np.array([pair['is_ibd_outlier'] for pair in fully_phased], dtype=bool)
is_outlier = np.array([pair['is_crossover_outlier'] for pair in fully_phased], dtype=bool)

print('outliers', np.sum(is_outlier))

# Plot limits: the range of counts among non-IBD-outlier sibpairs, padded
# by 5 on each side (x for maternal counts, y for paternal counts).
mat_without_ibd_outliers = mat_crossovers[~is_ibd_outlier]
pat_without_ibd_outliers = pat_crossovers[~is_ibd_outlier]
x_min, x_max = np.min(mat_without_ibd_outliers) - 5, np.max(mat_without_ibd_outliers) + 5
y_min, y_max = np.min(pat_without_ibd_outliers) - 5, np.max(pat_without_ibd_outliers) + 5


['SSC03070', 'SSC02497', 'SSC02729', 'SSC00003', 'SSC00936', 'SSC00798', 'SSC02852', 'SSC00827', 'SSC02333', 'SSC00180', 'SSC02646', 'SSC00901', 'SSC00428', 'SSC01958', 'SSC01121', 'SSC00317', 'SSC00542', 'SSC02078', 'SSC02142', 'SSC00081', 'SSC01957', 'SSC00543', 'SSC00056', 'SSC00975', 'SSC09982', 'SSC00904', 'SSC00342', 'SSC02077', 'SSC00699', 'SSC00678', 'SSC02165', 'SSC02361', 'SSC02876', 'SSC01951', 'SSC02643', 'SSC02037', 'SSC00227', 'SSC00160', 'SSC00727', 'SSC02484', 'SSC03135', 'SSC02013', 'SSC00029', 'SSC00035', 'SSC02839', 'SSC02293', 'SSC02564', 'SSC00009', 'SSC00902', 'SSC02062', 'SSC02644', 'SSC02629', 'SSC02645', 'SSC02884', 'SSC02725', 'SSC00875', 'SSC00828', 'SSC01045', 'SSC02197', 'SSC03712', 'SSC01000', 'SSC02554', 'SSC01100', 'SSC00803', 'SSC00251', 'SSC04432', 'SSC00344', 'SSC00800', 'SSC03133', 'SSC02015', 'SSC02140', 'SSC00426', 'SSC03440', 'SSC00769', 'SSC00468', 'SS0013029', 'SSC00129', 'SSC01276', 'SSC02676', 'SSC02198', 'SSC02681', 'SSC03274', 'SSC02384', 'S

KeyError: 'maternal_crossovers'

In [None]:
# Histogram of crossover interval widths on a log10 scale: all crossovers
# first, then the is_hts crossovers split by the is_mat flag.
log_bins = np.arange(0, 8, 0.1)
hts_and_mat = is_hts & is_mat
hts_and_not_mat = is_hts & ~is_mat

plt.hist(np.log10(crossover_lengths), bins=log_bins, alpha=0.5, color=colors[4])
plt.hist(np.log10(crossover_lengths[hts_and_mat]), bins=log_bins, alpha=0.5)
plt.hist(np.log10(crossover_lengths[hts_and_not_mat]), bins=log_bins, alpha=0.5)

# The data are log10(bp), so label each integer tick as a power of ten.
decades = np.arange(0, 9)
decade_labels = ['1' if i==0 else '$10^{%d}$' % i for i in decades]

plt.xlabel('Crossover resolution in bp', fontsize=18)
plt.xticks(decades, decade_labels, fontsize=15)
plt.yticks([])
plt.show()

In [None]:
# Two-panel histogram of crossover counts per fully phased sibpair.
plt.figure(figsize=(15, 5))

# Left panel: every sibpair on a log-scaled y axis, with the sibpairs
# flagged as crossover outliers drawn on top in red.
all_bins = np.arange(max(x_max, y_max))
plt.subplot(1, 2, 1)
plt.hist(mat_crossovers, bins=all_bins, alpha=0.5, label='maternal', log=True)
plt.hist(pat_crossovers, bins=all_bins, alpha=0.5, label='paternal', log=True)
plt.hist(mat_crossovers[is_outlier], bins=all_bins, color='red', log=True)
plt.hist(pat_crossovers[is_outlier], bins=all_bins, color='red', log=True)
plt.legend()

# Right panel: non-outlier sibpairs only, linear y axis.
inlier_mat = mat_crossovers[~is_outlier]
inlier_pat = pat_crossovers[~is_outlier]

# Span the bins over BOTH series. Taking the lower edge from the paternal
# minimum and the upper edge from the maternal maximum silently drops any
# maternal count below the former or paternal count above the latter.
lowest_count = min(np.min(inlier_mat), np.min(inlier_pat))
highest_count = max(np.max(inlier_mat), np.max(inlier_pat))
# Stop at highest_count + 2 so the final edge is highest_count + 1:
# np.arange excludes its stop and the last histogram bin is closed on the
# right, so a final edge at highest_count would merge the two largest
# counts into a single bar.
inlier_bins = np.arange(lowest_count - 1, highest_count + 2)

plt.subplot(1, 2, 2)
plt.hist(inlier_mat, bins=inlier_bins, alpha=0.5, label='maternal')
plt.hist(inlier_pat, bins=inlier_bins, alpha=0.5, label='paternal')
plt.legend()
plt.show()

In [None]:
# Histogram of maternal and paternal crossover counts for non-outlier
# sibpairs, written to ../plots/crossover_count.png.
inlier_mat = mat_crossovers[~is_outlier]
inlier_pat = pat_crossovers[~is_outlier]

# Span the bins over BOTH series. Taking the lower edge from the paternal
# minimum and the upper edge from the maternal maximum silently drops any
# maternal count below the former or paternal count above the latter.
lowest_count = min(np.min(inlier_mat), np.min(inlier_pat))
highest_count = max(np.max(inlier_mat), np.max(inlier_pat))
# Stop at highest_count + 2 so the final edge is highest_count + 1:
# np.arange excludes its stop and the last histogram bin is closed on the
# right, so a final edge at highest_count would merge the two largest
# counts into a single bar.
inlier_bins = np.arange(lowest_count - 1, highest_count + 2, 1)

plt.hist(inlier_mat, bins=inlier_bins, alpha=0.5, label='maternal', color=colors[0])
plt.hist(inlier_pat, bins=inlier_bins, alpha=0.5, label='paternal', color=colors[1])
plt.legend(fontsize=15)
plt.xlabel('Crossovers per family', fontsize=18)
plt.xticks(fontsize=15)
plt.ylabel('Families', fontsize=18)
plt.yticks(fontsize=15)

if phase_name is None:
    # NOTE(review): 84 and 56 look like reference crossover counts for the
    # default phase run; their source is not visible here — confirm and
    # move them to named constants in the config cell.
    plt.axvline(84, color='black', linestyle='--')
    plt.axvline(56, color='black', linestyle='--')

plt.savefig('../plots/crossover_count.png', bbox_inches="tight")


In [None]:
# Median, then mean, of the non-outlier crossover counts; within each
# statistic the maternal value is printed before the paternal one.
for summarize in (np.median, np.mean):
    print(summarize(mat_crossovers[~is_outlier]))
    print(summarize(pat_crossovers[~is_outlier]))

In [None]:
# Diagnostic figure for the crossover OutlierDetector: score histogram,
# score contours over all sibpairs, and score contours zoomed to the
# non-outlier sibpairs.

# Flag sibpairs whose maternal or paternal count exceeds three times the
# corresponding median; these are excluded from the detector's input below.
is_way_out = (mat_crossovers > 3*np.median(mat_crossovers)) | (pat_crossovers > 3*np.median(pat_crossovers))
# Build the detector from sibpairs that are neither IBD outliers nor way out.
# NOTE(review): the meaning of the third argument (10) is defined by
# OutlierDetector, which is not visible here — confirm.
detector = OutlierDetector(mat_crossovers[~is_ibd_outlier & ~is_way_out], 
                           pat_crossovers[~is_ibd_outlier & ~is_way_out], 10)
# Score every fully phased sibpair, including those excluded above.
scores = detector.score_samples(mat_crossovers, pat_crossovers)
outlier_cutoff, m, b = detector.outlier_cutoff, detector.m, detector.b
bins = detector.bins

plt.figure(figsize=(15, 5))
# Panel 1: log-scaled histogram of scores clipped to
# [min(bins[0], outlier_cutoff), bins[-1]], flagged outliers in red, plus
# the curve exp(m*x + b) between the cutoff and the last bin edge and a
# vertical line at the cutoff.
plt.subplot(1, 3, 1)
plt.hist(np.clip(scores, min(bins[0], outlier_cutoff), bins[-1]), bins=bins, log=True)
plt.hist(np.clip(scores[is_outlier], min(bins[0], outlier_cutoff), bins[-1]), bins=bins, log=True, color='red')
plt.plot([outlier_cutoff, bins[-1]], [np.exp(m*outlier_cutoff+b), np.exp(m*bins[-1]+b)])
plt.axvline(outlier_cutoff)

# Panel 2: detector scores evaluated on an integer grid spanning
# x_min..x_max by y_min..y_max and drawn as contours, with all sibpairs
# scattered on top and flagged outliers in red.
plt.subplot(1, 3, 2)
xintervals = np.arange(x_min, x_max)
yintervals = np.arange(y_min, y_max)
gridx, gridy = np.meshgrid(xintervals, yintervals)
predict = detector.score_samples(gridx.flatten(), gridy.flatten())
# meshgrid output has one row per y value, hence the (len(y), len(x)) shape.
plt.contour(xintervals, yintervals, predict.reshape(len(yintervals), len(xintervals)))

plt.scatter(mat_crossovers, pat_crossovers, marker='.', alpha=0.25)
plt.scatter(mat_crossovers[is_outlier], pat_crossovers[is_outlier], marker='.', color='red')
plt.xlabel('Maternal Crossovers')
plt.ylabel('Paternal Crossovers')

# Panel 3: same contour construction, but the grid covers only the range of
# the non-outlier sibpairs padded by 10, and only non-outliers are scattered.
plt.subplot(1, 3, 3)
xintervals = np.arange(np.min(mat_crossovers[~is_outlier])-10, np.max(mat_crossovers[~is_outlier])+10)
yintervals = np.arange(np.min(pat_crossovers[~is_outlier])-10, np.max(pat_crossovers[~is_outlier])+10)
gridx, gridy = np.meshgrid(xintervals, yintervals)
predict = detector.score_samples(gridx.flatten(), gridy.flatten())
plt.contour(xintervals, yintervals, predict.reshape(len(yintervals), len(xintervals)))

plt.scatter(mat_crossovers[~is_outlier], pat_crossovers[~is_outlier], marker='.', alpha=0.25)
plt.xlabel('Maternal Crossovers')
plt.ylabel('Paternal Crossovers')

plt.show()

In [None]:
# For the sibpairs flagged as crossover outliers, print the maternal
# counts, then the paternal counts, then the detector scores.
for per_sibpair_values in (mat_crossovers, pat_crossovers, scores):
    print(per_sibpair_values[is_outlier])

In [None]:
# Largest maternal crossover count among the non-outlier sibpairs.
largest_inlier_maternal = np.max(mat_crossovers[~is_outlier])
print(largest_inlier_maternal)

In [113]:
# Tally, per chromosome, the crossovers whose 'is_hts' flag is set.
hts_per_chrom = Counter(event['chrom'] for event in crossovers if event['is_hts'])
print(hts_per_chrom)

Counter({'1': 2367, '7': 1941, '17': 1502, '12': 1376, '16': 1265, '2': 1200, '15': 1061, '10': 882, '9': 860, '11': 741, '3': 674, '8': 630, '19': 619, '6': 596, '20': 517, '22': 507, '18': 447, '4': 387, '5': 355, '13': 231, '14': 175, '21': 92})
