In [424]:
import numpy as np
from hmmlearn import hmm
import collections

In [427]:
startprob = np.array([0.6, 0.3, 0.1])
transmat = np.array([[0.7, 0.2, 0.1], [0.3, 0.5, 0.2], [0.3, 0.3, 0.4]])
means = np.array([[0.0, 0.0], [3.0, -3.0], [5.0, 10.0]])
covars = np.tile(np.identity(2), (3, 1, 1))
model = hmm.GaussianHMM(3, "full", startprob, transmat)
model.means_ = means
model.covars_ = covars
X, Z = model.sample(100)

NotFittedError: This GaussianHMM instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [24]:
n_components = 2
n_features = 3
h = hmm.MultinomialHMM(n_components)
h.n_features = n_features
h.startprob_ = np.array([0.6, 0.4])
h.transmat_ = np.array([[0.7, 0.3], [0.4, 0.6]])
h.emissionprob_ = np.array([[0.1, 0.4, 0.5], [0.6, 0.3, 0.1]])

# My own implementation of Viterbi algorithm

In [None]:
n_components = 2
n_features = 3

start_prob = np.array([0.6, 0.4])
transmat = np.array([[0.7, 0.3], [0.4, 0.6]])
emission_prob = np.array([[0.1, 0.4, 0.5], [0.6, 0.3, 0.1]])


In [25]:
X = [[0], [1], [2]]
logprob, state_sequence = h.decode(X, algorithm="viterbi")

In [26]:
logprob

-4.309519943887134

In [27]:
state_sequence

array([1, 0, 0])

In [28]:
states, outputs = h.sample(100)

In [330]:
print(states)

NameError: name 'states' is not defined

In [37]:
np.random.binomial(100, 0.6)

60

# Phase from trio

In [1]:
import random

def output_SNP():
    alphabet = ["A", "T", "G", "C"]
    pos = [random.randint(0, len(alphabet) - 1) for i in range(2)]
    return [alphabet[i] for i in pos]

In [2]:
output_SNP()

['A', 'C']

In [3]:
def create_genome(snp_length):
    genome = []
    for i in range(snp_length):
        genome.append(output_SNP())
    return genome

In [4]:
def cross_over(length):
    start = random.randint(0, 1)
    switch_over = random.randint(0, length - 1)
    selection = [start]
    for i in range(length - 1):
        if i < switch_over:
            selection.append(start)
        else:
            selection.append(int(not start))
    return selection

In [6]:
def apply_cross_over(cross_over, genome):
    return [genome[i][j] for i, j in zip(range(len(genome)), cross_over)]

In [29]:
def dephase(genome):
    dephased_genome = []
    for snp in genome:
        new_snp = [i for i in snp]
        random.shuffle(new_snp)
        dephased_genome.append(new_snp)
    return dephased_genome

In [47]:
length = 6

genome1 = create_genome(length)
genome2 = create_genome(length)
offspring = reproduce(genome1, genome2)

dephased_genome1 = dephase(genome1)
dephased_genome2 = dephase(genome2)
dephased_offspring = dephase(offspring)

In [76]:
import vcf

In [77]:
help(vcf)

Help on package vcf:

NAME
    vcf - A VCFv4.0 and 4.1 parser for Python.

DESCRIPTION
    Online version of PyVCF documentation is available at http://pyvcf.rtfd.org/

PACKAGE CONTENTS
    filters
    model
    parser
    sample_filter
    test (package)
    utils

DATA
    RESERVED_FORMAT = {'AHAP': 'Integer', 'CN': 'Integer', 'CNL': 'Float',...
    RESERVED_INFO = {'1000G': 'Flag', 'AA': 'String', 'AC': 'Integer', 'AF...
    VERSION = '0.6.8'

FILE
    /home/picrin/programming/haplo_phasing/venv/lib/python3.6/site-packages/vcf/__init__.py




In [50]:
print(genome1)
print(genome2)
print(offspring)

[['C', 'C'], ['G', 'A'], ['G', 'G'], ['G', 'C'], ['C', 'T'], ['T', 'A']]
[['A', 'A'], ['T', 'A'], ['G', 'C'], ['C', 'T'], ['T', 'T'], ['G', 'A']]
[('C', 'A'), ('A', 'T'), ('G', 'G'), ('G', 'C'), ('C', 'T'), ('T', 'G')]


In [423]:
def phase_offspring(genome1, genome2, offspring):
    possibilities = []
    phased_offspring = []
    for snp_o, snp_g1, snp_g2 in zip(offspring, genome1, genome2):
        snp_o = [i for i in snp_o]
        possibility = [[], []]
        for k in range(0, 2):
            if snp_o[k] in snp_g1:
                possibility[k].append(0)
            if snp_o[k] in snp_g2:
                possibility[k].append(1)
        possibilities.append(possibility)
        if possibility[0] == [0] or possibility[1] == [1]:
            phased_offspring.append(snp_o + ["|"])
        elif possibility[1] == [0] or possibility[0] == [1]:
            phased_offspring.append(snp_o[::-1] + ["-|"])
        else:
            phased_offspring.append(snp_o + ["?"])
    return phased_offspring

In [380]:
print(genome1  )
print(genome2  )
print(offspring)

[['C', 'C'], ['G', 'A'], ['G', 'G'], ['G', 'C'], ['C', 'T'], ['T', 'A']]
[['A', 'A'], ['T', 'A'], ['G', 'C'], ['C', 'T'], ['T', 'T'], ['G', 'A']]
[('C', 'A'), ('A', 'T'), ('G', 'G'), ('G', 'C'), ('C', 'T'), ('T', 'G')]


In [381]:
phase_offspring(dephased_genome1, dephased_genome2, dephased_offspring)

[['C', 'A', '-|'],
 ['A', 'T', '|'],
 ['G', 'G', '?'],
 ['G', 'C', '-|'],
 ['C', 'T', '-|'],
 ['T', 'G', '|']]

# Load data from the Jewish trio

In [405]:
def load_vcf(filename, limit=-1, chrom_limit=""):
    genotype = {}
    for i, record in enumerate(vcf.Reader(filename=filename)):
        if record.CHROM == chrom_limit:
            break
        if i == limit:
            break
        snp = record.samples[0].gt_bases
        if record.samples[0].phased:
            a, b = snp.split("|")
        else:
            a, b = snp.split("/")
        genotype[(record.CHROM, record.POS)] = a, b
    return genotype

In [406]:
son = load_vcf("son_jewish.vcf", chrom_limit="chr2")

In [407]:
mother = load_vcf("mother_jewish.vcf", chrom_limit="chr2")

In [408]:
father = load_vcf("father_jewish.vcf", chrom_limit="chr2")

In [409]:
son_genome = []
father_genome = []
mother_genome = []
shared_SNPs = sorted(list(set(son).intersection(set(mother)).intersection(set(father))))
for chr_coord in shared_SNPs:
    son_genome.append(son[chr_coord])
    father_genome.append(father[chr_coord])
    mother_genome.append(mother[chr_coord])

In [410]:
collections.Counter([i[0] for i in shared_SNPs])

Counter({'chr1': 170171})

In [411]:
r = [i[1] for i in shared_SNPs if i[0] == "chr1"]

In [412]:
length = r[-1] - r[0]

In [413]:
length

248097755

In [415]:
length/len(r)

1457.932050701941

In [422]:
phased_genome = phase_offspring(father_genome, mother_genome, son_genome)

['TACACACAC', 'T']
['TTG', 'TTGTG']
['A', 'AAC']
['C', 'CTT']
['C', 'CA']
['C', 'CTAAA']
['C', 'CT']
['T', 'TAC']
['CTTTA', 'C']
['G', 'GA']
['CTTATTTAT', 'C']
['GAGAG', 'C']
['TATAATA', 'T']
['A', 'AAATAATAAT']
['CAAAT', 'C']
['CTGTG', 'CTG']
['CA', 'C']
['A', 'AT']
['GTATTA', 'G']
['TTA', 'T']
['T', 'TACAC']
['T', 'TTGTGTGTGTG']
['C', 'CA']
['CTT', 'C']
['C', 'CGGGG']
['T', 'TGCC']
['A', 'AAAAATAAAATAAAATAAAAT']
['C', 'CA']
['CA', 'C']
['T', 'TACAC']
['A', 'AT']
['CCTTCCTTTCTTT', 'C']
['CTTTTTT', 'C']
['CT', 'C']
['T', 'TCAA']
['C', 'CTT']
['C', 'CAGAT']
['ATT', 'A']
['GACAA', 'G']
['ATGTG', 'A']
['T', 'TACAC']
['TCACACA', 'T']
['AAGAG', 'A']
['AT', 'A']
['A', 'AT']
['AT', 'A']
['GTTTT', 'G']
['T', 'TATAG']
['CCCTT', 'CCCTTCCTT']
['GACACAC', 'G']
['A', 'ATG']
['ATATCTATC', 'ATATC']
['GGTGTGT', 'G']
['CT', 'C']
['C', 'CTGTTGT']
['TGATAGATA', 'T']
['GTTA', 'G']
['A', 'AAT']
['AT', 'A']
['A', 'AT']
['A', 'ATG']
['C', 'CA']
['C', 'CA']
['C', 'CA']
['G', 'GACAC']
['AT', 'A']
['CA', 'C']
[

In [420]:
collections.Counter([i[-1] for i in phased_genome])

Counter({'?': 135010, '|': 34735, '-|': 426})

In [418]:
length/collections.Counter([i[-1] for i in phased_genome])["|"]

7142.586872031093