In [2]:
import numpy as np

In [34]:
def compress_file(filename):
    """Merge consecutive rows with identical states in a phased family file.

    The input is a tab-separated text file with one header row. Header
    columns 1-4 must be ``m1_del``, ``m2_del``, ``p1_del``, ``p2_del``. In
    every data row the first column is the chromosome name (its first three
    characters are dropped on read -- presumably a ``chr`` prefix; confirm
    against the producer of these files), the last two columns are integer
    start and end positions, and every column in between is an integer state.

    For each chromosome, in order of first appearance, the first four state
    columns are capped at 1, and every run of consecutive rows whose states
    are identical is written as a single line to
    ``<filename>.compressed.txt``::

        chr<chrom> <TAB> state_1 ... state_k <TAB> run_start <TAB> run_end

    where ``run_start`` is the start position of the first row of the run and
    ``run_end`` is the end position of its last row.

    Parameters
    ----------
    filename : str
        Path of the phased file to read. The output path is derived from it.

    Raises
    ------
    ValueError
        If header columns 1-4 are not the four expected deletion columns.

    Notes
    -----
    States are stored as ``int8``, so they must fit in that range.
    Blank lines are skipped, and a file with a header but no data rows
    produces an empty output file.
    """
    with open(filename, 'r') as f:
        header = next(f).strip().split('\t')
        # check that we have a typical nuclear family structure
        if tuple(header[1:5]) != ('m1_del', 'm2_del', 'p1_del', 'p2_del'):
            raise ValueError('Not a simple nuclear family.')

        # pull phase
        chroms = []
        start_positions, end_positions = [], []
        states = []
        for line in f:
            line = line.strip()
            if not line:
                # tolerate blank lines (e.g. a trailing empty line)
                continue
            pieces = line.split('\t')
            chroms.append(pieces[0][3:])
            start_positions.append(int(pieces[-2]))
            end_positions.append(int(pieces[-1]))
            states.append([int(x) for x in pieces[1:-2]])

    # Explicit dtypes keep the arrays well-typed even when there are no rows.
    chroms = np.array(chroms, dtype=str)
    start_positions = np.array(start_positions, dtype=np.int64)
    end_positions = np.array(end_positions, dtype=np.int64)
    states = np.array(states, dtype=np.int8)

    # Distinct chromosomes in order of first appearance. Deduplicating here
    # (rather than listing every run boundary) guarantees each chromosome is
    # written exactly once even if its rows are not contiguous in the input.
    _, first_rows = np.unique(chroms, return_index=True)
    all_chroms = chroms[np.sort(first_rows)]
    print(all_chroms)

    with open('%s.compressed.txt' % filename, 'w') as phasef:
        for chrom in all_chroms:
            on_chrom = chroms == chrom

            # Boolean indexing copies, so the capping below never touches
            # `states`. After the transpose: one row per state column, one
            # column per input row of this chromosome.
            chrom_states = states[on_chrom, :].T
            deletion_rows = chrom_states[:4]  # view onto the four *_del rows
            deletion_rows[deletion_rows > 1] = 1

            chrom_start_positions = start_positions[on_chrom]
            chrom_end_positions = end_positions[on_chrom]

            # write final states to file
            # changed[k] is True when input row k+1 differs from row k, i.e.
            # row k is the last row of a run; the final row always ends one.
            changed = np.any(chrom_states[:, 1:] != chrom_states[:, :-1], axis=0)
            run_ends = np.flatnonzero(changed).tolist() + [chrom_states.shape[1] - 1]

            run_start = 0
            for run_end in run_ends:
                assert np.all(chrom_states[:, run_start] == chrom_states[:, run_end])
                phasef.write('%s\t%s\t%d\t%d\n' % (
                    'chr' + chrom,
                    '\t'.join(map(str, chrom_states[:, run_start])),
                    chrom_start_positions[run_start],
                    chrom_end_positions[run_end]))
                run_start = run_end + 1

    print('Write to file complete')

In [35]:
# Compress the phased file for one family; the result is written next to the
# input as '<input path>.compressed.txt'.
compress_file('../phased_mssng_quads_del/MSSNG00439-003.MSSNG00439-003.MSSNG00439-004.phased.txt')

['1' '2' '3' '4' '5' '6' '7' '8' '9' '10' '11' '12' '13' '14' '15' '16'
 '17' '18' '19' '20' '21' '22']
Write to file complete
