In [2]:
# import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import configparser

%matplotlib inline

In [3]:
class FastQ():
    
    def __init__(self, filename):
        self.filename = filename
        self._sequences = {}
    
    def parse_file(self, num_lines=100):
        with open(self.filename, 'r') as f:
            data = []

            for i, line in enumerate(f):
                if not line in '@+':
                    data.append(line)
                if i == num_lines:
                    break   
                
            # Now the lines you want are alternating, so you can make a dict
            # from key/value pairs of lists data[0::2] and data[1::2]
            sequences = dict(zip(data[0::2], data[1::2]))
        
        self._sequences = sequences
        return self._sequences

In [4]:
config = configparser.ConfigParser()
config.read('config.ini')
Ndata = config['data']['ndata']
Tdata = config['data']['tdata']

fastN = FastQ(Ndata)
fastT = FastQ(Tdata)

  from ipykernel import kernelapp as app


In [5]:
fastN.parse_file(num_lines=1000)
fastT.parse_file(num_lines=1000)

{'+\n': '@<@DFAD<DFDFHBHD<CH>DGHCEDHH9EHCHGGG9GGBHCF@DEHGADHIEEG*?FD:C48=4C)8@/;4=?<(6=CC;/,((,,55::@:AC91@3@C\n',
 '@HWI-ST807:461:C2P0JACXX:6:1101:10260:1999 1:N:0:GCCAAT\n': 'TTTTCAGATATATTTAAAGATTTTACTCTTCAACATGAACAGAAGGGGGGTTGTTTTAAACTAATGAACTCTTCTTTTTTCATAACTAATCCTCTCAAAAA\n',
 '@HWI-ST807:461:C2P0JACXX:6:1101:10267:1975 1:N:0:NCCAAT\n': 'TGTATCTGTTGTACTCATAATCTCGCNCNTAAAATCGATCATAGTCTCCCCTGTATCGATCATAGAAATTACGGTAACCCTGAGAAAGGAAAAGAAAAAGN\n',
 '@HWI-ST807:461:C2P0JACXX:6:1101:10402:1976 1:N:0:NCCAAT\n': 'TAGAAGTTAATGAATAGCATCTAATTTTTATTAATGTGTTTATTTCAGTTATCAAGTGGGAATCCTGTATATGAAAAATACTATAGACAGGTAAGATTTTN\n',
 '@HWI-ST807:461:C2P0JACXX:6:1101:10533:1983 1:N:0:GCCAAT\n': 'TCCTCGGCTCTCCCCAGGGGCAGGGCTTTGGGACGCAAAGCCACCAGCCCCATCACTGCCTCCATGCCCACGCTGTTAGGCTCGGAACCTGAGGCGTGGGG\n',
 '@HWI-ST807:461:C2P0JACXX:6:1101:10623:1989 1:N:0:GCCAAT\n': 'TTCAGCAGGTCCTTGACCACAGCAGCCTCCCTCAACAGCCCAGGTCAGGCCACCCCAACCCCCGGCTCAGGGTGACTCACTGCGGCCAGCTTGTCAAAGCG\n',
 '@HWI-ST807:461:C2P0JACXX:6:1101:1075:2

In [6]:
T_dicts = []
N_dicts = []

for ix, key in enumerate(fastT._sequences):
    T_dicts.append(Counter(fastT._sequences[key]))
    if ix == 1000:
        break

for ix, key in enumerate(fastN._sequences):
    N_dicts.append(Counter(fastN._sequences[key]))
    if ix == 1000:
        break
    


T_frequencies = {}
for d in T_dicts:
    for key in d:
        if key not in T_frequencies and key in 'ACGT':
            T_frequencies[key] = d[key]
        elif key not in 'ACGT':
            pass
        else:
            T_frequencies[key] += d[key]

T_frequencies['total'] = sum([T_frequencies[key] for key in T_frequencies])
print("T:", sorted(T_frequencies))
print(sorted([T_frequencies[key] / T_frequencies['total'] for key in T_frequencies if key != 'total']))




N_frequencies = {}
for d in N_dicts:
    for key in d:
        if key not in N_frequencies and key in 'ACGT':
            N_frequencies[key] = d[key]
        elif key not in 'ACGT':
            pass
        else:
            N_frequencies[key] += d[key]
    
N_frequencies['total'] = sum([N_frequencies[key] for key in N_frequencies])
print("N:", sorted(N_frequencies))
print(sorted([N_frequencies[key] / N_frequencies['total'] for key in N_frequencies if key != 'total']))

('T:', ['A', 'C', 'G', 'T', 'total'])
[0, 0, 0, 0]
('N:', ['A', 'C', 'G', 'T', 'total'])
[0, 0, 0, 0]


## Neural Net
You'd need to feed the neural net the diseased cell sequencing, and also which parts of those sequencing indicate that it's diseased. 


Or rather, the normal sequencing, and which sections of the sequencing map to the diseased sequencing sections.

In [7]:
Nlocal_frequencies = {}

for key in fastN._sequences:
    if key not in Nlocal_frequencies:
        Nlocal_frequencies[fastN._sequences[key]] = [(X, Counter(fastN._sequences[key])[X]) for X in Counter(fastN._sequences[key]) if X in 'ACGT']

Nlocal_frequencies

{'AAAAAGAAATAGGATCTTTTTCCTTCAGCAAACTAGATTTGACTTGCAATCTTCTGACTTCCTGTGTATGTTTTTATTGGGACAGGAAACCTTCAAGCTCC\n': [('A',
   29),
  ('C', 20),
  ('T', 35),
  ('G', 17)],
 'AAAAATATATATATATACTCACTGAATTGGCATTTGTTGGGTTTGGCCAAGGTCTACCACCACCTGGACCCCTACAAAACAATTTGATAAATGAAATTTTA\n': [('A',
   36),
  ('C', 19),
  ('T', 31),
  ('G', 15)],
 'AAAAATGTTCCAAGGAAAAGGTNGANNCNATTGTCTCTTTCTCTTTCTGCTCTGATGTAGTGCGTGTGCTAAGCTCAGGTCTGAGCACTGGCGGATCCCCN\n': [('A',
   21),
  ('C', 22),
  ('G', 24),
  ('T', 29)],
 'AAAATGTGCTTCTTACAGGAATATAAATAGTTTCTGGAAAGGACACTGACAACTTCAAAGCAAAATGAAGCTCTTTTGGTTGCTTTTCACCATTGGGTTCT\n': [('A',
   32),
  ('C', 17),
  ('T', 33),
  ('G', 19)],
 'AAACCAGTTGCTCAACAATCATGGCTATCGCCCATCCCCCATATGTCTGGTAGGTGACACCTCAAGCCACTCGTAACTCCTCCTCCTCCTCCTCCTGTTCC\n': [('A',
   21),
  ('C', 40),
  ('T', 26),
  ('G', 14)],
 'AAAGAAAGCCTGCCATTTAGTGAGAATATACTGGCCAGGTATTTATAGTCTCAAGGTGGGGCATTTTCTGATTTGAGTGGATTTCAGAATCGTGGGCCTTG\n': [('A',
   26),
  ('C', 15),
  ('T', 32),
  ('G', 28)],
 'AAAGACATGAATCTGTAAAT

In [8]:
vals = [len(fastN._sequences)]
x = []
for i in [fastN._sequences[key] for key in fastN._sequences]:
    for j in [fastT._sequences[key] for key in fastT._sequences]:
        if len(i) == len(j):
            x.append([k for k in range(len(i)) if i[k] != j[k]])


print(x[:3])

            
n = 4

def max_sum_subsequence(seq):
    maxsofar = 0
    maxendinghere = 0
    for s in seq:
        # invariant: maxendinghere and maxsofar are accurate
        # are accurate up to s
        maxendinghere = max(maxendinghere + s, 0)
        maxsofar = max(maxsofar, maxendinghere)
    return maxsofar

# print("diffed:", len(x))

# table = {}
# for row in x:
#     for col in row:
#         if str(col) in table:
#             table[str(col)] += 1
#         else:
#             table[str(col)] = 0

# print(table)

# plt.hist([table[key] for key in table])

[[0, 2, 3, 6, 8, 10, 12, 13, 14, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 47, 49, 50, 51, 52, 54, 55, 56, 58, 59, 62, 64, 67, 70, 71, 73, 74, 75, 77, 78, 80, 81, 83, 84, 86, 87, 88, 89, 90, 93, 96, 97, 99, 100], [0, 1, 4, 5, 6, 7, 8, 10, 11, 13, 14, 15, 16, 18, 20, 21, 22, 23, 26, 27, 29, 30, 31, 32, 33, 34, 38, 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 52, 53, 54, 55, 56, 57, 59, 60, 61, 62, 64, 66, 68, 69, 70, 71, 72, 76, 77, 80, 81, 82, 83, 84, 85, 86, 87, 88, 90, 91, 92, 93, 94, 95, 96, 99], [0, 6, 7, 10, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 29, 30, 31, 32, 36, 39, 40, 41, 42, 43, 44, 45, 49, 50, 51, 52, 53, 54, 56, 57, 58, 59, 60, 61, 64, 65, 67, 68, 69, 71, 74, 75, 76, 77, 79, 80, 81, 82, 83, 84, 86, 88, 89, 90, 93, 96, 97, 98, 99]]


NameError: name 'longest_increasing_subsequence' is not defined

In [10]:
from Bio import pairwise2

In [17]:
# Needs to be done between N and T sequences
for key in fastT._sequences:
    if key not in fastN._sequences:
        Nlocal_frequencies[fastT._sequences[key]] = [(X, Counter(fastT._sequences[key])[X]) for X in Counter(fastT._sequences[key]) if X in 'ACGT']

Tlocal_frequencies

healthy_seq = list(Nlocal_frequencies.keys())
cancer_seq = list(Tlocal_frequencies.keys())

for i, key in enumerate(keyss):
    alignments = pairwise2.align.globalxx(healthy_seq, cancer_seq)
    print(pairwise2.format_alignment(*alignments[0]))
    if i == 5:
        break

CAGCAGAAAGAGAAGAAGGGGGTC----GTCCCCA-CCCG-GCAGGCACTGGCTG---TGC-CC--A-C--A--G-ACTCGGCGG-TCC----C---C-GTGT-C--CT--C------CCATGCCTAC--ACGAAG--GTCAC-

|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
CA-C-------------------CCCCAG-CCCC-TCCC-TG-A-GC-CT--CT-NCNTG-TCCTGAACTGAAAGTACTC--C--CTCCTTTTCTGGCAG-G-ACGAC-AACTTAATGCC-TGCCTA-TTAC-AA-ATGT-A-N

  Score=59

C---ACCCCCAGCCCC-TCCC-TG-----AGCC-TCTN-CN--TGTC---CT-----GAACTGAA-AGTACTCCCTCCT-T---TTCTGGCAGGA-CGAC-AAC-T-TAATGCCTGCCT-A----T---------TACA-AATGT-AN-

||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
-TTTA-----A-----AT---AT-TTTTTA-CCAT-T-TC-AAT-T-TTTCTTTTTTG--CT-AATAGTA--------TATAAATT-T-----G-TC-ACTAACATGT-ATG--TG--TGAAGCATGGGAAGAAATAC-TAATG-CA-G

  Score=54

TT-TAA---AT-ATTTT-T-T-ACCA---TT-TCAATTTT--TCTTTTTTGC-TAA-TAGTAT-ATAAATTTGTCACTAACATGT

In [None]:
brea