In [8]:
import os
import csv
import numpy as np

class Result:
    """K-mer counts for one sample, together with the sample's age.

    Attributes:
        name: identifier of the sample (the loader passes the count file's
            base name).
        age: age associated with the sample.
        counts: dict mapping a k-mer string to its count.
        snps: dict mapping an SNP name to ``{'main': int, 'minor': int}``,
            filled in by :meth:`calculate_snps`.
        snp_sums: empty dict created by ``__init__``; not used by any method
            of this class.
    """

    # K-mer whose count tel_count() returns (TTAGGG repeats).
    TELOMERE_KMER = 'TTAGGGTTAGGGTTAGGGTTAGGGT'
    # K-mer whose count is the denominator in normalized_tel().
    REFERENCE_KMER = 'CTGGGCGACAGAGCGAGACTCCGTC'
    # The two k-mers (they differ in a single base) reported by get_label().
    LABEL_KMERS = ('GAGTCAAAAGCAGATACGTGTGTCA', 'GAGTCAAAAGCAGATACATGTGTCA')
    # Default tab-separated SNP k-mer list read by calculate_snps().
    SNP_KMERS_PATH = './static/snp_kmers.txt'

    def __init__(self, name, age):
        self.name = name
        self.counts = dict()
        self.snp_sums = dict()
        self.age = age
        self.snps = dict()

    def set_count(self, kmer_name, count):
        """Record ``count`` for the k-mer ``kmer_name`` (overwrites any previous value)."""
        self.counts[kmer_name] = count

    def normalized_tel(self):
        """Return tel_count() divided by the count of REFERENCE_KMER.

        Raises:
            KeyError: if either k-mer has no recorded count.
            ZeroDivisionError: if the reference count is zero.
        """
        # float() forces true division on Python 2 even when counts are ints.
        return float(self.tel_count()) / self.counts[self.REFERENCE_KMER]

    def tel_count(self):
        """Return the recorded count of TELOMERE_KMER (KeyError if absent)."""
        return self.counts[self.TELOMERE_KMER]

    def get_label(self):
        """Return ``'<name[7:]>:<count1>, <count2>'`` for the two LABEL_KMERS.

        The first seven characters of ``name`` are dropped and both counts are
        truncated to int.
        """
        first, second = (int(self.counts[kmer]) for kmer in self.LABEL_KMERS)
        return "{}:{}, {}".format(self.name[7:], first, second)

    def __str__(self):
        return 'Result {}, age: {}, {} | {}'.format(
            self.name, self.age, self.tel_count(), self.normalized_tel())

    def __repr__(self):
        # Same text as __str__, so lists of results print readably.
        return self.__str__()

    def get_snp_vals(self):
        """Return one 0/1 flag per SNP: 1 if its 'main' count is non-zero.

        Flags follow the iteration order of ``self.snps``.
        """
        # .values() works on both Python 2 and 3 (iteritems() is Python 2 only).
        return [1 if tally['main'] else 0 for tally in self.snps.values()]

    def get_input(self):
        """Return the feature vector for this sample, currently ``[age]``.

        NOTE(review): the telomere count and the SNP flags are deliberately
        not part of the vector at the moment, so the only feature is ``age``.
        The training cell also uses ``age`` as the target -- confirm that
        this is intended.
        """
        return [self.age]

    def calculate_snps(self, path=None):
        """Accumulate main/minor allele k-mer counts per SNP into ``self.snps``.

        Each row of the tab-separated file is ``<label>\\t<kmer>``. The SNP
        name is the label up to the first ``':'``; a leading ``'ALT_'`` marks
        the row as the minor allele and is stripped from the name. The k-mer's
        recorded count (truncated to int) is added to the SNP's ``'minor'`` or
        ``'main'`` total. Calling this more than once keeps adding to the
        existing totals.

        Args:
            path: file to read; defaults to ``SNP_KMERS_PATH``.

        Raises:
            KeyError: if a listed k-mer has no recorded count.
            IndexError: if a non-empty row has fewer than two fields.
        """
        if path is None:
            path = self.SNP_KMERS_PATH
        with open(path) as f:
            for row in csv.reader(f, delimiter='\t'):
                if not row:
                    # Blank line (e.g. trailing newline at end of file).
                    continue
                name = row[0]
                alt = name.startswith('ALT_')
                if alt:
                    name = name[4:]
                name = name.split(':')[0]

                tally = self.snps.setdefault(name, {'main': 0, 'minor': 0})
                count = int(self.counts[row[1].strip()])
                tally['minor' if alt else 'main'] += count
                    

                    
        
        
# Build the lookup used by get_age(): column 8 of the tab-separated run table
# is the key, column 11 is the age (kept as the raw string from the file).
ages = dict()
with open('./static/SraRunTable.txt') as run_table:
    r = csv.reader(run_table, delimiter='\t')
    # Skip the header row. The builtin next() works on Python 2.6+ and 3
    # (reader.next() is Python 2 only); the default avoids StopIteration
    # when the file is empty.
    next(r, None)
    for row in r:
        if not row:
            # Blank line (e.g. trailing newline at end of file).
            continue
        ages[row[8]] = row[11]
        
def get_age(name):
    """Look up ``name`` in the module-level ``ages`` table and return it as an int.

    Raises:
        KeyError: if ``name`` is not a key of ``ages``.
        ValueError: if the stored value cannot be parsed as an integer.
    """
    raw_age = ages[name]
    return int(raw_age)
    

# Load one Result per count file in ./files whose base name has a usable age.
results = list()
# os.listdir() returns entries in arbitrary order; sorting keeps the sample
# order (and therefore any later train/validation split) stable between runs.
for filename in sorted(os.listdir('./files')):
    # Base name without its extension, used as the key into the age table.
    name = os.path.splitext(filename)[0]
    try:
        age = get_age(name)
    except (KeyError, ValueError):
        # No age recorded for this file, or the recorded age is not an
        # integer: skip the sample. Other errors are allowed to surface.
        continue
    result = Result(name, age)
    with open(os.path.join('./files', filename)) as f:
        reader = csv.reader(f, delimiter='\t')
        # Each row is <kmer>\t<count>.
        for row in reader:
            result.set_count(row[0], float(row[1]))
    result.calculate_snps()
    results.append(result)



In [9]:
%matplotlib nbagg

import matplotlib.pyplot as plt

# Scatter normalized telomere k-mer count against age, with a linear fit.
x = list(map(lambda r: r.age, results))
y = list(map(lambda r: r.normalized_tel(), results))
labels = list(map(lambda r: r.get_label(), results))

plt.scatter(x, y)
# Degree-1 least-squares fit: z[0] is the slope, z[1] the intercept.
z = np.polyfit(x, y, 1)
p = np.poly1d(z)
plt.plot(x, p(x), "r--")

# Per-point labels were disabled in the original cell (an unconditional
# `break` as the first statement of the loop); set this to True to draw them.
ANNOTATE_POINTS = False
if ANNOTATE_POINTS:
    # Distinct loop names so the x / y lists above are not overwritten.
    for label, x_i, y_i in zip(labels, x, y):
        plt.annotate(
            label,
            xy=(x_i, y_i), xytext=(-20, 20),
            textcoords='offset points', ha='right', va='bottom',
            bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
            arrowprops=dict(arrowstyle = '->', connectionstyle='arc3,rad=0'))

plt.show()
# Single-argument print(...) behaves the same on Python 2 and 3.
print("y=%.6fx+(%.6f)" % (z[0], z[1]))

<IPython.core.display.Javascript object>

y=0.006040x+(6.971442)


In [11]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Activation, Embedding, Input, Conv2D,MaxPooling1D, LSTM,SimpleRNN,Reshape
from keras.layers.convolutional import Conv1D
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical,normalize

# Assemble the training arrays: one feature row and one target age per sample.
# NOTE(review): Result.get_input() currently returns [age] and the target is
# also r.age, so the network is being fitted to reproduce its own input --
# confirm this is intended before interpreting the loss curves.
inputs = list(map(lambda r: r.get_input(), results))
outputs = list(map(lambda r: r.age, results))

inputs = np.array(inputs)
outputs = np.array(outputs)

# Single-argument print(...) behaves the same on Python 2 and 3.
print(inputs[0])
print(outputs[0])


model = Sequential()
# Number of features per sample.
a = inputs.shape[1]

print(inputs[0].shape)

# One hidden layer as wide as the input, followed by a single linear output
# unit for the regression target.
model.add(Dense(a, input_shape=inputs[0].shape, activation = "hard_sigmoid"))
model.add(Dense(1, activation='linear'))


# "accuracy" is a classification metric and says little about a continuous
# target; it is kept only because a later cell plots history.history['acc'].
# "mae" is added as a metric that is meaningful for this regression.
model.compile(loss='mse',
              optimizer='rmsprop', metrics=["accuracy", "mae"])

# validation_split=0.1 holds out the last 10% of the samples for validation.
history = model.fit(inputs, outputs, epochs=100,  batch_size=48, validation_split=0.1, verbose=1)


[42]
42
(1,)
Train on 198 samples, validate on 23 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Ep

In [12]:
%matplotlib nbagg

# Training vs. validation accuracy per epoch.
# Older Keras releases record the metric under 'acc'; newer ones use
# 'accuracy'. Pick whichever key this History object has.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'

# Explicit figure/axes so the curves never land on a figure left open by an
# earlier cell.
fig, ax = plt.subplots()
ax.plot(history.history[acc_key], label='train')
# The 'val_*' series comes from fit(validation_split=...), i.e. validation
# data rather than a separate test set, so it is labelled accordingly.
ax.plot(history.history['val_' + acc_key], label='validation')
ax.set_title('model accuracy')
ax.set_ylabel('accuracy')
ax.set_xlabel('epoch')
ax.legend(loc='upper left')
plt.show()

<IPython.core.display.Javascript object>

In [84]:
%matplotlib nbagg

# Training vs. validation loss per epoch, on a logarithmic y axis.
# Explicit figure/axes so the curves never land on a figure left open by an
# earlier cell.
fig, ax = plt.subplots()
ax.semilogy(history.history['loss'], label='train')
# The 'val_loss' series comes from fit(validation_split=...), i.e. validation
# data rather than a separate test set, so it is labelled accordingly.
ax.semilogy(history.history['val_loss'], label='validation')
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(loc='upper left')
plt.show()

<IPython.core.display.Javascript object>