# Idea:
Categories: Mammals, Plants, Bacteria, Other Vertebrates, Viruses

Prediction: Given a 64-dimensional vector of codon frequencies computed from an organism's genes, predict which of the categories above the organism belongs to.

Expansions: Expand to RNA data

In [60]:
import sklearn
import pandas
import matplotlib
import numpy
import scipy
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

In [36]:
"""
    Get the codon data from a file
"""
def get_codon_data(fname):
    """Read per-record codon counts from a two-lines-per-record text file.

    The file is consumed as consecutive line pairs: the first line of a
    pair is treated as the organism name (it is read but not returned),
    the second as whitespace-separated integer codon counts.  Reading
    stops at end of file or at the first pair in which either line is
    blank after stripping.

    Args:
        fname: Path of the file to read.

    Returns:
        A list containing one list of ints per record, in file order.

    Raises:
        ValueError: If a token on a counts line is not an integer literal.
    """
    vectors = []
    with open(fname, 'r') as f:
        while True:
            org_name = f.readline().strip()
            codon_freqs = f.readline().strip()
            if not org_name or not codon_freqs:
                break
            # split() with no argument tolerates runs of spaces and tabs;
            # split(' ') would produce '' tokens there, which int() rejects.
            vectors.append([int(x) for x in codon_freqs.split()])
    return vectors

"""
    Normalize data to PDF
"""
def normalize_data(vectors):
    """Scale each vector so that its entries sum to 1.

    Args:
        vectors: Iterable of numeric sequences (e.g. raw codon counts).

    Returns:
        A new list with one list per input vector, in the same order.
        Each entry is divided by its vector's total; a vector whose total
        is not positive is copied through with its values unchanged.
        The input vectors are not modified.
    """
    output_vecs = []
    for vector in vectors:
        # Compute the total once per vector rather than once per element.
        total = sum(vector)
        if total > 0.0:
            output_vecs.append([x / total for x in vector])
        else:
            # No positive total to scale by; this also avoids dividing by zero.
            output_vecs.append(list(vector))
    return output_vecs

In [37]:
# Load the codon names; they are used below as the DataFrame column labels.
codons = []
with open('codon_data/CODON_LABEL.txt') as label_file:
    label_lines = label_file.read().split('\n')
    # The names are taken from the second line of the file, split on single spaces.
    codons = label_lines[1].strip().split(' ')

In [43]:
# Mammals: load the raw counts, normalize each row, then tag every row 'mammal'.
mammal_data = get_codon_data('codon_data/gbmam.spsum')
normalized_mammal_data = normalize_data(mammal_data)
norm_mam_dataframe = pandas.DataFrame(
    numpy.array(normalized_mammal_data),
    columns=codons,
).assign(
    # One 'mammal' entry per loaded record.
    label=pandas.Series(['mammal'] * len(mammal_data)).values,
)

In [45]:
norm_mam_dataframe

Unnamed: 0,CGA,CGC,CGG,CGU,AGA,AGG,CUA,CUC,CUG,CUU,...,UUU,AUA,AUC,AUU,AUG,UGG,UAA,UAG,UGA,label
0,0.009174,0.006881,0.004587,0.009174,0.011468,0.011468,0.000000,0.025229,0.022936,0.009174,...,0.018349,0.011468,0.034404,0.011468,0.029817,0.034404,0.002294,0.002294,0.000000,mammal
1,0.013005,0.001979,0.000707,0.000565,0.000283,0.000000,0.077891,0.019649,0.007775,0.016398,...,0.022477,0.058948,0.054142,0.039299,0.006644,0.001979,0.002969,0.000283,0.023466,mammal
2,0.015789,0.005263,0.000000,0.000000,0.002632,0.000000,0.089474,0.018421,0.005263,0.028947,...,0.023684,0.028947,0.081579,0.036842,0.002632,0.005263,0.000000,0.000000,0.026316,mammal
3,0.013158,0.005263,0.000000,0.002632,0.002632,0.000000,0.086842,0.026316,0.002632,0.010526,...,0.028947,0.031579,0.076316,0.021053,0.002632,0.002632,0.000000,0.000000,0.028947,mammal
4,0.013158,0.005263,0.000000,0.002632,0.002632,0.000000,0.094737,0.021053,0.002632,0.013158,...,0.026316,0.031579,0.078947,0.021053,0.005263,0.000000,0.000000,0.000000,0.031579,mammal
5,0.013158,0.005263,0.000000,0.002632,0.002632,0.000000,0.084211,0.026316,0.010526,0.013158,...,0.018421,0.031579,0.073684,0.023684,0.005263,0.002632,0.000000,0.000000,0.028947,mammal
6,0.013158,0.005263,0.000000,0.002632,0.002632,0.000000,0.084211,0.031579,0.007895,0.007895,...,0.018421,0.034211,0.071053,0.023684,0.002632,0.000000,0.000000,0.000000,0.031579,mammal
7,0.013158,0.005263,0.000000,0.002632,0.002632,0.000000,0.089474,0.026316,0.002632,0.013158,...,0.023684,0.034211,0.076316,0.021053,0.002632,0.000000,0.000000,0.000000,0.031579,mammal
8,0.013158,0.005263,0.000000,0.002632,0.002632,0.000000,0.086842,0.031579,0.007895,0.007895,...,0.021053,0.034211,0.078947,0.021053,0.002632,0.000000,0.000000,0.000000,0.031579,mammal
9,0.012638,0.005266,0.000000,0.002633,0.002633,0.000000,0.073196,0.024223,0.007899,0.015271,...,0.031596,0.034229,0.075829,0.021590,0.002633,0.002633,0.000000,0.000000,0.028963,mammal


In [46]:
# Plants: load the raw counts, normalize each row, then tag every row 'plant'.
plant_data = get_codon_data('codon_data/gbpln.spsum')
normalized_plant_data = normalize_data(plant_data)
norm_plt_dataframe = pandas.DataFrame(
    numpy.array(normalized_plant_data),
    columns=codons,
).assign(
    label=pandas.Series(['plant'] * len(plant_data)).values,
)

# Bacteria: same steps, rows tagged 'bacteria'.
bacteria_data = get_codon_data('codon_data/gbbct.spsum')
normalized_bacteria_data = normalize_data(bacteria_data)
norm_bct_dataframe = pandas.DataFrame(
    numpy.array(normalized_bacteria_data),
    columns=codons,
).assign(
    label=pandas.Series(['bacteria'] * len(bacteria_data)).values,
)

# Viruses: same steps, rows tagged 'virus'.
virus_data = get_codon_data('codon_data/gbvrl.spsum')
normalized_virus_data = normalize_data(virus_data)
norm_vrl_dataframe = pandas.DataFrame(
    numpy.array(normalized_virus_data),
    columns=codons,
).assign(
    label=pandas.Series(['virus'] * len(virus_data)).values,
)

In [47]:
norm_plt_dataframe

Unnamed: 0,CGA,CGC,CGG,CGU,AGA,AGG,CUA,CUC,CUG,CUU,...,UUU,AUA,AUC,AUU,AUG,UGG,UAA,UAG,UGA,label
0,0.022329,0.003190,0.009569,0.012759,0.015949,0.003190,0.017544,0.007974,0.012759,0.023923,...,0.044657,0.028708,0.009569,0.041467,0.014354,0.019139,0.003190,0.000000,0.003190,plant
1,0.018367,0.004082,0.010204,0.006122,0.018367,0.002041,0.016327,0.006122,0.006122,0.024490,...,0.069388,0.020408,0.022449,0.046939,0.010204,0.016327,0.000000,0.000000,0.002041,plant
2,0.015779,0.003945,0.011834,0.007890,0.017751,0.001972,0.015779,0.005917,0.005917,0.027613,...,0.065089,0.019724,0.019724,0.057199,0.009862,0.013807,0.001972,0.000000,0.000000,plant
3,0.015842,0.001980,0.007921,0.011881,0.023762,0.001980,0.013861,0.001980,0.005941,0.029703,...,0.067327,0.021782,0.021782,0.057426,0.011881,0.017822,0.001980,0.000000,0.000000,plant
4,0.017751,0.000000,0.007890,0.007890,0.017751,0.005917,0.019724,0.003945,0.003945,0.027613,...,0.063116,0.017751,0.017751,0.051282,0.009862,0.013807,0.000000,0.000000,0.001972,plant
5,0.017717,0.001969,0.011811,0.009843,0.015748,0.003937,0.019685,0.001969,0.007874,0.021654,...,0.064961,0.019685,0.017717,0.051181,0.011811,0.013780,0.000000,0.000000,0.001969,plant
6,0.021654,0.003937,0.005906,0.005906,0.013780,0.005906,0.013780,0.000000,0.003937,0.021654,...,0.076772,0.021654,0.021654,0.047244,0.009843,0.015748,0.001969,0.000000,0.000000,plant
7,0.021526,0.000000,0.005871,0.007828,0.013699,0.005871,0.017613,0.007828,0.001957,0.029354,...,0.064579,0.017613,0.017613,0.050881,0.009785,0.009785,0.001957,0.000000,0.000000,plant
8,0.015717,0.001965,0.011788,0.005894,0.017682,0.005894,0.019646,0.001965,0.003929,0.021611,...,0.068762,0.019646,0.015717,0.053045,0.011788,0.015717,0.000000,0.000000,0.001965,plant
9,0.015748,0.001969,0.007874,0.009843,0.021654,0.001969,0.015748,0.000000,0.005906,0.029528,...,0.066929,0.025591,0.019685,0.057087,0.011811,0.017717,0.001969,0.000000,0.000000,plant


In [51]:
# Combine the four labelled frames into one dataset.
# Each frame keeps its own row index, so index values repeat across frames.
data = pandas.concat(
    [
        norm_mam_dataframe,
        norm_plt_dataframe,
        norm_bct_dataframe,
        norm_vrl_dataframe,
    ]
)

In [52]:
data

Unnamed: 0,CGA,CGC,CGG,CGU,AGA,AGG,CUA,CUC,CUG,CUU,...,UUU,AUA,AUC,AUU,AUG,UGG,UAA,UAG,UGA,label
0,0.009174,0.006881,0.004587,0.009174,0.011468,0.011468,0.000000,0.025229,0.022936,0.009174,...,0.018349,0.011468,0.034404,0.011468,0.029817,0.034404,0.002294,0.002294,0.000000,mammal
1,0.013005,0.001979,0.000707,0.000565,0.000283,0.000000,0.077891,0.019649,0.007775,0.016398,...,0.022477,0.058948,0.054142,0.039299,0.006644,0.001979,0.002969,0.000283,0.023466,mammal
2,0.015789,0.005263,0.000000,0.000000,0.002632,0.000000,0.089474,0.018421,0.005263,0.028947,...,0.023684,0.028947,0.081579,0.036842,0.002632,0.005263,0.000000,0.000000,0.026316,mammal
3,0.013158,0.005263,0.000000,0.002632,0.002632,0.000000,0.086842,0.026316,0.002632,0.010526,...,0.028947,0.031579,0.076316,0.021053,0.002632,0.002632,0.000000,0.000000,0.028947,mammal
4,0.013158,0.005263,0.000000,0.002632,0.002632,0.000000,0.094737,0.021053,0.002632,0.013158,...,0.026316,0.031579,0.078947,0.021053,0.005263,0.000000,0.000000,0.000000,0.031579,mammal
5,0.013158,0.005263,0.000000,0.002632,0.002632,0.000000,0.084211,0.026316,0.010526,0.013158,...,0.018421,0.031579,0.073684,0.023684,0.005263,0.002632,0.000000,0.000000,0.028947,mammal
6,0.013158,0.005263,0.000000,0.002632,0.002632,0.000000,0.084211,0.031579,0.007895,0.007895,...,0.018421,0.034211,0.071053,0.023684,0.002632,0.000000,0.000000,0.000000,0.031579,mammal
7,0.013158,0.005263,0.000000,0.002632,0.002632,0.000000,0.089474,0.026316,0.002632,0.013158,...,0.023684,0.034211,0.076316,0.021053,0.002632,0.000000,0.000000,0.000000,0.031579,mammal
8,0.013158,0.005263,0.000000,0.002632,0.002632,0.000000,0.086842,0.031579,0.007895,0.007895,...,0.021053,0.034211,0.078947,0.021053,0.002632,0.000000,0.000000,0.000000,0.031579,mammal
9,0.012638,0.005266,0.000000,0.002633,0.002633,0.000000,0.073196,0.024223,0.007899,0.015271,...,0.031596,0.034229,0.075829,0.021590,0.002633,0.002633,0.000000,0.000000,0.028963,mammal


In [58]:
# Hold out 30% of the rows as a test set; the rest is used for training.
# NOTE(review): no random_state is passed, so the split is presumably
# different on every run — confirm whether reproducibility is wanted.
train, test = train_test_split(
    data,
    test_size=0.3,
)

In [59]:
train

Unnamed: 0,CGA,CGC,CGG,CGU,AGA,AGG,CUA,CUC,CUG,CUU,...,UUU,AUA,AUC,AUU,AUG,UGG,UAA,UAG,UGA,label
3240,0.000000,0.000000,0.000000,0.014599,0.014599,0.000000,0.000000,0.000000,0.007299,0.029197,...,0.007299,0.007299,0.043796,0.058394,0.014599,0.021898,0.007299,0.000000,0.000000,bacteria
5620,0.017682,0.007859,0.009823,0.011788,0.021611,0.013752,0.015717,0.005894,0.013752,0.025540,...,0.027505,0.029470,0.017682,0.039293,0.013752,0.017682,0.001965,0.000000,0.000000,plant
4620,0.028436,0.004739,0.009479,0.061611,0.014218,0.004739,0.014218,0.004739,0.004739,0.018957,...,0.014218,0.028436,0.014218,0.023697,0.014218,0.004739,0.004739,0.000000,0.000000,plant
3735,0.006048,0.004032,0.004032,0.020161,0.020161,0.002016,0.014113,0.006048,0.008065,0.018145,...,0.020161,0.006048,0.026210,0.034274,0.028226,0.000000,0.000000,0.000000,0.002016,plant
6268,0.004686,0.003749,0.005623,0.006560,0.016870,0.012184,0.009372,0.007498,0.019681,0.033739,...,0.024367,0.012184,0.012184,0.037488,0.026242,0.018744,0.000937,0.000000,0.000937,plant
611,0.002513,0.000000,0.020101,0.010050,0.027638,0.005025,0.000000,0.010050,0.002513,0.010050,...,0.027638,0.015075,0.002513,0.032663,0.007538,0.000000,0.002513,0.000000,0.000000,plant
10855,0.009115,0.003906,0.002604,0.005208,0.009115,0.002604,0.006510,0.005208,0.005208,0.029948,...,0.066406,0.039062,0.010417,0.052083,0.035156,0.020833,0.000000,0.001302,0.000000,plant
1361,0.014512,0.008795,0.005277,0.004398,0.024626,0.013193,0.021548,0.028144,0.016711,0.010114,...,0.009675,0.012753,0.016711,0.009235,0.014072,0.027704,0.000000,0.001759,0.000000,virus
819,0.015896,0.005264,0.000000,0.000105,0.002632,0.000000,0.057059,0.030003,0.002421,0.037793,...,0.030214,0.037057,0.028424,0.055795,0.002737,0.001158,0.000000,0.000000,0.030319,mammal
2134,0.005383,0.004731,0.001369,0.013741,0.009348,0.002449,0.010519,0.004214,0.003282,0.017880,...,0.032887,0.017360,0.013185,0.050428,0.027775,0.010054,0.002430,0.000542,0.000526,bacteria


In [65]:
# Fit a multi-layer perceptron on the codon columns ('CGA' through 'UGA'),
# using the 'label' column as the target. verbose=True prints the loss per iteration.
model = MLPClassifier(max_iter=500, verbose=True)
model.fit(
    train.loc[:, 'CGA':'UGA'],
    train['label'],
)

Iteration 1, loss = 1.20228372
Iteration 2, loss = 1.05998977
Iteration 3, loss = 0.95459678
Iteration 4, loss = 0.86415146
Iteration 5, loss = 0.77770544
Iteration 6, loss = 0.69346624
Iteration 7, loss = 0.61825637
Iteration 8, loss = 0.55831910
Iteration 9, loss = 0.51147546
Iteration 10, loss = 0.47509845
Iteration 11, loss = 0.44637563
Iteration 12, loss = 0.42331701
Iteration 13, loss = 0.40312052
Iteration 14, loss = 0.38599122
Iteration 15, loss = 0.37111488
Iteration 16, loss = 0.35819536
Iteration 17, loss = 0.34634924
Iteration 18, loss = 0.33604765
Iteration 19, loss = 0.32672675
Iteration 20, loss = 0.31810513
Iteration 21, loss = 0.31017178
Iteration 22, loss = 0.30321801
Iteration 23, loss = 0.29667158
Iteration 24, loss = 0.29051947
Iteration 25, loss = 0.28454864
Iteration 26, loss = 0.27969340
Iteration 27, loss = 0.27496720
Iteration 28, loss = 0.27037905
Iteration 29, loss = 0.26603393
Iteration 30, loss = 0.26184138
Iteration 31, loss = 0.25832432
Iteration 32, los

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=True, warm_start=False)

In [68]:
test

Unnamed: 0,CGA,CGC,CGG,CGU,AGA,AGG,CUA,CUC,CUG,CUU,...,UUU,AUA,AUC,AUU,AUG,UGG,UAA,UAG,UGA,label
2986,0.003654,0.029189,0.016069,0.011667,0.001365,0.002818,0.002201,0.015145,0.056837,0.005987,...,0.005987,0.001145,0.029189,0.008629,0.018887,0.010434,0.000660,0.000308,0.001673,bacteria
2010,0.007275,0.000661,0.001984,0.003968,0.028439,0.008598,0.015873,0.004630,0.004630,0.005291,...,0.011243,0.033069,0.027116,0.019180,0.023810,0.013889,0.001984,0.000000,0.000000,virus
1021,0.019569,0.003914,0.001957,0.011742,0.017613,0.003914,0.009785,0.003914,0.001957,0.031311,...,0.043053,0.027397,0.013699,0.039139,0.015656,0.017613,0.000000,0.000000,0.001957,plant
10735,0.005069,0.008646,0.002982,0.002683,0.012522,0.010733,0.003876,0.025939,0.010435,0.022660,...,0.007752,0.011628,0.041145,0.026237,0.024747,0.013417,0.001789,0.000000,0.000894,plant
530,0.011696,0.000000,0.003899,0.009747,0.011696,0.000000,0.017544,0.007797,0.000000,0.015595,...,0.089669,0.023392,0.007797,0.046784,0.013645,0.011696,0.001949,0.000000,0.000000,plant
8735,0.023810,0.003968,0.003968,0.009921,0.019841,0.005952,0.019841,0.005952,0.009921,0.023810,...,0.047619,0.027778,0.009921,0.037698,0.017857,0.015873,0.000000,0.000000,0.001984,plant
2668,0.010097,0.004829,0.006585,0.006585,0.019315,0.010975,0.014486,0.001317,0.014047,0.013608,...,0.035119,0.021949,0.003951,0.022827,0.016242,0.012730,0.001756,0.000439,0.000439,virus
1487,0.018587,0.001859,0.003717,0.013011,0.024164,0.005576,0.014870,0.009294,0.016729,0.027881,...,0.063197,0.033457,0.018587,0.039033,0.014870,0.016729,0.000000,0.001859,0.000000,plant
3737,0.006024,0.002008,0.004016,0.022088,0.022088,0.002008,0.014056,0.006024,0.004016,0.020080,...,0.020080,0.006024,0.026104,0.032129,0.030120,0.000000,0.000000,0.000000,0.002008,plant
9688,0.001101,0.033040,0.004405,0.008811,0.000000,0.003304,0.004405,0.020925,0.041850,0.005507,...,0.003304,0.002203,0.042952,0.005507,0.022026,0.006608,0.002203,0.000000,0.000000,plant


In [85]:
# True label of the first held-out row, kept as a one-element Series.
test['label'].iloc[[0]]

2986    bacteria
Name: label, dtype: object

In [90]:
# Predicted label for that same first held-out row (codon columns only).
model.predict(test.loc[:, 'CGA':'UGA'].iloc[[0]])

array(['bacteria'], 
      dtype='<U8')

In [89]:
# Print "<true label> <predicted label>" for every held-out row, in row order.
# The whole test set is predicted in one call instead of one model.predict
# call per row; the printed text is the same.
predicted_labels = model.predict(test.loc[:, 'CGA':'UGA'])
for actual, predicted in zip(test['label'].values, predicted_labels):
    print("{0} {1}".format(str(actual), str(predicted)))

bacteria bacteria
virus virus
plant plant
plant plant
plant plant
plant plant
virus virus
plant plant
plant plant
plant plant
plant plant
virus virus
plant plant
virus plant
plant plant
plant plant
virus virus
bacteria bacteria
plant plant
virus bacteria
plant plant
plant plant
bacteria bacteria
plant plant
plant plant
virus virus
plant plant
plant plant
plant plant
plant plant
virus virus
plant plant
mammal mammal
plant plant
plant plant
virus virus
plant plant
plant plant
plant plant
plant plant
bacteria bacteria
plant plant
plant plant
virus virus
virus virus
plant plant
bacteria bacteria
plant plant
plant plant
virus virus
plant plant
bacteria bacteria
virus virus
plant plant
bacteria bacteria
plant virus
plant plant
plant plant
plant plant
mammal mammal
bacteria bacteria
plant plant
plant virus
plant plant
plant plant
plant plant
virus virus
bacteria bacteria
plant plant
virus virus
plant plant
bacteria plant
plant plant
plant plant
mammal mammal
plant plant
virus virus
plant plant

## Yay!

It works!