In [1]:
import numpy as np
import os
from sklearn.externals import joblib
from sklearn import preprocessing
from sklearn.feature_selection import SelectPercentile, f_classif
import pprint

## Parse the AAindex database to extract the physicochemical properties of the amino acids as features
## http://www.genome.jp/aaindex/

def create_aa_feature_dictionary(aa_index):
    """
    Build the lookup table from amino acid code to feature vector.

    The position of a one-letter code in the tuple below is the column index
    that is handed to get_as_features for that amino acid.
    In:  aa_index - list of property rows, passed on to get_as_features
    Out: Dictionary: one-letter amino acid code : feature vector
    """
    one_letter_codes = ("A", "R", "N", "D", "C", "Q", "E", "G", "H", "I",
                        "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V")
    return {
        code: get_as_features(position, aa_index)
        for position, code in enumerate(one_letter_codes)
    }



def parse_aa_index(file):
    """
    Parse an AAindex flat file into one row of raw values per index entry.

    Every line that starts with "I" is treated as the header of a data
    section; the two lines following it are joined and split on whitespace.

    In:  file - path of the AAindex file to read
    Out: List of aa Properties; one list of string tokens per "I" line
         (values are not converted to float here and may contain 'NA')
    Raises: IndexError if an "I" line has fewer than two lines after it
    """
    # The context manager closes the file even when reading fails.
    with open(file, "r") as input_file:
        lines = input_file.read().splitlines()

    total_aa_index = []
    for i, line in enumerate(lines):
        # startswith() is safe on blank lines, where line[0] would raise.
        if line.startswith("I"):
            # Join with a space so the last value of the first data line can
            # never fuse with the first value of the second data line.
            values = (lines[i + 1] + " " + lines[i + 2]).split()
            total_aa_index.append(values)
    return total_aa_index

def get_as_features(x, aa_index):
    """
    Collect the value at position x from every cleaned property row.
    In:  x        - position of the amino acid within each property row
         aa_index - property rows; they are run through
                    clean_aa_index_properties before being read
    Out: List of amino acid features (one value per cleaned property row)
    """
    cleaned_rows = clean_aa_index_properties(aa_index)
    return [row[x] for row in cleaned_rows]

def clean_aa_index_properties(aa_index):
    """
    Remove properties with NA and convert the remaining values to float.

    The result is built as a new list in a single pass; the list passed in
    is not modified, so calling this function repeatedly on the same input
    always yields the same result.

    In:  aa_index - list of property rows (strings or numbers)
    Out: processed list of aa index properties (list of lists of float)
    Raises: ValueError if a kept row holds a value float() cannot convert
    """
    return [
        [float(value) for value in feature]
        for feature in aa_index
        # Rows with at least one missing value are dropped entirely.
        if 'NA' not in feature
    ]

# Read the raw property rows from the AAindex file; the path is relative,
# so the file has to be in the notebook's working directory.
aa_index = parse_aa_index("aa_index.txt")
# Drop the rows that contain 'NA' and convert the remaining values to float.
aa_index_properties = clean_aa_index_properties(aa_index)
# Build the mapping: one-letter amino acid code -> list of property values.
aa_features = create_aa_feature_dictionary(aa_index_properties)

# Print the complete dictionary (note: the resulting output is very long).
pprint.pprint(aa_features)

{'A': [4.35,
       0.61,
       1.18,
       1.56,
       1.0,
       0.77,
       0.37,
       0.357,
       52.6,
       16.0,
       44.0,
       7.3,
       3.9,
       -0.2,
       0.691,
       8.249,
       4.349,
       6.5,
       0.486,
       0.288,
       0.52,
       0.046,
       -0.368,
       0.71,
       -0.118,
       0.0,
       0.0,
       0.0,
       0.0,
       0.0,
       0.0,
       91.5,
       115.0,
       25.0,
       0.38,
       0.2,
       0.66,
       1.42,
       0.83,
       0.74,
       1.29,
       1.2,
       0.7,
       0.52,
       0.86,
       0.75,
       0.67,
       0.74,
       0.06,
       0.076,
       0.035,
       0.058,
       0.64,
       -0.45,
       -0.08,
       0.36,
       0.17,
       0.02,
       0.75,
       1.33,
       1.0,
       0.6,
       2.5,
       8.6,
       100.0,
       1.56,
       1.26,
       0.25,
       0.67,
       0.0,
       0.0,
       89.09,
       297.0,
       1.8,
       9.69,
       2.34,
       0.31,

       0.91,
       0.72,
       0.79,
       1.15,
       1.17,
       1.43,
       11.7,
       -0.15,
       1.068,
       1.372,
       1.022,
       0.822,
       0.87,
       0.932,
       1.266,
       1.038,
       1.89,
       3.02,
       -0.41,
       1.6,
       0.42,
       0.429,
       0.424,
       6.3,
       6.0,
       5.7,
       4.6,
       4.9,
       0.87,
       -97.0,
       -78.0,
       -47.0,
       -29.0,
       0.0,
       45.0,
       248.0,
       1.5,
       0.27,
       1.12,
       4.9,
       5.2,
       3.8,
       5.5,
       4.7,
       7.05,
       8.57,
       8.71,
       7.91,
       3.5,
       3.37,
       4.46,
       2.8,
       5.14,
       5.73,
       6.38,
       5.5,
       0.0,
       114.4,
       117.3,
       0.1263,
       -1.1,
       -0.2,
       -2.84,
       0.7,
       -0.48,
       1.35,
       2.0,
       -3.6,
       -0.1233,
       -0.0552,
       0.0047,
       0.016,
       0.37,
       0.41,
       0.916,
       0.892

       1.45,
       0.94,
       2.39,
       52.6,
       -0.34,
       -0.41,
       1.6,
       1.6,
       1.8,
       0.3,
       0.9,
       0.6,
       0.4,
       0.6,
       0.5,
       0.5,
       0.5,
       0.1,
       0.2,
       0.2,
       3.9,
       1.2,
       0.6,
       -8.6,
       -5.2,
       -6.8,
       -8.3,
       -4.2,
       -3.9,
       -3.4,
       5.7,
       -1.2,
       5.7,
       5.9,
       4.9,
       5.6,
       0.3,
       62.9,
       0.72,
       0.0,
       0.0,
       0.0,
       0.0,
       0.16,
       0.37,
       -0.073,
       -0.017,
       0.591,
       0.1,
       -0.67,
       0.5,
       0.36,
       0.725,
       1.015,
       1.848,
       0.901,
       4.259,
       0.5,
       2.355,
       1.349,
       0.051,
       0.39,
       0.049,
       0.00499,
       0.0,
       -7.85,
       7.99,
       0.92,
       0.41,
       -0.16,
       0.36,
       0.27,
       7.9,
       2.39,
       2.23,
       -5.36,
       0.3,
       7.

       -0.3,
       -12.366,
       -9.666,
       2.55,
       0.61,
       -1.71,
       -9.97,
       10.68,
       1.23,
       1.13,
       0.77,
       0.75,
       0.95,
       0.95,
       1.01,
       1.08,
       1.2,
       1.0,
       0.73,
       0.83,
       1.18,
       0.66,
       1.27,
       0.86,
       5.7,
       -3.31,
       10.8,
       5.72,
       2.12,
       11.96,
       13.28,
       9.93,
       1.79,
       4.88,
       0.391,
       0.058,
       0.407,
       36.8,
       1.23,
       0.77,
       0.96,
       1.0,
       0.7,
       0.26,
       0.12,
       -0.17,
       -0.19,
       0.03,
       -0.11,
       0.16,
       0.23,
       0.37,
       0.47,
       0.28,
       0.41,
       0.45,
       0.03,
       0.08,
       -0.09,
       0.04,
       -0.29,
       -0.46,
       -0.59,
       -0.55,
       -0.51,
       -0.33,
       -0.44,
       -0.39,
       -0.43,
       -0.42,
       -0.2,
       0.33,
       0.0,
       0.14,
       0.45,
   

       2.2,
       0.2,
       38.3,
       41.7,
       207.9,
       0.6,
       0.74,
       1.56,
       1.14,
       2.12,
       0.88,
       0.92,
       2.16,
       60.1,
       22.0,
       49.0,
       0.4,
       -0.5,
       0.09,
       2.02,
       0.043,
       104.0,
       3.0,
       397.0,
       0.89,
       0.62,
       0.64,
       0.33,
       1.117,
       1.006,
       0.93,
       -3.6,
       0.0,
       5.9,
       6.24,
       0.31,
       58.7,
       -3.5,
       -0.87,
       0.2,
       1.98,
       117.5,
       207.1,
       1.45,
       5.0,
       0.1,
       0.9,
       0.76,
       1.28,
       0.95,
       0.73,
       1.25,
       0.42,
       0.6,
       0.54,
       0.62,
       11.42,
       0.64,
       0.74,
       3.14,
       2.62,
       1.11,
       0.87,
       13.28,
       0.8,
       -1.6,
       -4.2,
       -3.0,
       0.98,
       1.04,
       70.0,
       1.7,
       0.77,
       0.72,
       1.38,
       4.33,
       2.33,
  

       3.2,
       -0.12,
       0.728,
       8.274,
       4.396,
       6.9,
       0.262,
       0.362,
       0.68,
       0.291,
       -1.03,
       1.06,
       0.124,
       1.0,
       1.0,
       1.0,
       5.0,
       0.0,
       1.0,
       202.0,
       225.0,
       90.0,
       0.01,
       0.0,
       0.95,
       0.98,
       0.93,
       1.01,
       0.44,
       1.25,
       0.34,
       1.24,
       0.9,
       0.9,
       0.89,
       1.05,
       0.07,
       0.106,
       0.099,
       0.085,
       1.05,
       -0.24,
       -0.09,
       -0.52,
       -0.7,
       -0.42,
       0.7,
       0.79,
       0.74,
       0.79,
       7.5,
       4.9,
       65.0,
       0.59,
       0.38,
       -1.76,
       -2.1,
       10.0,
       -0.96,
       174.2,
       238.0,
       12.5,
       8.99,
       1.82,
       -1.01,
       2.34,
       0.69,
       6.13,
       7.82,
       1.52,
       6.24,
       11.1,
       0.04,
       4.0,
       3.0,
       1.0,
      

       0.74,
       -7.0,
       -3.0,
       10.0,
       20.0,
       34.0,
       79.0,
       174.0,
       0.7,
       0.39,
       -0.7,
       6.0,
       6.1,
       5.6,
       5.3,
       5.1,
       3.83,
       5.36,
       8.95,
       5.16,
       4.98,
       5.55,
       5.62,
       5.0,
       4.45,
       5.46,
       7.12,
       5.08,
       0.0,
       119.6,
       121.5,
       0.0941,
       -0.08,
       0.65,
       1.81,
       2.31,
       0.19,
       1.05,
       7.0,
       -0.7,
       0.0589,
       0.0239,
       0.1462,
       -0.163,
       0.02,
       -0.08,
       1.017,
       1.023,
       1.018,
       0.992,
       0.822,
       0.988,
       1.11,
       0.832,
       1.189,
       2.18,
       2.45,
       10.3,
       -0.0701,
       120.0,
       126.0,
       0.871,
       -0.02,
       0.89,
       0.63,
       0.38,
       0.21,
       0.18,
       -0.34,
       -0.26,
       0.45,
       -0.4,
       49.26,
       -0.289,
       -3.99

       0.463,
       0.05481,
       1.0,
       -16.19,
       8.07,
       0.2,
       0.86,
       -0.33,
       -0.12,
       -0.61,
       5.3,
       -5.88,
       -4.75,
       3.65,
       0.85,
       9.9,
       5.7,
       17.19,
       16.87,
       0.275,
       0.31,
       21.67,
       2.1,
       5.89,
       17.1,
       0.91,
       0.9,
       1.06,
       0.68,
       0.94,
       1.26,
       1.1,
       1.68,
       1.1,
       0.68,
       0.68,
       1.52,
       1.57,
       1.0,
       1.0,
       0.58,
       0.58,
       1.06,
       0.91,
       1.25,
       12.4,
       -0.45,
       0.904,
       1.186,
       0.938,
       0.796,
       0.69,
       0.91,
       0.755,
       0.671,
       1.1,
       6.1,
       -0.48,
       0.7,
       0.58,
       0.268,
       0.291,
       1.3,
       1.2,
       1.2,
       1.0,
       24.2,
       1.8,
       59.0,
       69.0,
       102.0,
       118.0,
       116.0,
       130.0,
       179.0,
       0.9,
  

In [2]:
# Length of the feature vector stored under key 'A', i.e. the number of
# property rows that were kept after cleaning.
len(aa_features.get('A'))

531