In [1]:
import numpy as np 
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import json

In [2]:
# Load the reduced feature table; the first CSV column is used as the row index.
data = pd.read_csv(
    'reduced_features_data.csv',
    index_col=0,
)

# The CSV stores each codon sequence as the string form of a list/array,
# so it has to be parsed back into a Python list of codon strings.
def clean_list(list_string):
    """Parse a stringified codon sequence back into a list of tokens.

    Accepts both the whitespace-separated form ("['ATG' 'AAA']", possibly
    wrapped over several lines) and the comma-separated form
    ("['ATG', 'AAA']").

    Parameters
    ----------
    list_string : str
        Text representation of the sequence as read from the CSV.

    Returns
    -------
    list of str
        The tokens in their original order; an empty list if none remain.
    """
    # Drop the brackets and quotes that come from the list/array repr.
    for char in "[]'":
        list_string = list_string.replace(char, '')
    # Treat commas as separators so a comma-separated repr does not leave
    # tokens such as "ATG," behind.
    return list_string.replace(',', ' ').split()

# Parse every stored string back into a list of codons.
parsed_codons = data['codon_array'].apply(clean_list)
data['codon_array'] = parsed_codons

In [3]:
codon_list = ['AAA', 'AAT', 'AAC', 'AAG', 'ATA', 'ATT', 'ATC', 'ATG', 'ACA', 'ACT',
              'ACC', 'ACG', 'AGA', 'AGT', 'AGC', 'AGG', 'TAA', 'TAT', 'TAC', 'TAG',
              'TTA', 'TTT', 'TTC', 'TTG', 'TCA', 'TCT', 'TCC', 'TCG', 'TGA', 'TGT',
              'TGC', 'TGG', 'CAA', 'CAT', 'CAC', 'CAG', 'CTA', 'CTT', 'CTC', 'CTG',
              'CCA', 'CCT', 'CCC', 'CCG', 'CGA', 'CGT', 'CGC', 'CGG', 'GAA', 'GAT',
              'GAC', 'GAG', 'GTA', 'GTT', 'GTC', 'GTG', 'GCA', 'GCT', 'GCC', 'GCG',
              'GGA', 'GGT', 'GGC', 'GGG']

# NOTE: this cell (re)writes codon_one_hot.json. Once the file exists, convert
# the cell to markdown so the stored encoding is not regenerated by accident.

# Sorting fixes the position of every codon, so the encoding does not depend
# on the order in which the list above was typed.
codon_list.sort()

# Vector length follows the list instead of repeating a hard-coded 64.
n_codons = len(codon_list)

codon_one_hot = {}
for idx, codon in enumerate(codon_list):
    one_hot = np.zeros(n_codons)
    one_hot[idx] = 1
    # .tolist() yields plain Python floats, ready for storage in JSON
    codon_one_hot[codon] = one_hot.tolist()

# `json` comes from the notebook's import cell.
with open('codon_one_hot.json', 'w') as f:
    # sorted keys and indentation keep the file readable and diff-friendly
    json.dump(codon_one_hot, f, sort_keys=True, indent=4)

In [4]:
# Read the stored codon -> one-hot mapping back from disk.
with open('codon_one_hot.json') as encoding_file:
    codon_one_hot = json.load(encoding_file)

In [5]:
def list_to_one_hot_matrix(codon_array, encoding=None):
    """Convert a sequence of codons into a list of one-hot vectors.

    Parameters
    ----------
    codon_array : list of str
        Codons in sequence order.
    encoding : dict, optional
        Maps each codon to its one-hot vector. Defaults to the notebook-level
        ``codon_one_hot`` table.

    Returns
    -------
    list of list, or None
        One vector per codon, in order. None is returned when the sequence
        is empty or its first element is 'nan', which marks the row as
        garbage.

    Raises
    ------
    KeyError
        If a codon is not present in the encoding table.
    """
    if encoding is None:
        encoding = codon_one_hot

    # Mark for garbage if the sequence is empty or missing (nan). The length
    # check comes first so an empty sequence cannot raise IndexError.
    if len(codon_array) == 0 or str(codon_array[0]) == 'nan':
        return None

    # Otherwise look up the one-hot vector of every codon, preserving order.
    return [encoding[codon] for codon in codon_array]

# Encode every codon sequence as a list of one-hot vectors.
one_hot_matrices = data['codon_array'].apply(list_to_one_hot_matrix)
data['one_hot_matrix'] = one_hot_matrices

In [6]:
# Keep only the columns that go into the exported one-hot dataset.
export_columns = ['prest_id', 'uniprot_id', 'conc_cf', 'one_hot_matrix']
one_hot_data = data[export_columns]
one_hot_data.shape

(45206, 4)

### This DataFrame is far too big to store in a single file on GitHub, so I will split it into several smaller files.

In [7]:
# Number of CSV files the frame is split into.
n_chunks = 25
n_rows = one_hot_data.shape[0]

# Integer arithmetic gives exact, evenly spaced row offsets. A float `step`
# in np.arange can produce one point too many or too few through rounding.
split_points = [(chunk * n_rows) // n_chunks for chunk in range(n_chunks)]

for idx, start in enumerate(split_points):
    # Files are numbered from 1; the last chunk runs to the end of the frame.
    j = idx + 1
    end = split_points[j] if j < len(split_points) else None
    outfile = f'one_hot_data/DF_one_hot_{j}.csv'
    # .iloc slices by position and excludes `end`. Label-based .loc includes
    # both endpoints, so a boundary row could be written to two files.
    one_hot_data.iloc[start:end, :].to_csv(outfile)