In [2]:
import pandas as pd
import distance
import os

### 1. Generating data for each of the tRNA types

In [50]:
def extract_adj_list(aa, d=1, db_dir='../copies_free_db/'):
    """Build the Hamming-neighbour table for one tRNA type.

    Reads ``<db_dir>/<aa>/stems.txt`` (tab-separated), concatenates the four
    ``Stem_*`` columns of every row into one sequence, and collapses rows with
    an identical sequence into a single node.

    Parameters
    ----------
    aa : str
        Name of the sub-directory of ``db_dir`` that holds ``stems.txt``.
    d : int, optional
        Largest Hamming distance at which two sequences are linked.
        For standard graphs ``d=1``.
    db_dir : str, optional
        Folder containing one sub-directory per tRNA type.

    Returns
    -------
    pandas.DataFrame
        One row per distinct sequence, in sorted sequence order.  The index is
        the ', '-joined content of the remaining non-stem column (renamed to
        ``Species``) for all input rows sharing that sequence.  Columns:

        * ``Adjacency`` - ascending list of row positions whose sequence is
          within distance ``d`` of this row's sequence.
        * ``Frequency`` - number of input rows that carry this sequence.

    Notes
    -----
    ``distance.hamming`` is called on every pair of sequences, so all
    sequences must have the same length or it raises ``ValueError``.
    """
    # DataFrame.from_csv no longer exists in current pandas; read_csv with the
    # default index handling is the equivalent of from_csv(index_col=None).
    stems = pd.read_csv(os.path.join(db_dir, aa, 'stems.txt'), sep='\t')
    stems['Sequence'] = stems.Stem_1 + stems.Stem_2 + stems.Stem_3 + stems.Stem_4
    stems = stems.drop(['Stem_1', 'Stem_2', 'Stem_3', 'Stem_4', 'Variable_stem'], axis=1)
    # Exactly two columns are expected to remain: the name column and Sequence.
    stems.columns = ['Species', 'Sequence']

    # Group by sequence, but record all the corresponding species.
    by_sequence = stems.groupby('Sequence')['Species']
    species = by_sequence.apply(lambda names: ', '.join(names))
    group_sizes = by_sequence.size()

    aa_list = list(species.index)
    sp_list = list(species)

    # Frequency has to be taken from the group sizes: once the rows are
    # grouped every sequence is unique, so counting matches in the grouped
    # frame (as was done before) always yields 1.
    aa_freq = [int(group_sizes[seq]) for seq in aa_list]

    # Hamming distance is symmetric, so each unordered pair is evaluated once
    # and recorded for both ends.  Neighbours still come out in ascending
    # order: for row k, smaller indices are appended while they are the outer
    # index, larger ones while k itself is.
    n_nodes = len(aa_list)
    aa_adj_list = [[] for _ in range(n_nodes)]
    for ind1 in range(n_nodes):
        for ind2 in range(ind1 + 1, n_nodes):
            if distance.hamming(aa_list[ind1], aa_list[ind2]) <= d:
                aa_adj_list[ind1].append(ind2)
                aa_adj_list[ind2].append(ind1)

    return pd.DataFrame({'Adjacency': aa_adj_list, 'Frequency': aa_freq},
                        index=sp_list, columns=['Adjacency', 'Frequency'])

In [51]:
# For every tRNA-type folder in the database, build the neighbour table at
# Hamming-distance cutoffs 1, <=2 and <=3 and write each one as a TSV file
# under ../data/<cutoff>/<aa>.txt.
for i, aa in enumerate(os.listdir('../copies_free_db/')):
    # Entries with a dot in their name and the ReadMe are not tRNA-type folders.
    if '.' in aa or 'ReadMe' in aa:
        continue
    # Progress line; the format reproduces what the Python 2 statement
    # `print i, '\t', aa` wrote, and works unchanged on Python 3.
    print('%s \t%s' % (i, aa))
    for max_dist in (1, 2, 3):
        out_dir = '../data/' + str(max_dist) + '/'
        # Make sure the target folder exists so the write cannot fail after
        # the (slow) pairwise comparison has already been done.
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)
        db = extract_adj_list(aa, max_dist)
        db.to_csv(out_dir + aa + '.txt', sep='\t')

ile
gly
ala
met
phe
gln
his
cys
tyr
lys
glu
trp
thr
leu
ser
arg
sel
val
pro
asn
asp


### 2. Generating Meta Graph

In [76]:
aas = ['ala','asp','val','pro','ser','glu','leu','thr','arg','gln','ile','asn','his','lys','cys','phe','tyr','met','trp']


def _load_tagged_stems(aa):
    """Load one tRNA type and return its distinct sequences with tagged names.

    Reads ``../copies_free_db/<aa>/stems.txt``, joins the four ``Stem_*``
    columns into ``Sequence``, appends ``'_' + aa`` to every entry of the
    remaining name column (renamed to ``Species``) and collapses rows with the
    same sequence, joining their names with ', '.

    Returns a DataFrame with columns ``Sequence`` and ``Species``.
    """
    stems = pd.read_csv('../copies_free_db/' + aa + '/stems.txt', sep='\t')
    stems['Sequence'] = stems.Stem_1 + stems.Stem_2 + stems.Stem_3 + stems.Stem_4
    stems = stems.drop(['Stem_1','Stem_2','Stem_3','Stem_4', 'Variable_stem'], axis=1)
    # Exactly two columns are expected to remain: the name column and Sequence.
    stems.columns = ['Species', 'Sequence']
    stems['Species'] = stems['Species'] + '_' + aa
    return stems.groupby(['Sequence'])['Species'].apply(lambda x: (', ').join(x)).reset_index()


# 'gly' is loaded first and the types in `aas` follow in list order; that row
# order determines the order of the names inside each joined Species string.
frames = []
for aa in ['gly'] + aas:
    frames.append(_load_tagged_stems(aa))

# One concat instead of growing the frame with DataFrame.append in the loop
# (append re-copies the frame every iteration and is gone from current pandas).
df = pd.concat(frames)

#grouping by sequence, but recording all the corresponding species
df = df.groupby(['Sequence'])['Species'].apply(lambda x: (', ').join(x)).reset_index()

aa_list = list(df.Sequence)
sp_list = list(df.Species)

In [None]:
# Pull the sequences into a plain list once: indexing the Series by label
# inside the pairwise loop is far slower than indexing a list.
sequences = list(df.Sequence)
n_nodes = len(sequences)

# Link every pair of sequences at Hamming distance <= 1.  The distance is
# symmetric, so each unordered pair is evaluated once and stored for both
# ends; every neighbour list still ends up in ascending order.
aa_adj_list = [[] for _ in range(n_nodes)]
for ind1 in range(n_nodes):
    for ind2 in range(ind1 + 1, n_nodes):
        if distance.hamming(sequences[ind1], sequences[ind2]) <= 1:
            aa_adj_list[ind1].append(ind2)
            aa_adj_list[ind2].append(ind1)

# Number of rows of df holding the same sequence as each row.
# NOTE(review): df comes out of a groupby on Sequence, so every sequence is
# unique and this is 1 for every row.  If Frequency is meant to be the number
# of merged species/tRNA entries, it has to be counted before that groupby -
# confirm the intended meaning.
aa_freq = df.Sequence.map(df.Sequence.value_counts()).tolist()

result = pd.DataFrame({'Adjacency': aa_adj_list, 'Frequency': aa_freq},
                      index=sp_list, columns=['Adjacency', 'Frequency'])

In [None]:
# Store the meta graph as a tab-separated table next to the distance-1 graphs.
meta_graph_path = '../data/1/metaGraph.txt'
result.to_csv(meta_graph_path, sep='\t')

### 3. Generating JSON for D3 visualization