In [1]:
import numpy as np
import random
import pandas as pd

from sklearn.model_selection import train_test_split

from IPython.display import Audio, display
def allDone():
    """Play a short audio clip in the notebook, e.g. to signal that a slow cell finished.

    The clip is fetched from a hard-coded remote URL and starts playing
    automatically as soon as the widget is displayed.
    """
    clip_url = 'http://www.wavsource.com/snds_2020-03-30_7102365145747638/animals/cat_meow2.wav'
    notification = Audio(url=clip_url, autoplay=True)
    display(notification)
# allDone()

# Seed Python's built-in RNG so any use of `random` is repeatable.
random.seed(666)

# Load the family names, the motif strings and the motif-by-family matrix.
# The three reads are independent of one another.
fams = np.genfromtxt('data__for_nerds/fams.csv', dtype='U')
motifs = np.genfromtxt('data__for_nerds/motifs.csv', dtype='U')
motifxFamMatrix = np.genfromtxt(
    'data__for_nerds/motifxFamMatrix.csv',
    dtype=int,
    delimiter=',',
)

# Report how many motifs were loaded.
print(len(motifs))

7866


In [2]:
# Split motif *indices* (not the motifs themselves) 85/15 so the same index
# lists can later be used to slice any array aligned with `motifs`.
X_train, X_test = train_test_split(
    range(len(motifs)),
    random_state=666,
    test_size=0.15,
)

test_motifs = motifs[X_test]
train_motifs = motifs[X_train]

## Look at CD-HIT results.

In [None]:
from Bio import SeqIO

results = '1588970924.result/1588970924.fas.db2novel.clstr.sorted'

# Preview the first records of the CD-HIT result file (indices 0..1000).
# The file is opened in a `with` block so the handle is closed even though
# the loop exits early via `break`.
with open(results) as handle:
    fasta_sequences = SeqIO.parse(handle, 'fasta')
    for i, fasta in enumerate(fasta_sequences):
        name, sequence = fasta.id, str(fasta.seq)
        print(i, sequence.split())
        if i == 1000:
            break

In [None]:
# Pick one motif from each split by position and compare their family labels.
m1, m2 = train_motifs[2268], test_motifs[128]

# Also show the character at index 7 of each motif.
print(m1, m2, m1[7], m2[7])

# Position of each motif in the full `motifs` array (first match).
i1 = np.where(motifs == m1)[0][0]
i2 = np.where(motifs == m2)[0][0]
print(i1, i2)

# Column indices where the motif's row of the motif-by-family matrix is 1.
fi1 = np.where(motifxFamMatrix[i1] == 1)[0]
fi2 = np.where(motifxFamMatrix[i2] == 1)[0]
print(fi1, fi2, sep='\n')

print(motifxFamMatrix[i1], motifxFamMatrix[i2], sep='\n')

# Family names selected by those column indices.
print(fams[fi1], fams[fi2], sep='\n')

## Find Hamming-near motifs between test and train sets.

In [5]:
### https://biology.stackexchange.com/questions/23523/hamming-distance-between-two-dna-strings
def inv_hamming_distance_COPYPASTA(s1, s2):
    """Return the fraction of aligned positions at which s1 and s2 agree.

    This is a similarity (1.0 means identical), not a Hamming distance.
    Every character counts, including any '_' padding.

    Raises:
        ValueError: if the two sequences differ in length.
    """
    length = len(s1)
    if length != len(s2):
        raise ValueError("Undefined for sequences of unequal length")
    n_equal = 0
    for left, right in zip(s1, s2):
        if left == right:
            n_equal += 1
    return n_equal / length

In [6]:
def inv_hamming_distance(s1, s2):
    """Return the fraction of matching positions, ignoring '_' padding.

    This is a similarity score (higher means more alike), not a distance.
    A position counts as a match only when both characters are equal and
    are not the padding character '_'. The match count is divided by the
    sequence length minus the larger of the two padding counts, so a
    padded motif is scored over its real characters only.

    Args:
        s1, s2: equal-length sequences of characters.

    Returns:
        float: matches / comparable length. Returns 0.0 when there are no
        comparable positions (empty input, or one sequence is all padding)
        instead of dividing by zero.

    Raises:
        ValueError: if the two sequences differ in length.
    """
    if len(s1) != len(s2):
        raise ValueError("Undefined for sequences of unequal length")
    ### don't want to count "empty" characters ...
    max_buff_length = max(sum(1 for c in s1 if c == '_'),
                          sum(1 for c in s2 if c == '_'))
    comparable_length = len(s2) - max_buff_length
    if comparable_length == 0:
        # Nothing to compare; avoid ZeroDivisionError.
        return 0.0
    # ch1 == ch2 and ch1 != '_' already implies ch2 != '_'.
    matches = sum(1 for ch1, ch2 in zip(s1, s2) if ch1 == ch2 and ch1 != '_')
    return matches / comparable_length

In [7]:
# Sanity check: the padding-aware score and the plain score should differ on
# padded motifs and agree once the padding is stripped.
t1 = 'GEDEESESD______'
t2 = 'AEEKEAKSD______'

print(
    inv_hamming_distance(t1, t2),
    inv_hamming_distance_COPYPASTA(t1, t2),
    sep='\n',
)

print(
    inv_hamming_distance_COPYPASTA('GEDEESESD', 'AEEKEAKSD'),
    inv_hamming_distance('GEDEESESD', 'AEEKEAKSD'),
    sep='\n',
)

0.4444444444444444
0.6666666666666666
0.4444444444444444
0.4444444444444444


In [8]:
import time
import itertools

# Minimum padding-aware match fraction for a train/test pair to be flagged.
SIMILARITY_THRESHOLD = 0.6

# perf_counter is meant for measuring elapsed time and is not affected by
# system clock adjustments the way time.time() is.
start = time.perf_counter()

# Score every (train motif, test motif) pair and keep the similar ones as
# [train_motif, test_motif, score].
similar_motifs = []
for combo in itertools.product(train_motifs, test_motifs):
    score = inv_hamming_distance(combo[0], combo[1])
    if score >= SIMILARITY_THRESHOLD:
        print(score, combo)
        similar_motifs.append([combo[0], combo[1], score])
print("%5.3f mins" % ((time.perf_counter() - start) / 60))
allDone()


0.6 ('SPGAPGGSGSQPNQK', 'VPEAPGGSAV_____')
0.6 ('KEVEDKESEGEEEDE', 'KAARVLGSEGEEEDE')
0.6 ('PSWLRRASAPLPGLS', 'ASPARRASAILPGVL')
0.8 ('SAAAAIVSPEEELDG', 'PAADAIMSPEEELDG')
0.8666666666666667 ('GETLPDSTPLGFYLR', 'GETLPDSTPLGLYLK')
0.6666666666666666 ('______MSSKKAKTK', 'MIFDPTMSKKKKKKK')
0.6666666666666666 ('______MSSKKAKTK', '___MSSESSKKRKPK')
0.6666666666666666 ('GIPIRVYTHEVVTLW', 'GSPNRAYTHQVVTRW')
0.6 ('_____MSSPPPARSG', 'PSEPMMSTPPPASEL')
0.6666666666666666 ('SRRSRSRSRSRSPGR', 'SPRRRRRSRSRSRSR')
0.6 ('AVEEDAESEDEEEED', 'DQPEDAGSEDELEEG')
0.9333333333333333 ('ESSPLPTSPKFLRPN', 'ESSPLPTSPKFLRQN')
0.8666666666666667 ('LKKSLRQSFRRMRRS', 'LKKSLRQSFRRIRKS')
0.8 ('DTRQIQPSPPWSYDQ', 'DPRQAQSSPPWSYDQ')
0.6 ('HTPKTADSQETKESQ', 'HTPKTDDSQEKTDDS')
0.8666666666666667 ('ERVSRKMSIQEYELI', 'ERFSRKMSVQEYELI')
0.8 ('RNSPVTKTPPRDLPT', 'RNSPVAKTPPKDLPA')
0.8 ('ERPYRRESEI_____', 'PRPLRRESEI_____')
0.6 ('KRTADSSSSEDEEEY', 'DLTIDSSSDEEEEEP')
0.6 ('RGRGRGYYQGGGGRY', 'RGRGRGGSIRGRGRG')
0.6666666666666666 (

0.9333333333333333 ('PEETNNDYETADGGY', 'LEETNNDYETADGGY')
0.6 ('EDAFPPSSPLFAEPY', 'EPGTPPSSPLSAEQL')
0.6 ('RSRSRSRSRSPGRPA', 'RSSSRERSRSRGSKS')
0.6 ('WNPFPDFTPQKFKEK', 'WHQTPDFTPTKYKER')
0.6 ('RRRSYSRSRSHSRSR', 'SPRRRRRSRSRSRSR')
0.6666666666666666 ('RKKKPYGTISHGVVE', 'LKKKQYTSIHHGVVE')
0.6 ('STETRSSSSESSHSS', 'ETRTRTSSSCSSYSY')
0.9333333333333333 ('GGLIEPDTPGRVPLD', 'GGLIEPDTPGRVSLD')
0.8666666666666667 ('RDFNSYGSRRGNDAI', 'REFNSYGSRRGNDAV')
0.9333333333333333 ('AMNREVSSLKSKLRR', 'AMNREVSSLKNKLRR')
0.8 ('MQVVRKTTSPEGEVV', 'IQVVKKTTTPEGEVV')
0.8 ('GSGEEPATPSRKILD', 'GQGEKSATPSRKILD')
0.6 ('GVPVRTFTHEVVTLW', 'GSPNRAYTHQVVTRW')
0.9333333333333333 ('REFNSYGSRRGNDAI', 'REFNSYGSRRGNDAV')
0.8666666666666667 ('KPFPAPQTPGRLQPA', 'KPFPAPQTPGRLQAL')
0.6 ('APDGLLASPDLGLLK', 'KNSDLLTSPDVGLLK')
0.6 ('PGVVMASSPALPTQP', 'PGVGTADSPAAPTDS')
0.9333333333333333 ('GVRRRRLSNVSLTGL', 'GVRRRRLSNVSLTGV')
0.6 ('QNEFAGFSYTNPEFV', 'QSEFEGFSFVNSEFL')
0.8 ('PRHSIYSSDEDDEDI', 'PRHSIYSSDDDEEDV')
0.6 ('PEGSPSKSPSKKKK

In [9]:
# De-duplicate the train-side (position 0) and test-side (position 1) motifs
# of every flagged pair.
sim_train = list({pair[0] for pair in similar_motifs})
sim_test = list({pair[1] for pair in similar_motifs})

print(len(sim_train), len(sim_test), sep='\n')

271
218


In [10]:
# Which flagged pair to inspect.
MY_IDX = 16

# Each entry of similar_motifs is [train_motif, test_motif, score].
t1, t2 = similar_motifs[MY_IDX][:2]

# Position of each motif in the full `motifs` array (first match).
i1 = np.where(motifs == t1)[0][0]
i2 = np.where(motifs == t2)[0][0]
print(i1, i2)

# Column indices where the motif's row of the motif-by-family matrix is 1.
fi1 = np.where(motifxFamMatrix[i1] == 1)[0]
fi2 = np.where(motifxFamMatrix[i2] == 1)[0]
print(fi1, fi2, sep='\n')

print(motifxFamMatrix[i1], motifxFamMatrix[i2], sep='\n')

# Family names selected by those column indices.
print(fams[fi1], fams[fi2], sep='\n')

5256 6192
[2 3 7]
[7]
[0 0 1 1 0 0 0 1]
[0 0 0 0 0 0 0 1]
['CDK' 'MAPK' 'PIKK']
['PIKK']


## Define the training set indices to remove, then save the filtered train/test sets

In [12]:
# Indices into `motifs` of every motif that was flagged as too similar to a
# test motif (i.e. appears in sim_train).
#
# np.isin is used instead of a per-motif np.where(...)[0][0] loop because it:
#   * catches every occurrence of a motif, not just the first one,
#   * returns an (empty) integer array when sim_train is empty, rather than a
#     float64 array that cannot be used for indexing,
#   * gives ascending indices regardless of the arbitrary order of sim_train.
idc_to_remove = np.where(
    np.isin(motifs, np.asarray(sim_train, dtype=motifs.dtype))
)[0]

In [13]:
# Drop the flagged indices from the training split. A set gives constant-time
# membership tests instead of scanning the index array once per element.
removed = set(np.asarray(idc_to_remove).tolist())
idc_to_keep = [x for x in X_train if x not in removed]
X_train = idc_to_keep

# Re-slice motifs and family labels with the filtered training indices;
# the test split is unchanged.
train_motifs = motifs[X_train]
test_motifs = motifs[X_test]

train_motifxFamMatrix = motifxFamMatrix[X_train]
test_motifxFamMatrix = motifxFamMatrix[X_test]

data_dir = "data__for_nerds/"

# Write each array as a header-less, index-less CSV.
for filename, array, dtype in (
    ('train_motifs.csv', train_motifs, 'U'),
    ('test_motifs.csv', test_motifs, 'U'),
    ('train_motifxFamMatrix.csv', train_motifxFamMatrix, int),
    ('test_motifxFamMatrix.csv', test_motifxFamMatrix, int),
):
    df = pd.DataFrame(array, dtype=dtype)
    # header/index are documented as booleans; False is the explicit form of
    # the previous None.
    df.to_csv(data_dir + filename, header=False, index=False)