In [None]:
import itertools
import os
import time

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from tqdm import tqdm

In [None]:
# Load the pairwise-aligned sequences. keep_default_na=False keeps every cell
# as the literal string found in the CSV (nothing is converted to NaN).
df_result = pd.read_csv('Pairwised_sequences.csv', keep_default_na=False)

# Keep only the sequences that start with methionine ('M') and are exactly
# 1273 characters long.
starts_with_met = df_result['pairwised_sequence'].str.startswith('M')
has_full_length = df_result['pairwised_sequence'].str.len() == 1273

# reset_index gives the surviving rows a gap-free 0..n-1 index, so a later
# label-based lookup such as pairwised_seq[0] cannot raise KeyError when the
# first CSV row was filtered out.
df_result = df_result[starts_with_met & has_full_length].reset_index(drop=True)

In [None]:
## Prepare the inputs used to build the dataset for each approach ##

# Column of aligned sequences (a pandas Series, not a list) that the encoding
# cells below iterate over.
pairwised_seq = df_result["pairwised_sequence"]

## amino acid parameters ##

# Per-symbol property table: one-letter code -> [volume, hydrophilicity].
# Both numbers are relative values. '-' maps to [0, 0]
# (presumably the alignment-gap symbol -- confirm against the input data).
# Key insertion order matters to any code that enumerates this dict.
aa_properties_dict = {
    'A': [-2.90, -1.03],
    'R': [2.41, 1.31],
    'N': [-0.68, 0.79],
    'D': [-0.92, 1.23],
    'C': [-1.89, 0.15],
    'Q': [0.36, 1.09],
    'E': [0.16, 1.28],
    'G': [-4.04, 0.01],
    'H': [0.83, 1.15],
    'I': [0.51, -1.32],
    'L': [0.52, -1.40],
    'K': [0.92, 1.23],
    'M': [0.92, -1.42],
    'F': [2.22, -1.47],
    'P': [-1.25, -0.64],
    'S': [-2.36, 0.38],
    'T': [-1.19, 0.28],
    'W': [4.28, -0.18],
    'Y': [2.75, -0.18],
    'V': [-0.65, -1.27],
    '-': [0, 0],
}

In [None]:
## dataset for amino acid properties ##

# For every sequence, look up the [volume, hydrophilicity] pair of each symbol.
amino_acid_properties = [
    [aa_properties_dict[residue] for residue in sequence]
    for sequence in pairwised_seq
]

# Split the pairs into one volume track and one hydrophilicity track per sequence.
pairwised_volume = [
    [pair[0] for pair in per_sequence] for per_sequence in amino_acid_properties
]
pairwised_hydrophilicity = [
    [pair[1] for pair in per_sequence] for per_sequence in amino_acid_properties
]

# Feature vector per sequence: all volumes followed by all hydrophilicities.
amino_acid_volume_hydrophilicity = [
    volume_track + hydro_track
    for volume_track, hydro_track in zip(pairwised_volume, pairwised_hydrophilicity)
]

In [None]:
## dataset for one-hot encoding ##

# One unit vector per symbol; the position of the 1 is the symbol's position
# in aa_properties_dict.
n_symbols = len(aa_properties_dict)
one_hot_vec_dict = {}
for i, key in enumerate(aa_properties_dict):
    one_hot_vec = np.zeros(n_symbols)  # start from an all-zero vector
    one_hot_vec[i] = 1  # mark this symbol's slot
    one_hot_vec_dict[key] = one_hot_vec

# Encode every sequence as one flat vector of concatenated unit vectors.
one_hot_datas = []
for seq in pairwised_seq:
    seq_data = [one_hot_vec_dict[lit] for lit in seq]
    one_hot_array = np.array(seq_data)  # list of vectors -> 2-D array
    # reshape(-1) flattens using the array's own size, so the result no longer
    # depends on a hard-coded alphabet size of 21.
    one_hot_datas.append(one_hot_array.reshape(-1))

In [None]:
## dataset for k-mer based approach ##

# k = 3: enumerate every possible 3-symbol pattern over the alphabet.
kmer_k = 3
pattern_size = len(aa_properties_dict) ** kmer_k
# .iloc is positional; plain [0] is a label lookup on the Series and raises
# KeyError when the row labelled 0 has been filtered out.
seq_size = len(pairwised_seq.iloc[0])
kmer_size = seq_size - kmer_k + 1  # number of k-mer windows in one sequence

# Same ordering as three nested loops over the dict keys (last symbol varies fastest).
pattern_aminoseq = [
    ''.join(symbols)
    for symbols in itertools.product(aa_properties_dict, repeat=kmer_k)
]

# Assign an ID (position in the count vector) to each pattern.
amino_pattern_dict = {pattern: idx for idx, pattern in enumerate(pattern_aminoseq)}

# k-mer data: one relative-frequency vector of length pattern_size per sequence.
data_pattern = []
for seq in pairwised_seq:
    zeroseq = np.zeros(pattern_size)  # one counter per possible pattern

    # "+ 1" so the final window is counted too; the old range(len(seq) - 3)
    # skipped it although the counts are divided by kmer_size windows.
    for j in range(len(seq) - kmer_k + 1):
        zeroseq[amino_pattern_dict[seq[j:j + kmer_k]]] += 1

    data_pattern.append(zeroseq / kmer_size)  # counts -> appearance rate

In [None]:
## t-sne ##
# Embed each encoding in 2-D with t-SNE and record the runtime of every fit.

# n_iter is deliberately not passed: 1000 iterations is scikit-learn's default,
# and the keyword was renamed to max_iter in scikit-learn 1.5 with the old name
# scheduled for removal, so passing n_iter fails on current releases.
tsne = TSNE(n_components = 2, random_state = 0, perplexity = 30)

# Timing note: all applications other than mi, Anaconda and Jupyter Notebook
# (running in Google Chrome) were closed while the runtimes were measured.

# aa (amino acid properties)

# start measurement of runtime
time_sta_aa = time.perf_counter()
# t-sne start
amino_acid_volume_hydro_tsne = tsne.fit_transform(amino_acid_volume_hydrophilicity)
# finish measurement of runtime
time_end_aa = time.perf_counter()
# elapsed time (s)
tim_aa = time_end_aa - time_sta_aa

df_result['tsne_x_axis_volume_hydro'] = amino_acid_volume_hydro_tsne[:, 0]
df_result['tsne_y_axis_volume_hydro'] = amino_acid_volume_hydro_tsne[:, 1]

# one hot

# start measurement of runtime
time_sta_one_hot = time.perf_counter()
# t-sne start
seq_datas_tsne = tsne.fit_transform(one_hot_datas)
# finish measurement of runtime
time_end_one_hot = time.perf_counter()
# elapsed time (s)
tim_one_hot = time_end_one_hot - time_sta_one_hot

df_result['tsne_x_axis_seq_datas'] = seq_datas_tsne[:, 0]
df_result['tsne_y_axis_seq_datas'] = seq_datas_tsne[:, 1]

# k-mer

# start measurement of runtime
time_sta_kmer = time.perf_counter()
# t-sne start
data_pattern_tsne = tsne.fit_transform(data_pattern)
# finish measurement of runtime
time_end_kmer = time.perf_counter()
# elapsed time (s)
tim_kmer = time_end_kmer - time_sta_kmer

df_result['tsne_x_axis_data_pattern'] = data_pattern_tsne[:, 0]
df_result['tsne_y_axis_data_pattern'] = data_pattern_tsne[:, 1]

In [None]:
## Fig.2, plot the clustering by t-SNE ##

# Order variant labels by a fixed, hand-curated sequence of names.
def sort_by_greek_alphabet(x):
    """Return the labels in ``x`` as a list sorted by the order in ``greek_order``.

    Parameters
    ----------
    x : iterable
        Variant labels (e.g. the unique values of a label column).

    Returns
    -------
    list
        Labels listed in ``greek_order`` come first, in that order. Labels that
        are not listed follow, sorted by their string form; previously such a
        label raised ``ValueError`` from ``list.index``.
    """
    greek_order = ['Alpha', 'Beta', 'Gamma', 'Delta', 'Epsilon', 'Eta', 'Iota', 'Kappa', 'Lambda', 'N/A', 'Omicron', 'Zeta', 'Mu', 'other']
    # Rank table: constant-time lookup instead of list.index for every element.
    rank = {name: position for position, name in enumerate(greek_order)}
    unknown_rank = len(greek_order)  # sorts after every listed label
    return sorted(x, key=lambda label: (rank.get(label, unknown_rank), str(label)))


# Function to draw one scatter plot with a separate colour per variant.
def create_scatter_plot_by_species(df, x_data, y_data):
    """Scatter-plot two columns of ``df`` on a new figure, one colour per label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'species_name' column plus the columns named by
        ``x_data`` and ``y_data``.
    x_data, y_data : str
        Names of the columns used for the x and y coordinates.

    Returns
    -------
    None
        The plot is left as the current pyplot figure so the caller can
        save or show it.
    """
    plt.figure(figsize = (12, 8))
    # plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in
    # Matplotlib 3.7 and removed in 3.9.
    color_datasets = plt.get_cmap("tab10").colors + plt.get_cmap("Dark2").colors

    ordered_labels = sort_by_greek_alphabet(df['species_name'].unique())
    for position, labels in enumerate(ordered_labels):
        # Wrap around the palette rather than silently dropping every label
        # beyond the number of available colours (zip stopped at the palette end).
        rgb = color_datasets[position % len(color_datasets)]
        data = df[df['species_name'] == labels]
        plt.scatter(data[x_data], data[y_data], label = labels, color = rgb, s = .75)

    plt.xticks(fontsize = 20)
    plt.yticks(fontsize = 20)
    plt.xlabel('t-SNE x axis', fontsize = 20, fontweight = 'bold')
    plt.ylabel('t-SNE y axis', fontsize = 20, fontweight = 'bold')
    # Legend is anchored just outside the right edge of the axes.
    plt.legend(fontsize = 16, markerscale = 7, bbox_to_anchor=(1.01, 1))


# Output folder for the Fig. 2 panels (created if missing).
fig2_dir = os.path.join('Results', 'Fig_2')
os.makedirs(fig2_dir, exist_ok=True)

# (x column, y column, output file name) for each encoding, in this order:
# amino acid properties, one-hot encoding, k-mer.
fig2_panels = [
    ('tsne_x_axis_volume_hydro', 'tsne_y_axis_volume_hydro',
     'Fig_2_Clustering_with_aa_properties_by_t-SNE.png'),
    ('tsne_x_axis_seq_datas', 'tsne_y_axis_seq_datas',
     'Fig_2_Clustering_by_one_hot_t-SNE.png'),
    ('tsne_x_axis_data_pattern', 'tsne_y_axis_data_pattern',
     'Fig_2_Clustering_k-mer_by_t-SNE.png'),
]

# Draw each panel, then save the current figure.
for x_column, y_column, file_name in fig2_panels:
    create_scatter_plot_by_species(df_result, x_column, y_column)
    plt.savefig(os.path.join(fig2_dir, file_name), dpi = 300, bbox_inches = 'tight')

In [None]:
# Save the filtered table, including the t-SNE coordinate columns added above,
# as CSV; index = False leaves the row index out of the file.
df_result.to_csv('after_t-sne.csv', index = False)