In [None]:
from sklearn.cluster import DBSCAN
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import os

In [None]:
# Load the result table; the first CSV column becomes the index.
# keep_default_na=False stops pandas from converting strings such as 'N/A'
# into NaN, so they remain comparable as plain text later in the notebook.
df_result = pd.read_csv(
    'df_result.csv',
    index_col=0,
    keep_default_na=False,
)

In [None]:
### Extract each variant ###

def return_species_names_list_in_DB_cluster(df):
    """Count the rows of ``df`` per ``species_name`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``species_name`` column.

    Returns
    -------
    pandas.Series
        Number of rows for each species, indexed by species name and ordered
        from the most to the least frequent (``value_counts`` default).
    """
    return df['species_name'].value_counts()

def collect_major_species_from_cluster(df, label_name, n_threshhold = 100):
    """Summarise the species composition of every sufficiently large DBSCAN cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``species_name`` column and the cluster-label column
        named by ``label_name``.
    label_name : str
        Name of the column holding the DBSCAN labels. Label -1 marks outliers
        and is never reported as a cluster.
    n_threshhold : int, default 100
        Only clusters with strictly more than this many rows are kept.

    Returns
    -------
    species_name_in_DB_cluster_pd : pandas.DataFrame
        One row per species and one column per kept cluster (``cluster_1``,
        ``cluster_2``, ... following the sorted label order of ``groupby``).
        Each value is the share of the cluster's rows belonging to that
        species, rounded to 3 decimals; 0 where the species is absent.
    cluster_num_dict : pandas.DataFrame
        One row per kept cluster with the columns ``index`` (the
        ``cluster_<k>`` name), ``major_species_in_cluster`` (species with the
        highest share), ``<label_name>`` (original DBSCAN label) and ``n``
        (cluster size).
    """
    # Size of every label group, then keep only the groups above the threshold.
    cluster_sizes = df.groupby(label_name).size()
    cluster_sizes = cluster_sizes[cluster_sizes > n_threshhold]

    # Remove the outlier label (-1) by value. Skipping "the first entry"
    # instead would discard a real cluster whenever there are no outliers or
    # the outlier group itself is at or below the threshold.
    # to_numeric lets this also match labels that were read as text ('-1').
    is_outlier = pd.to_numeric(cluster_sizes.index, errors='coerce') == -1
    cluster_sizes = cluster_sizes[~is_outlier]

    ratio_columns = []
    for i, label in enumerate(cluster_sizes.index):
        # Number of rows per species inside this cluster ...
        species_counts = df.loc[df[label_name] == label, 'species_name'].value_counts()
        # ... converted to a share of the cluster and named after the cluster.
        species_ratio = round(species_counts / species_counts.sum(), 3)
        ratio_columns.append(species_ratio.rename(f'cluster_{i + 1}'))

    # Bind all clusters side by side in one go; a species missing from a
    # cluster yields NaN there, which is turned into a share of 0.
    if ratio_columns:
        species_name_in_DB_cluster_pd = pd.concat(ratio_columns, axis = 1)
    else:
        species_name_in_DB_cluster_pd = pd.DataFrame()
    species_name_in_DB_cluster_pd = species_name_in_DB_cluster_pd.fillna(0)

    # Species with the highest share in each cluster (indexed by cluster name).
    major_species_dict = species_name_in_DB_cluster_pd.idxmax().rename('major_species_in_cluster')

    # Cluster sizes as a two-column frame: original label and 'n'. The reset
    # gives it a 0..k-1 index that lines up row by row with the frame below.
    species_size_of_major = cluster_sizes.rename('n').reset_index()

    # One row per cluster: cluster name, major species, original label, size.
    cluster_num_dict = pd.concat([major_species_dict.reset_index(), species_size_of_major], axis = 1)

    return species_name_in_DB_cluster_pd, cluster_num_dict

In [None]:
# Summarise the DBSCAN clusters stored in each label column of df_result.
# Every call returns (species-ratio table per cluster, one-row-per-cluster summary).
species_name_in_DB_cluster_pd_aa, cluster_num_dict_aa = collect_major_species_from_cluster(
    df_result, label_name='aa_properties_dbscan'
)

species_name_in_DB_cluster_pd_one_hot, cluster_num_dict_one_hot = collect_major_species_from_cluster(
    df_result, label_name='one_hot_dbscan'
)

species_name_in_DB_cluster_pd_kmer, cluster_num_dict_kmer = collect_major_species_from_cluster(
    df_result, label_name='kmer_dbscan'
)

In [None]:
# Attach each per-cluster summary to the rows of df_result through the shared
# DBSCAN label column. The merge is an inner join (pandas default), so rows
# whose label has no entry in the summary are left out. The summary's 'index'
# column is renamed to 'cluster_no', and every merged table is written to CSV.
df_result_aa = (
    cluster_num_dict_aa
    .merge(df_result, on='aa_properties_dbscan')
    .rename(columns={'index': 'cluster_no'})
)
df_result_aa.to_csv('df_result_aa.csv')


df_result_one_hot = (
    cluster_num_dict_one_hot
    .merge(df_result, on='one_hot_dbscan')
    .rename(columns={'index': 'cluster_no'})
)
df_result_one_hot.to_csv('df_result_one_hot.csv')


df_result_kmer = (
    cluster_num_dict_kmer
    .merge(df_result, on='kmer_dbscan')
    .rename(columns={'index': 'cluster_no'})
)
df_result_kmer.to_csv('df_result_kmer.csv')

In [None]:
def rename_species_for_label(species_name: str):
    """Return a short label for a species name.

    'other' becomes 'Ot', 'N/A' becomes 'NA', and any other name is cut down
    to its first two characters.
    """
    if species_name == 'other':
        return 'Ot'
    if species_name == 'N/A':
        return 'NA'
    # Every remaining name keeps only its first two characters.
    return species_name[:2]

In [None]:
def plot_tSNE_with_cluster_no(df, species_name, x_axis, y_axis):
    """Draw the t-SNE map of one species together with the clusters it dominates.

    A new 12 x 8 inch figure is created and left as the current pyplot figure,
    so the caller can store it with ``plt.savefig`` and release it with
    ``plt.close``. Three layers are drawn, in this order:

    * magenta - every row of the clusters whose ``major_species_in_cluster``
      equals ``species_name``; each cluster is annotated with its
      ``cluster_no`` at the mean position of its points,
    * cyan - every row whose ``species_name`` equals ``species_name``,
    * orange - the points found in both selections (matched on coordinates).

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns ``species_name``, ``major_species_in_cluster``,
        ``cluster_no`` and the two coordinate columns.
    species_name : str
        Species to highlight; also used as the figure title.
    x_axis, y_axis : str
        Names of the columns holding the t-SNE x and y coordinates.
    """
    # Rows of the requested species, wherever they were clustered.
    species_name_data = df[df['species_name'] == species_name]
    # Rows of the clusters in which the requested species has the highest share.
    major_species_name_data = df[df['major_species_in_cluster'] == species_name]
    # Points present in both selections. Only the coordinates are plotted, so
    # only those two columns are merged instead of the full frames.
    merged_pd = pd.merge(
        species_name_data[[x_axis, y_axis]],
        major_species_name_data[[x_axis, y_axis]],
        on=[x_axis, y_axis],
        how="inner",
    )

    fig, ax = plt.subplots(figsize=(12, 8))

    for key, grp in major_species_name_data.groupby('cluster_no'):
        ax.scatter(grp[x_axis], grp[y_axis], label = key, s = .5, c = "m", alpha = 1)
        # Place the label at the cluster's mean position. The mean is taken on
        # the two coordinate columns only: averaging the whole frame would
        # include the string columns, which raises TypeError on pandas >= 2.0.
        # Plain floats are passed because ax.text expects scalar coordinates.
        ax.text(float(grp[x_axis].mean()), float(grp[y_axis].mean()), f"{key}", fontsize = 14)

    ax.scatter(species_name_data[x_axis], species_name_data[y_axis], s = .5, c = "c", alpha = 1)
    ax.scatter(merged_pd[x_axis], merged_pd[y_axis], s = .5, c = "orange", alpha = 1)

    ax.set_xlabel('t-SNE x axis', fontsize = 20, fontweight = 'bold')
    ax.set_ylabel('t-SNE y axis', fontsize = 20, fontweight = 'bold')
    ax.set_title(species_name, fontsize = 24)
    # Fixed limits keep the figures of different species directly comparable.
    ax.set_xlim(-70, 70)
    ax.set_ylim(-70, 70)
    ax.tick_params(axis = 'both', labelsize = 18)

In [None]:
# Fig. 5
# For each of the three merged tables, draw one t-SNE figure per species and
# store it as a PNG inside a dedicated sub-directory of the working directory.

current_dir = os.getcwd()

# (output directory, merged table, t-SNE x column, t-SNE y column, file-name suffix)
fig5_jobs = [
    ('each_cluster_aa', df_result_aa,
     'tsne_x_axis_volume_hydro', 'tsne_y_axis_volume_hydro', 'aa'),
    ('each_cluster_one_hot', df_result_one_hot,
     'tsne_x_axis_seq_datas', 'tsne_y_axis_seq_datas', 'one_hot'),
    ('each_cluster_kmer', df_result_kmer,
     'tsne_x_axis_data_pattern', 'tsne_y_axis_data_pattern', 'kmer'),
]

for fig5_dir_name, fig5_table, fig5_x, fig5_y, fig5_suffix in fig5_jobs:
    # Write through explicit paths instead of os.chdir: a failure half-way
    # then cannot leave the kernel in the wrong working directory.
    # exist_ok=True makes the cell safe to re-run.
    fig5_dir = os.path.join(current_dir, fig5_dir_name)
    os.makedirs(fig5_dir, exist_ok=True)

    # NOTE(review): rename_species_for_label keeps only the first two
    # characters of a name, so two species sharing that prefix would write to
    # the same file - confirm the species names are unique at that length.
    for i in df_result['species_name'].unique():
        plot_tSNE_with_cluster_no(fig5_table, i, fig5_x, fig5_y)
        plt.savefig(
            os.path.join(fig5_dir, f't_SNE_{rename_species_for_label(i)}_with_cluster_no_{fig5_suffix}.png'),
            dpi = 300,
        )
        # Release the figure; otherwise one open figure per species and table
        # accumulates in memory.
        plt.close()

In [None]:
# Table S2: write the per-cluster species-ratio table of each DBSCAN labelling to CSV.
table_s2_outputs = {
    'Table_S2_species_in_the_DBSNCANed_cluster_aa.csv': species_name_in_DB_cluster_pd_aa,
    'Table_S2_species_in_the_DBSNCANed_cluster_one_hot.csv': species_name_in_DB_cluster_pd_one_hot,
    'Table_S2_species_in_the_DBSNCANed_cluster_kmer.csv': species_name_in_DB_cluster_pd_kmer,
}
for table_s2_path, table_s2_frame in table_s2_outputs.items():
    table_s2_frame.to_csv(table_s2_path)