In [None]:
from sklearn.cluster import DBSCAN
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import os
from tqdm import tqdm

In [None]:
# Load the table that the rest of the notebook clusters and plots. Later cells
# read its tsne_x_axis_* / tsne_y_axis_* columns and its species_name column.
# NOTE(review): with pandas' default NA handling a species_name of 'N/A' is
# parsed as NaN, while rename_species_for_label tests for the string 'N/A' --
# confirm whether keep_default_na=False is wanted here.
df_result = pd.read_csv('after_t-sne.csv')

In [None]:
def label_dbscan(embeded_param, eps = 1.5, min_samples = 30):
    """Cluster embedded points with DBSCAN and return one label per point.

    Parameters
    ----------
    embeded_param : array-like
        The samples to cluster; passed unchanged to ``DBSCAN.fit_predict``.
    eps : float, default 1.5
        Forwarded to ``DBSCAN(eps=...)``. The default is the value that used
        to be hard-coded, so one-argument calls behave as before.
    min_samples : int, default 30
        Forwarded to ``DBSCAN(min_samples=...)``.

    Returns
    -------
    The labels returned by ``DBSCAN.fit_predict`` (scikit-learn marks noise
    samples with ``-1``).
    """
    # eps / min_samples are parameters because the notebook calls this as
    # label_dbscan(data, 1.0, 30); the one-argument signature raised TypeError.
    db = DBSCAN(eps = eps, min_samples = min_samples)
    labels = db.fit_predict(embeded_param)
    return labels

In [None]:
# DBSCAN settings used for all three embeddings: eps = 1.0, min_samples = 30

# aa properties
aa_tsne_columns = ["tsne_x_axis_volume_hydro", "tsne_y_axis_volume_hydro"]
data = df_result[aa_tsne_columns].values.tolist()
aa_properties_labels = label_dbscan(data, 1.0, 30)
df_result['aa_properties_dbscan'] = aa_properties_labels

# one_hot
one_hot_tsne_columns = ['tsne_x_axis_seq_datas', 'tsne_y_axis_seq_datas']
data = df_result[one_hot_tsne_columns].values.tolist()
one_hot_labels = label_dbscan(data, 1.0, 30)
df_result['one_hot_dbscan'] = one_hot_labels

# k-mer
k_mer_tsne_columns = ['tsne_x_axis_data_pattern', 'tsne_y_axis_data_pattern']
data = df_result[k_mer_tsne_columns].values.tolist()
k_mer_labels = label_dbscan(data, 1.0, 30)
df_result['kmer_dbscan'] = k_mer_labels

In [None]:
# save data
# df_result = pd.read_csv('DB_scanned.csv', index_col = 0, keep_default_na=False) to read again
# (index_col = 0 because to_csv below also writes the DataFrame index as the first column)

df_result.to_csv('DB_scanned.csv')

In [None]:
#Fig.3

# Relabel a species name for plotting: 'other' -> 'Ot', 'N/A' -> 'NA',
# anything else -> its first two characters.
def rename_species_for_label(species_name: str):
    """Return the two-character label used for a species in the figure text.

    'other' maps to 'Ot' and 'N/A' maps to 'NA'; every other name is cut to
    its first two characters (shorter names are returned unchanged).
    """
    if species_name == 'other':
        return 'Ot'
    if species_name == 'N/A':
        return 'NA'
    return species_name[0:2]

In [None]:
# describe one cluster by its most frequent species
def plot_text(df, DB_scan_label, x_axis, y_axis):
    """Build the annotation text and centroid for one cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single cluster; needs a 'species_name' column plus the
        ``x_axis`` and ``y_axis`` columns.
    DB_scan_label
        Cluster number shown in the text.
    x_axis, y_axis : str
        Columns whose means give the position of the text.

    Returns
    -------
    list
        ``[text, mean of x_axis, mean of y_axis]`` where ``text`` is
        ``'Cluster <label>\\n<short species name>: <percent>%'``.
    """
    species = df['species_name']
    dominant_species = species.mode()[0]  # first entry of the mode result
    # share of rows belonging to the dominant species, in percent
    share = species.to_list().count(dominant_species) * 100 / len(df)
    centroid_x = df[x_axis].mean()
    centroid_y = df[y_axis].mean()
    caption = f'Cluster {DB_scan_label}\n{rename_species_for_label(dominant_species)}: {round(share, 0)}%'
    return [caption, centroid_x, centroid_y]

In [None]:
def plot_DBscan(df, DB_scan_label, n_threshhold, name, x_axis, y_axis):
    """Scatter-plot the DBSCAN clusters of one embedding and save the figure.

    Every cluster with more than ``n_threshhold`` members is drawn in its own
    colour and annotated at its centroid with the text built by ``plot_text``.
    DBSCAN noise points (label ``-1``) are never drawn.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``DB_scan_label``, ``x_axis`` and ``y_axis`` columns
        and the 'species_name' column read by ``plot_text``.
    DB_scan_label : str
        Column holding the DBSCAN cluster labels.
    n_threshhold : int
        Clusters with this many members or fewer are skipped.
    name : str
        File name of the image written into ``Results/Fig_3``.
    x_axis, y_axis : str
        Columns holding the coordinates to plot.

    Returns
    -------
    None
        The figure is saved to disk and the number of drawn clusters printed.
    """
    noise_label = -1  # scikit-learn's DBSCAN marks noise samples with -1

    plt.figure(figsize = (12, 8))
    plt.xlim(-70, 70)
    plt.ylim(-70, 70)
    plt.xticks(fontsize = 20)
    plt.yticks(fontsize = 20)
    plt.xlabel('t-SNE x axis', fontsize = 20, fontweight = 'bold')
    plt.ylabel('t-SNE y axis', fontsize = 20, fontweight = 'bold')
    # No plt.legend() call: none of the artists carries a label, so a legend
    # would be empty and only trigger matplotlib's "No artists with labels"
    # warning. The clusters are identified by the text drawn at each centroid.
    color_datasets = list(matplotlib.colors.XKCD_COLORS)

    n_size_of_each_DB_label = df.groupby(DB_scan_label).size()
    # Drop noise by its label rather than by position: slicing off the first
    # group discards a real cluster whenever noise is absent or is itself
    # filtered out by the size threshold.
    is_plotted = (n_size_of_each_DB_label > n_threshhold) & (n_size_of_each_DB_label.index != noise_label)
    labels_to_plot = n_size_of_each_DB_label[is_plotted].index

    color_number = 0
    for i, label in enumerate(labels_to_plot):
        df_ = df[df[DB_scan_label] == label]
        text_data = plot_text(df_, i + 1, x_axis, y_axis)

        # wrap around the palette instead of raising IndexError when there are
        # more clusters than colours
        cluster_color = color_datasets[color_number % len(color_datasets)]
        plt.scatter(df_[x_axis], df_[y_axis], s = 1, color = cluster_color)
        # the centroid coordinates are truncated to integers for the text position
        plt.text(
                int(text_data[1]),
                int(text_data[2]),
                str(text_data[0]),
                fontsize = 9
                )
        color_number += 1

    os.makedirs(os.path.join('Results', 'Fig_3'), exist_ok = True)
    print(f'{color_number} clusters were found')
    plt.savefig(os.path.join('Results', 'Fig_3', name), dpi = 300, bbox_inches = 'tight')

In [None]:
# One Fig. 3 panel per embedding:
# (DBSCAN label column, output file name, x column, y column)
fig_3_panels = [
    ('aa_properties_dbscan', 'Fig_3_DBscan_with_species_name_aa.png',
     'tsne_x_axis_volume_hydro', 'tsne_y_axis_volume_hydro'),
    ('one_hot_dbscan', 'Fig_3_DBscan_with_species_name_one_hot.png',
     'tsne_x_axis_seq_datas', 'tsne_y_axis_seq_datas'),
    ('kmer_dbscan', 'Fig_3_DBscan_with_species_name_kmer.png',
     'tsne_x_axis_data_pattern', 'tsne_y_axis_data_pattern'),
]

# clusters need more than 100 members to be drawn
for label_column, file_name, x_column, y_column in fig_3_panels:
    plot_DBscan(df_result, label_column, 100, file_name, x_column, y_column)