<a href="https://colab.research.google.com/github/kxk302/Covid_Clustering/blob/batch/Covid_Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/gdrive so the data and results folders
# under MyDrive are reachable from this notebook.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime - confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!ls '/content/gdrive/MyDrive/Colab Notebooks/Clustering/'

batch_data  batch_results  boston_data	boston_results	uk_data  uk_results


In [5]:
import ast
import bokeh.models as bmo
import numpy as np
import os
import pandas as pd

from bokeh.palettes import d3
from bokeh.plotting import figure, output_file, show, ColumnDataSource
from matplotlib import pyplot as plt
from scipy.stats import entropy
from scipy.stats import gaussian_kde
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn import metrics

# Do not truncate columns when a DataFrame is printed.
pd.set_option('display.max_columns', None)

# When calculating the distance between 2 probability densities, a
# probability value of 0 (or a very small one) makes the distance
# infinite, which breaks the DBSCAN algorithm. Infinite distances are
# therefore replaced with this large finite number.
max_distance = 200.0

# Grid of 100 evenly spaced x axis values on [0, 1], used for
# calculating/plotting the KDE of sample AF.
x_idx = np.linspace(0.0, 1.0, 100).tolist()

def get_kde_values(row):
  """Fit a Gaussian KDE to row['AF'] and return it evaluated at every point of x_idx, as a list."""
  density = gaussian_kde(row['AF'])
  evaluated = density.evaluate(x_idx)
  return evaluated.tolist()

def get_kl_div(x, y):
  """Return scipy.stats.entropy(x, y): the Kullback-Leibler divergence of distribution x relative to y."""
  kl_divergence = entropy(x, y)
  return kl_divergence

def preprocess(file_name, sep="\t", num_samples_per_batch=100):
  """Load a batch file and build one KDE curve per batch.

  Steps:
    1. Read the 'Batch', 'AF' and 'Sample' columns of `file_name`.
    2. Drop rows whose AF is greater than 1.0.
    3. Drop batches with fewer than `num_samples_per_batch` rows left after
       removing duplicate (Batch, AF, Sample) rows.
    4. Pivot so that every batch has a single row holding the list of its AFs.
    5. Drop batches with two or fewer AF values and compute the KDE values
       of the remaining ones with get_kde_values().

  Args:
    file_name: Path of the delimited input file.
    sep: Column separator of the input file.
    num_samples_per_batch: Minimum per-batch count required to keep a batch.

  Returns:
    DataFrame indexed by 'Batch' with columns 'AF' (list of AF values) and
    'KDE_vals' (list returned by get_kde_values for that batch).
  """

  # Read the input file. Select only the needed columns.
  # `sep` must be passed by keyword: since pandas 2.0 every read_csv argument
  # after the path is keyword-only, so a positional separator raises TypeError.
  df_in = pd.read_csv(file_name, sep=sep)[['Batch', 'AF', 'Sample']]

  # Batch stats
  print('\n')
  print('Number of unique batches: {}'.format(df_in['Batch'].nunique()))
  print('Batch minimum: {}'.format(df_in['Batch'].min()))
  print('Batch maximum: {}'.format(df_in['Batch'].max()))

  # af stats
  print('\n')
  print('Number of unique af {}'.format(df_in['AF'].nunique()))
  print('af minimum: {}'.format(df_in['AF'].min()))
  print('af maximum: {}'.format(df_in['AF'].max()))

  # Clean up data by removing rows where af is greater than 1.0
  print('\n')
  print('Removing rows with AF greater than 1.0')
  df_af = df_in[ df_in.AF <= 1.00 ][['Batch', 'AF']]

  # Clean up data by removing batches that have fewer than num_samples_per_batch samples.
  # The count is taken on the de-duplicated, unfiltered frame (rows with AF > 1.0 included).
  df_in_no_dup = df_in.drop_duplicates()
  count = df_in_no_dup[['Batch', 'Sample']].groupby('Batch').count()
  count = count[ count['Sample'] >= num_samples_per_batch]
  print('count.shape: {}'.format(count.shape))
  print('df-af.shape: {}'.format(df_af.shape))
  df_af = df_af[ df_af['Batch'].isin(count.index) ]
  print('df_af.shape: {}'.format(df_af.shape))

  # Pivot the data frame: one row per batch, 'AF' holds the list of its values
  df_piv = pd.pivot_table(df_af, index='Batch', values='AF', aggfunc=list)
  print('df_piv.head(5)')
  print(df_piv.head(5))

  # Clean up data by removing rows where af list has only one or two element
  # KDE calculation errors out for those.
  # .copy() makes the filtered frame independent of df_piv, so adding the
  # KDE column below does not trigger pandas' SettingWithCopyWarning.
  df_piv_clean = df_piv[ df_piv.AF.str.len() > 2].copy()

  # Calculate the KDE values of every remaining batch
  df_piv_clean['KDE_vals'] = df_piv_clean.apply(get_kde_values, axis=1)

  print('df_piv_clean.head(5)')
  print(df_piv_clean.head(5))

  return df_piv_clean

# eps: 
#   The maximum distance between two samples for one to be considered as in the neighborhood of the other. This is the most 
#   important DBSCAN parameter to choose appropriately for your data set and distance function.
# min_samples: 
#   The number of samples in a neighborhood for a point to be considered as a core point. This includes the point itself.
# metric: 
#   The metric to use when calculating distance between instances in a feature array. 
# metric_params: 
#  Additional keyword arguments for the metric function.

def dbscan_clustering(file_name, sep='\t', eps=0.5, min_samples=5, metric='euclidean', metric_params=None, distances_file_name=None, n_jobs=1, num_samples_per_batch=100):
  """Cluster the batches of `file_name` with DBSCAN.

  The file is first run through preprocess(). When `metric` is 'precomputed'
  the pairwise distance matrix is read from `distances_file_name`; otherwise
  DBSCAN is fitted directly on the per-batch KDE values.

  Args:
    file_name: Path of the delimited input file.
    sep: Column separator used for the input file and the distances file.
    eps: DBSCAN neighborhood radius.
    min_samples: DBSCAN minimum neighborhood size for a core point.
    metric: DBSCAN metric, or 'precomputed' to use a distance matrix file.
    metric_params: Additional keyword arguments for the metric function.
    distances_file_name: Path of the precomputed distance matrix. Required
      when `metric` is 'precomputed'.
    n_jobs: Number of parallel jobs handed to DBSCAN.
    num_samples_per_batch: Forwarded to preprocess().

  Returns:
    The preprocessed DataFrame with an additional 'Labels' column holding the
    DBSCAN cluster label of every batch (-1 marks noise).

  Raises:
    ValueError: If `metric` is 'precomputed' and `distances_file_name` is
      missing, or the distance matrix shape does not match the number of
      preprocessed batches.
  """
  df_piv_clean = preprocess(file_name, sep, num_samples_per_batch)

  if metric == 'precomputed':
    if distances_file_name is None:
      raise ValueError("distances_file_name is required when metric is 'precomputed'")

    distances = pd.read_csv(distances_file_name, sep=sep, index_col=0)

    # The matrix must have been computed from the same set of preprocessed
    # batches. Fail early with a clear message rather than with a length
    # mismatch when the labels are attached further down.
    num_batches = df_piv_clean.shape[0]
    if distances.shape != (num_batches, num_batches):
      raise ValueError(
          'Distance matrix shape {} does not match the {} preprocessed batches; '
          'recompute the distance matrix for this input'.format(distances.shape, num_batches))

    # Replace infinity values in the distance matrix with a large value
    features = distances.replace([np.inf], max_distance)
  else:
    # Cluster directly on the KDE values of every batch
    features = df_piv_clean.KDE_vals.tolist()

  # Run DBSCAN clustering algorithm
  db = DBSCAN(eps=eps, min_samples=min_samples, metric=metric, metric_params=metric_params, n_jobs=n_jobs).fit(features)

  labels = db.labels_

  # Number of clusters in labels, ignoring noise if present.
  n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
  n_noise_ = list(labels).count(-1)

  print('\n')
  print('Number of clusters: {}'.format(n_clusters_))
  print('Cluster labels: {}'.format(set(labels)))
  print('Number of noise samples: {}'.format(n_noise_))

  # Add the cluster labels to the dataframe
  df_piv_clean['Labels'] = labels

  print('df_piv_clean.head(5)')
  print(df_piv_clean.head(5))

  return df_piv_clean

def get_distance_matrix(df_in):
  """Compute pairwise distances between the rows' KDE values and pick the most central row.

  The distance of a pair (i, j) with i < j is scipy's entropy(KDE_i, KDE_j);
  the same value is mirrored to (j, i), so the matrix is symmetric with a
  zero diagonal. NaN distances are replaced with 0.

  Args:
    df_in: DataFrame with a 'KDE_vals' column holding one sequence per row.

  Returns:
    A tuple (distances, center):
      distances: square DataFrame of pairwise distances (positional labels).
      center: the row of `df_in` whose column of distances has the smallest sum.
    For a None or empty `df_in` the tuple is (df_in, None), so callers can
    always unpack two values.
  """
  if df_in is None or df_in.shape[0] == 0:
    return df_in, None

  # Pull the KDE vectors out once instead of doing a DataFrame lookup for
  # every pair inside the quadratic loop.
  kde_vals = df_in['KDE_vals'].tolist()
  row_count = len(kde_vals)
  distances = np.zeros((row_count, row_count))

  for idx1 in range(row_count - 1):
    for idx2 in range(idx1 + 1, row_count):
      # Only one direction is evaluated; it is mirrored to keep the matrix symmetric
      distances[idx1, idx2] = distances[idx2, idx1] = entropy(kde_vals[idx1], kde_vals[idx2])

  df_out = pd.DataFrame(distances).fillna(0.00)

  # The representative row is the one with the smallest total distance to all others
  argmin = df_out.sum(axis=0).argmin()
  return df_out, df_in.iloc[argmin]

def plot_clusters(df_in, folder):
  """Plot the representative batch of every cluster and save the figure as a PNG.

  For each distinct value of the 'Labels' column the representative row is
  obtained from get_distance_matrix(); its AF histogram is drawn in the left
  column and its KDE curve (over x_idx) in the right column. Cluster labels
  are used as subplot row numbers; noise (-1) is drawn in the last row.

  Args:
    df_in: DataFrame with 'AF', 'KDE_vals' and 'Labels' columns.
    folder: Existing folder the image 'dbscan_<number of labels>.png' is written to.

  Returns:
    `df_in` unchanged when it is None or empty, otherwise None.
  """
  if df_in is None or df_in.shape[0] == 0:
    return df_in

  num_labels = df_in['Labels'].nunique()
  print('num_labels: {}'.format(num_labels))

  labels = df_in['Labels'].unique()
  print('labels: {}'.format(labels))

  # squeeze=False keeps `axs` two-dimensional even when there is a single
  # label, so axs[idx][col] works for any number of rows.
  fig, axs = plt.subplots(num_labels, 2, squeeze=False, gridspec_kw={'hspace': 1.0, 'wspace': 0.5}, figsize=(15, 15))

  for label in labels:
    print('Label processed: {}'.format(label))

    # Row of the subplot grid: the label itself, except noise (-1) which goes last
    idx = label if label != -1 else num_labels - 1

    df_lbl = df_in[ df_in.Labels == label ]
    cluster_size = df_lbl.shape[0]

    distances, cluster_center = get_distance_matrix(df_lbl)
    print('Cluster center for label ' + str(label))
    print(cluster_center)

    # Histogram. The Series is indexed by column name: integer keys on a
    # string-labelled Series are deprecated positional access in pandas 2.x.
    axs[idx][0].hist(cluster_center['AF'], density=True)
    axs[idx][0].title.set_text('Cluster ' + str(label) + ' (size ' + str(cluster_size) + ') AF histogram')

    # KDE
    axs[idx][1].plot(x_idx, cluster_center['KDE_vals'])
    axs[idx][1].title.set_text('Cluster ' + str(label) + ' (size ' + str(cluster_size) + ') AF density estimate')

  fig.savefig(folder + '/dbscan_' + str(num_labels) + '.png')
  # The saved PNG is the output. Closing the figure keeps repeated calls
  # (e.g. a parameter sweep) from accumulating open figures in memory.
  plt.close(fig)

def get_cluster_batches(df_in, sep, folder, file_name):
  """Write one file per cluster with its batches, and one with their samples.

  For every distinct value of the 'Labels' column two tab-separated files
  are written to `folder`:
    cluster_<label>.tsv          rows of `df_in` carrying that label
    cluster_<label>_samples.tsv  unique (Batch, Sample) pairs of `file_name`
                                 whose Batch is in the index of those rows

  Args:
    df_in: Clustered DataFrame indexed by batch, with a 'Labels' column.
    sep: Column separator of `file_name`.
    folder: Existing folder the output files are written to.
    file_name: Path of the original input file with 'Batch' and 'Sample' columns.

  Returns:
    `df_in` unchanged when it is None or empty, otherwise None.
  """
  if df_in is None or df_in.shape[0] == 0:
    return df_in

  # Read the input file. Select only the needed columns.
  # `sep` must be passed by keyword: since pandas 2.0 every read_csv argument
  # after the path is keyword-only, so a positional separator raises TypeError.
  batch_sample_pairs = pd.read_csv(file_name, sep=sep)[['Batch', 'Sample']].drop_duplicates()

  labels = df_in['Labels'].unique()
  print('labels: {}'.format(labels))

  for label in labels:
    df_lbl = df_in[ df_in.Labels == label ]
    df_lbl.to_csv(folder + '/cluster_' + str(label) + '.tsv', sep='\t')

    # Get batch samples and save them
    batch_samples = batch_sample_pairs[batch_sample_pairs['Batch'].isin(df_lbl.index)]
    batch_samples.to_csv(folder + '/cluster_' + str(label) + '_samples.tsv', sep='\t')


In [12]:
# Run DBSCAN clustering algorithm on batch dataset
#
# Using 0.0075 for eps (epsilon) to yield ? clusters
# Using default value of 5 for min_samples 
# Using get_kl_div method for metric. get_kl_div() calculates Kullback-Leibler divergence, 
#     which measures the distance between 2 probability distributions
# Using None for metric_params, as the metric has no parameters
#

# 1. Cleaned the data (Removed rows with AF > 1.0)
# 2. Pivoted the data so all AFs of a batch are listed on one line
# 3. Calculated Kernel Density Estimates (KDE) of AFs of each batch
#      Evaluated them on 100 data points in range of 0.0 to 1.0
# 4. Ran DBSCAN clustering algorithm
#      epsilon: 0.0075
#      Used Kullback-Leibler (KL) div. to calculate distance between density estimates
#      metric: 'precomputed'. See note below
# 5. DBSCAN produced ? clusters
#      Data points not assigned to any cluster marked as Noise (or cluster -1)
# 6. For each cluster, found a representative batch
#      Calculated KL div. between every pair of batches in a cluster
#      Selected batch with the smallest sum of distances


# Calculate the distance matrix. We run the code below just once, and save the 
# distance matrix to file. We pass the distance matrix file to DBSCAN. That way, 
# if we modify DBSCAN parameters (say, eps or min_samples), we avoid calculating 
# the distance matrix repeatedly. Must set metric to 'precomputed'.
'''
df = preprocess('/content/gdrive/MyDrive/Colab Notebooks/Clustering/batch_data/batch.tsv', sep='\t')
distances, _ = get_distance_matrix(df)
distances.to_csv('/content/gdrive/MyDrive/Colab Notebooks/Clustering/batch_data/distances_batch.tsv', sep='\t')
'''
def dbscan_clustering_wrapper(eps=0.0085,
                              min_samples=7,
                              path = '/content/gdrive/MyDrive/Colab Notebooks/Clustering/',
                              data_file='batch.tsv',
                              sep='\t',
                              data_folder = 'batch_data',
                              results_folder = 'batch_results',
                              metric='precomputed',
                              num_samples_per_batch=100):
  """Run one DBSCAN configuration end to end and save all of its outputs.

  Results are written to <path>/<results_folder>/<min_samples>_<eps>/:
  the table of all labelled batches, the cluster plot and the per-cluster
  batch/sample files. The input is <path>/<data_folder>/<data_file> and the
  distance matrix is expected at <path>/<data_folder>/distances_<data_file>.

  Args:
    eps: DBSCAN neighborhood radius.
    min_samples: DBSCAN minimum neighborhood size for a core point.
    path: Root folder containing the data and results folders.
    data_file: Name of the input file inside `data_folder`.
    sep: Column separator of the input, distances and result files.
    data_folder: Folder (under `path`) holding the input files.
    results_folder: Folder (under `path`) receiving the results.
    metric: Metric handed to dbscan_clustering().
    num_samples_per_batch: Minimum batch size handed to dbscan_clustering().
  """
  folder = str(min_samples) + '_' + str(eps)
  print('folder: {}'.format(folder))
  full_results_folder = os.path.join(path, results_folder, folder)
  print('full_path: {}'.format(full_results_folder))
  # makedirs(exist_ok=True) lets the same configuration be re-run (os.mkdir
  # raised FileExistsError) and also creates a missing results folder.
  os.makedirs(full_results_folder, exist_ok=True)

  full_data_folder = os.path.join(path, data_folder)
  data_path = full_data_folder + '/' + data_file

  df = dbscan_clustering(file_name=data_path,
                         sep=sep,
                         eps=eps,
                         min_samples=min_samples,
                         metric=metric,
                         metric_params=None,
                         distances_file_name=full_data_folder + '/distances_' + data_file,
                         num_samples_per_batch=num_samples_per_batch)

  df.to_csv(full_results_folder + '/all_clusters_eps_' + str(eps) + '_min_samples_' + str(min_samples) + '.tsv', sep=sep)
  plot_clusters(df, folder=full_results_folder)
  get_cluster_batches(df_in=df, sep=sep, folder=full_results_folder, file_name=data_path)


In [None]:
# Sweep a grid of (min_samples, eps) settings: for every combination, run
# DBSCAN on 'data_file' in 'data_folder' and save the results to 'results_folder'.
min_samples_grid = [2, 3, 4, 5, 6, 7]
eps_grid = [0.0200, 0.0225, 0.0250, 0.0275, 0.0300]

# Arguments that stay the same for every run of the sweep
shared_args = dict(path='/content/gdrive/MyDrive/Colab Notebooks/Clustering/',
                   data_file='batch.tsv',
                   sep='\t',
                   data_folder='batch_data',
                   results_folder='batch_results',
                   metric='precomputed',
                   num_samples_per_batch=100)

for min_samples in min_samples_grid:
  for eps in eps_grid:
    print('\n\n\nRunning DBSCAN for min_samples: {}, eps: {}'.format(min_samples, eps))
    dbscan_clustering_wrapper(eps=eps, min_samples=min_samples, **shared_args)

In [None]:
# Histogram for number of samples in a batch
# (A stray bare `num_samples_per_batch` used to precede this comment; the name
# is not defined at module level, so the cell raised NameError.)

# Read the input file. Select only the needed columns.
df = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/Clustering/batch_data/batch.tsv', sep='\t')[['Batch','Sample']]
df_no_dup = df.drop_duplicates()
print(df_no_dup.head())
print(df_no_dup.shape)
# Spot check of a single batch
print(df_no_dup[df_no_dup.Batch=='b35b2b63194dbf47'])
print(df_no_dup[df_no_dup.Batch=='b35b2b63194dbf47'].shape)

# Number of unique samples per batch
count = df_no_dup.groupby('Batch').count()
print(type(count))
print(count.head())
print(count['Sample'].values)
plt.hist(count['Sample'].values, bins=15)

# Disabled experiment, kept for reference as comments (it used to be a bare
# string literal, which the notebook echoed as the cell's output):
# estonia_samples = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/Clustering/batch_data/Estonia_samples.csv', sep=',', names=['Sample'])
# print(estonia_samples.head())
# print(estonia_samples.shape)
#
# estonia_batch = pd.merge(df_no_dup, estonia_samples, on='Sample', how='inner')
# print(estonia_batch.head())
# print(estonia_batch.shape)
#
# count = estonia_batch.groupby('Batch').count()
# print(count)
# print(count['Sample'].values)
# plt.hist(count['Sample'].values, bins=20)


