<a href="https://colab.research.google.com/github/kxk302/Covid_Clustering/blob/main/Covid_Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
from scipy.stats import entropy
from scipy.stats import gaussian_kde
from sklearn.cluster import DBSCAN
from sklearn import metrics

def df_stats(df):
  """Print a quick diagnostic summary of ``df`` to stdout.

  Emits a blank-line separator, then the frame's shape, its column index,
  and finally its first five rows. Nothing is returned.
  """
  # Visual separator from whatever was printed before
  print("\n\n")

  # Attribute-backed lines, printed in this fixed order
  labelled_attributes = (
    ("Data frame shape: {}", "shape"),
    ("Data frame columns: {}", "columns"),
  )
  for template, attribute in labelled_attributes:
    print(template.format(getattr(df, attribute)))

  # Preview of the leading rows
  print("Print data frame first 5 rows")
  preview = df.head(5)
  print(preview)

def get_kde_values(row):
  """Return a Gaussian KDE of ``row['AF']`` evaluated on the global grid ``ind``.

  ``ind`` is a module-level name that must be defined before this function
  is called; it supplies the points at which the density is evaluated.
  """
  # Fit the density estimate to this row's AF values
  density = gaussian_kde(row['AF'])
  # Sample the fitted density at the shared evaluation points
  return density.evaluate(ind)

def get_kl_div(x, y):
  """Return the Kullback-Leibler divergence of ``x`` relative to ``y``.

  Delegates to ``scipy.stats.entropy`` with ``x`` as the reference
  distribution (``pk``) and ``y`` as the comparison distribution (``qk``).
  Argument order matters: the divergence is not symmetric in general.
  """
  divergence = entropy(pk=x, qk=y)
  return divergence

# Read the input file. Column names are supplied explicitly, so every line of
# the file (including the first) is read as data.
df = pd.read_csv(
    "./data/data.tsv",
    sep="\t",
    names=[
        "Sample", "Date_1", "Date_2", "UNK_1", "UNK_2", "POS", "REF",
        "ALT", "EFFECT", "CODON", "TRID", "AA", "AF",
    ],
)

# Select only the needed columns. Take an explicit copy so that the column
# assignment below writes to an independent frame instead of to a selection of
# the frame that was read (this avoids pandas' SettingWithCopyWarning).
df = df[["Sample", "AF"]].copy()

# Add a Value column to be used in pivoting
# NOTE(review): the pivot below aggregates only "AF", so this column does not
# reach the pivoted frame -- confirm whether it is still needed.
df["Value"] = 1

df_stats(df)

# Sample stats
print("\n\n")
print("Number of unique samples: {}".format(df["Sample"].nunique()))
print("Sample minimum: {}".format(df["Sample"].min()))
print("Sample maximum: {}".format(df["Sample"].max()))

# af stats
print("\n\n")
print("Number of unique af {}".format(df["AF"].nunique()))
print("af minimum: {}".format(df["AF"].min()))
print("af maximum: {}".format(df["AF"].max()))

# Clean up data by removing rows where af is greater than 1.0
print("\n\n")
print("Rows with AF greater than 1.0")
print(df[df["AF"] > 1.00])
# Keeping rows with AF <= 1.0 also drops rows whose AF is NaN, because a
# comparison against NaN evaluates to False.
df = df[df["AF"] <= 1.00]

df_stats(df)

# Pivot the data frame: one row per Sample, with that sample's AF values
# collected into a list
df = pd.pivot_table(df, index="Sample", values="AF", aggfunc=list)

df_stats(df)

# Clean up data by removing rows where af list has only one element
print("\n\n")
print("Rows with AF list of size 1")
# Length of each sample's AF list, computed once and reused for both the
# report and the filter
af_list_sizes = df["AF"].str.len()
print(df[af_list_sizes == 1])
df = df[af_list_sizes > 1]

df_stats(df)

# Evaluation grid shared by every sample: 100 evenly spaced points on [0, 1]
ind = np.linspace(start=0.00, stop=1.00, num=100)

# Replace each sample's AF list with its KDE evaluated on the grid
# (one get_kde_values call per row; the result is rebound to df)
df = df.apply(get_kde_values, axis="columns")

print('\n\n')
print('KDE values')
print(df)

# Sanity check: divergence between the first two samples' densities
print('\n\n')
print("KL Divergence")
first_density = df.iloc[0]
second_density = df.iloc[1]
print(get_kl_div(first_density, second_density))

# Run DBSCAN clustering algorithm
print("type(df): {}".format(type(df)))
# NOTE(review): KL divergence is not symmetric (KL(x||y) != KL(y||x) in
# general), yet it is used here as DBSCAN's pairwise distance -- confirm this
# is intended. All other DBSCAN parameters are left at their defaults.
density_rows = df.values.tolist()
clusterer = DBSCAN(metric=get_kl_div)
db = clusterer.fit(density_rows)

labels = db.labels_
# Boolean mask that is True exactly at the indices DBSCAN reports as core samples
core_samples_mask = np.zeros(labels.shape, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# Number of clusters in labels, ignoring noise if present.
# The label -1 marks noise, so it is excluded from the cluster count and
# tallied separately.
distinct_labels = set(labels)
n_clusters_ = len(distinct_labels - {-1})
n_noise_ = int((labels == -1).sum())

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)