## Import libraries

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import gensim.downloader as api
import pandas as pd
import numpy as np
import gensim

In [None]:
# Install pinned CUDA 11.8 builds of torch / torchvision / torchaudio,
# resolved from the PyTorch wheel index given with -f.
!pip install torch==2.0.1+cu118 torchvision==0.15.2+cu118 torchaudio==2.0.2+cu118 -f https://download.pytorch.org/whl/torch_stable.html

In [None]:
# Install pinned versions of transformers, scikit-learn and gdown.
!pip install transformers==4.34.0 scikit-learn==1.3.1 gdown==4.7.1


In [None]:
# Upgrade huggingface_hub to the latest release.
# NOTE(review): this upgrade is unpinned while transformers is pinned to
# 4.34.0 in another cell -- confirm the resulting pair is compatible.
!pip install --upgrade huggingface_hub


In [None]:
from transformers import AutoTokenizer, AutoModel

# Load the Bio_ClinicalBERT tokenizer and encoder from the Hugging Face Hub.
# The checkpoint id is named once so both objects come from the same model.
CLINICAL_BERT_CHECKPOINT = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(CLINICAL_BERT_CHECKPOINT)
model = AutoModel.from_pretrained(CLINICAL_BERT_CHECKPOINT)





In [None]:
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

## Load data

In [None]:
# Read the triage table from 'triage.csv' (path is relative to the current
# working directory) into a DataFrame.
dataset = pd.read_csv('triage.csv')

## Tokenize function

In [None]:
def preprocess(text, max_length=512):
    """Tokenize ``text`` into PyTorch tensors for the module-level ``tokenizer``.

    Args:
        text: The text to tokenize. ``None`` and float NaN (how pandas
            represents a missing cell) are treated as an empty string so a
            missing value does not abort a whole embedding run.
        max_length: Maximum number of tokens kept, special tokens included.
            Defaults to 512, the longest sequence a BERT-base encoder accepts.

    Returns:
        The tokenizer output (``return_tensors="pt"``), ready to be unpacked
        into the model call with ``**``.
    """
    # NaN is the only value that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        text = ""
    # Without truncation an over-long input would exceed the encoder's
    # position-embedding table and fail inside the model forward pass.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
    )
    return inputs

## Embedding function

In [None]:
def get_vector(complaint):
    """Return the [CLS] embedding of ``complaint`` as a NumPy array.

    The text is tokenized with ``preprocess`` and passed through the
    module-level ``model``; the hidden state at sequence position 0 is taken
    as the embedding.

    Args:
        complaint: The text to embed.

    Returns:
        A 2-D ``numpy.ndarray``: the batch axis is kept, so a single input
        yields one row.
    """
    # Local import: torch is not imported at the top of this file.
    import torch

    inputs = preprocess(complaint)
    # Inference only: disabling autograd avoids building a gradient graph for
    # every row, which saves memory and time over a long embedding loop.
    with torch.no_grad():
        outputs = model(**inputs)
    # Position 0 of the last hidden state is the [CLS] token.
    cls_embedding = outputs.last_hidden_state[:, 0, :]
    return cls_embedding.detach().numpy()


## Preprocess data

In [None]:
# Display the loaded DataFrame (notebook rich output) for a quick look.
dataset

In [None]:
# Keep the columns at positions 0, 1 and 10. The last of these (position 2 of
# X) is the text column that is embedded in the next step.
# NOTE(review): presumably two identifier columns plus the complaint text --
# confirm against the CSV header.
X = dataset.iloc[:, [0,1,10]]

In [None]:
# Embed every entry of X's third column, one model call per row.
complaint_texts = X.iloc[:, 2]
cls_embeddings = list(map(get_vector, complaint_texts))

In [None]:
# Stack the per-row 2-D embeddings along the first axis into one matrix.
cls_embeddings = np.concatenate(cls_embeddings, axis=0)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardize every embedding dimension, then fit a PCA with all components
# kept so its explained-variance ratios can be inspected.
scaler = StandardScaler()
scaler.fit(cls_embeddings)
scaled_data = scaler.transform(cls_embeddings)

pca = PCA().fit(scaled_data)

In [None]:
import matplotlib.pyplot as plt

# Cumulative explained variance versus number of components, with a dashed
# reference line at 0.95.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
component_counts = np.arange(1, cumulative_variance.size + 1)

plt.plot(component_counts, cumulative_variance)
plt.axhline(y=0.95, color='r', linestyle='--')
plt.title('Scree Plot')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.show()

In [None]:
# Refit PCA keeping only the leading 70 components and project the
# standardized embeddings onto them.
N_COMPONENTS = 70
pca = PCA(n_components=N_COMPONENTS)
reduced_vectors = pca.fit_transform(scaled_data)

In [None]:
# Append the PCA components as extra columns; the join aligns rows on index.
pca_frame = pd.DataFrame(reduced_vectors)
X = X.join(pca_frame)

In [None]:
# Print the table of selected columns plus the appended PCA components.
print(X)

## Using the k-distance elbow method to choose the DBSCAN `eps`

In [None]:
from sklearn.neighbors import NearestNeighbors
import plotly.express as px

# Build the k-distance curve: for every point take the distance stored in the
# last column of its `neighbors`-nearest-neighbour query, then sort those
# distances in descending order and plot them.
# NOTE(review): the query points are the fitted points themselves, so each
# point is presumably returned as its own nearest neighbour (distance 0).
# NOTE(review): k is 40 here while DBSCAN is later run with min_samples=4;
# the k-distance heuristic normally uses the same value -- confirm intent.
neighbors = 40
nbrs = NearestNeighbors(n_neighbors=neighbors)
nbrs.fit(reduced_vectors)
distances, indices = nbrs.kneighbors(reduced_vectors)

kth_neighbor_distances = distances[:, neighbors - 1]
distance_desc = sorted(kth_neighbor_distances, reverse=True)

rank_axis = list(range(1, len(distance_desc) + 1))
px.line(x=rank_axis, y=distance_desc)

In [None]:
#!pip install kneed
from kneed import KneeLocator
# Locate the knee of the descending k-distance curve. x is the 1-based rank
# of each point, y its sorted distance; the curve is smoothed with a
# degree-15 polynomial fit before the knee search.
knee_x_values = range(1, len(distance_desc) + 1)
kneedle = KneeLocator(
    x=knee_x_values,
    y=distance_desc,
    S=0.1,
    curve="convex",
    direction="decreasing",
    interp_method='polynomial',
    online=False,
    polynomial_degree=15,
)

In [None]:
# Draw kneed's built-in plot of the normalized curve with the detected knee.
kneedle.plot_knee_normalized()

In [None]:
# Print the knee position on the normalized x axis.
# NOTE(review): per kneed's documentation this is an x (rank) coordinate
# rescaled to [0, 1], not a distance value -- confirm before using it as eps.
print(kneedle.norm_knee)

## Training the DBSCAN model on the dataset

In [None]:
# Derive eps from the k-distance curve: `knee_y` is the distance value at the
# detected knee. (`norm_knee` is the knee's position on the normalized rank
# axis, so scaling it by a constant does not yield a distance.)
eps = kneedle.knee_y
if eps is None:
    # KneeLocator leaves knee_y as None when it cannot find a knee.
    raise ValueError("KneeLocator found no knee; cannot derive eps for DBSCAN")

# NOTE(review): min_samples is 4 while the k-distance curve was built with a
# different k -- confirm which value is intended and keep the two in sync.
dbscan = DBSCAN(eps=float(eps), min_samples=4)
# One integer label per row of reduced_vectors; DBSCAN marks noise as -1.
y_dbscan = dbscan.fit_predict(reduced_vectors)

## Visualising the clusters in 2D

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Project the PCA-reduced embeddings onto two principal components so the
# clusters can be drawn in a 2-D scatter plot. This rebinds `pca`, so any
# previously fitted instance under that name is no longer reachable.
pca = PCA(n_components=2)
vectors_pca = pca.fit_transform(reduced_vectors)

In [None]:
import matplotlib.pyplot as plt
# 2-D scatter of the first two principal components, coloured by DBSCAN label.
plt.figure(figsize=(10, 8))
plt.scatter(vectors_pca[:, 0], vectors_pca[:, 1], c=y_dbscan, cmap='viridis')

# Take colour-bar ticks from the labels actually present (including -1, which
# DBSCAN uses for noise) instead of a hard-coded range. With many clusters,
# fall back to matplotlib's automatic ticks so the bar stays readable.
cluster_labels = np.unique(y_dbscan)
tick_values = cluster_labels if len(cluster_labels) <= 30 else None
plt.colorbar(ticks=tick_values, label='Cluster ID')

plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
# The labels come from DBSCAN, so the title names that algorithm.
plt.title('Cluster Plot with DBSCAN Clustering')
plt.show()

## Visualising the clusters in 3D

In [None]:
# Project the PCA-reduced embeddings onto three principal components for the
# 3-D plot. This rebinds both `pca` and `vectors_pca`, replacing whatever was
# previously stored under those names.
pca = PCA(n_components=3)
vectors_pca = pca.fit_transform(reduced_vectors)

In [None]:
from mpl_toolkits.mplot3d import Axes3D

# 3-D scatter of the first three principal components, coloured by DBSCAN label.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

scatter = ax.scatter(
    vectors_pca[:, 0],
    vectors_pca[:, 1],
    vectors_pca[:, 2],
    c=y_dbscan,
    cmap='viridis',
    marker='o',
)

# Take colour-bar ticks from the labels actually present (including -1, which
# DBSCAN uses for noise) instead of a hard-coded range. With many clusters,
# fall back to matplotlib's automatic ticks so the bar stays readable.
cluster_labels = np.unique(y_dbscan)
tick_values = cluster_labels if len(cluster_labels) <= 30 else None
colorbar = fig.colorbar(scatter, ax=ax, ticks=tick_values)
colorbar.set_label('Cluster ID')

ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_zlabel('Principal Component 3')
# The labels come from DBSCAN, so the title names that algorithm.
ax.set_title('3D Cluster Plot with DBSCAN Clustering')

plt.show()

## Extracting what is in each cluster

In [None]:
# Attach each row's DBSCAN label as a 'Cluster' column; the join aligns rows
# on index.
cluster_column = pd.DataFrame({'Cluster': y_dbscan})
clustered_data = X.join(cluster_column)

In [None]:
# Display the table with the 'Cluster' column appended (notebook rich output).
clustered_data

In [None]:
# Write the clustered table to CSV without the index column.
# NOTE(review): the file name contains '*', which is not a legal file-name
# character on Windows -- confirm the target platform or rename.
clustered_data.to_csv('BERT_MIMIC_whole_clustered_dbscan_eps*80_4.csv', index=False)