## Import libraries

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import gensim.downloader as api
import pandas as pd
import numpy as np
import gensim

In [None]:
!pip install kmodes
from kmodes.kprototypes import KPrototypes

## Download NLTK data

In [None]:
# Fetch the NLTK resources this notebook relies on: the tokenizer models
# behind word_tokenize and the stopwords corpus.
nltk.download("punkt")
# Recent NLTK releases load the tokenizer from "punkt_tab" instead of the
# pickled "punkt" package; fetching both keeps word_tokenize working either way.
nltk.download("punkt_tab")
nltk.download("stopwords")

## Preprocess function

In [None]:
def load_stop_words(file_path):
    """Read a stop-word list from a text file with one entry per line.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        set[str]: The non-empty lines with surrounding whitespace removed.
    """
    # An explicit encoding keeps the result independent of the platform locale.
    with open(file_path, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        # Blank lines would otherwise put the empty string into the set.
        return {word for word in stripped_lines if word}

# Punctuation characters from the standard library, as a set for fast lookup.
punctuation = set(string.punctuation)

# Custom stop-word list, read once when this cell runs.
stop_words_file = 'clinical-stopwords.txt'
custom_stop_words = load_stop_words(stop_words_file)

def preprocess(text):
    """Lower-case and tokenize *text*, dropping stop words and punctuation.

    Args:
        text: Raw text. Non-string values (for example ``None`` or the float
            NaN pandas uses for missing cells) are treated as empty text.

    Returns:
        list[str]: Tokens that are neither in ``custom_stop_words`` nor in
        ``punctuation``.
    """
    # Missing values have no .lower(); yield no tokens instead of raising.
    if not isinstance(text, str):
        return []
    tokens = word_tokenize(text.lower())
    return [t for t in tokens if t not in custom_stop_words and t not in punctuation]

## Load data

In [None]:
# Load the raw records; later cells select columns from this frame by position.
dataset = pd.read_csv('synthetic_data_2021.csv')

## Preprocess data

In [None]:
# Parse 'date' as month/day/two-digit-year. errors='coerce' turns values that
# do not match the format into NaT instead of raising.
dataset['date'] = pd.to_datetime(dataset['date'], format='%m/%d/%y', errors='coerce')
print(dataset['date'])

In [None]:
# NOTE(review): if the previous cell already converted 'date' to datetimes this
# second parse should be a no-op; on raw strings it would apply a different
# format ('%Y/%m/%d') than the cell above ('%m/%d/%y') — confirm which is right.
dataset['date'] = pd.to_datetime(dataset['date'], format='%Y/%m/%d', errors='coerce')
# Weekly bins anchored on Friday (freq='W-FRI').
weekly_groups = dataset.groupby(pd.Grouper(key='date', freq='W-FRI'))

In [None]:
# Dump every weekly group: the bin label first, then the rows that fall into it.
for week, data in weekly_groups:
    print(f"Week ending on {week}", data, sep="\n")

In [None]:
# Keep columns 0, 2, 3 and 4 of the raw frame. Later cells use 'date',
# 'hospcode', 'agegroup' and 'cc' from X — presumably exactly these four; verify.
X = dataset.iloc[:, [0,2,3,4]]

In [None]:
# Tokenize every complaint text. Iterating a DataFrame directly yields its
# column labels, so select the complaint column explicitly (position 3, the
# column the embedding loop reads) and skip missing values, which cannot be
# tokenized.
processed_data = [preprocess(complaint) for complaint in X.iloc[:, 3].dropna()]


In [None]:
# Download the pre-trained BioWordVec embeddings (PubMed + MIMIC-III, 200-d per
# the file name). NOTE(review): the file is large — check free disk space first.
!wget -O BioWordVec_PubMed_MIMICIII_d200.vec.bin 'https://ftp.ncbi.nlm.nih.gov/pub/lu/Suppl/BioSentVec/BioWordVec_PubMed_MIMICIII_d200.vec.bin'

In [None]:
# Load the downloaded vectors (binary word2vec format) as gensim KeyedVectors.
biowordvec_path = 'BioWordVec_PubMed_MIMICIII_d200.vec.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(biowordvec_path, binary=True)


In [None]:
def get_vector(text, model):
    """Embed *text* as the mean of its in-vocabulary token vectors.

    Args:
        text: Raw text; tokenized with ``preprocess``.
        model: Embedding lookup supporting ``word in model`` and ``model[word]``.

    Returns:
        The element-wise average of the vectors of all tokens found in
        *model*, or ``None`` when no token is found.
    """
    known_vectors = []
    for token in preprocess(text):
        if token in model:
            known_vectors.append(model[token])

    # Nothing to average: signal "no embedding" to the caller.
    if not known_vectors:
        return None
    return sum(known_vectors) / len(known_vectors)

In [None]:
# Embed each complaint. Complaints without any in-vocabulary token are left
# out, so both lists can end up shorter than X.
vectors, valid_complaints = [], []
for complaint in X.iloc[:,3]:
    vector = get_vector(complaint, model)
    if vector is None:
        continue
    vectors.append(vector)
    valid_complaints.append(complaint)

In [None]:
# Inspect the collected embeddings (one entry per retained complaint).
print(vectors)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Fit a PCA without a component limit so the explained-variance profile of the
# embeddings can be inspected in the next cell.
pca = PCA()
pca.fit(vectors)

In [None]:
import matplotlib.pyplot as plt

# Cumulative share of variance explained by the first k principal components.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
component_counts = range(1, len(pca.explained_variance_ratio_) + 1)

plt.plot(component_counts, cumulative_variance)
# Dashed reference line at 95 % cumulative variance.
plt.axhline(y=0.95, color='r', linestyle='--')
plt.title('Scree Plot')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.show()

In [None]:
# NOTE(review): 130 components is presumably read off the plot above — confirm
# it reaches the 0.95 cumulative-variance line.
pca = PCA(n_components=130)
reduced_vectors = pca.fit_transform(vectors)

In [None]:
# Attach the reduced embeddings to X. `reduced_vectors` only has rows for the
# complaints that produced a vector, so a positional 0..n-1 index would shift
# embeddings onto the wrong rows as soon as one complaint was skipped. A row
# was embedded exactly when its text appears in `valid_complaints`, so recover
# the original row labels from that; rows without a vector get NaN features.
embedded_rows = X.index[X.iloc[:, 3].isin(set(valid_complaints))]
X = X.join(pd.DataFrame(reduced_vectors, index=embedded_rows))
#X.to_csv('vectorized.csv')

In [None]:
def column_index(df, query_cols):
    """Return the positional indices of *query_cols* within ``df.columns``.

    Labels are compared by their string form so that frames mixing string and
    integer column labels can be handled.

    Args:
        df: DataFrame whose columns are searched.
        query_cols: Column labels to look up (an Index, array or plain list).

    Returns:
        numpy.ndarray: Integer positions, in the order of *query_cols*.

    Raises:
        KeyError: If any requested label is not a column of *df*.
    """
    cols = np.array([str(c) for c in df.columns], dtype=str)
    wanted = np.array([str(c) for c in query_cols], dtype=str)

    order = np.argsort(cols, kind="stable")
    slots = np.searchsorted(cols, wanted, sorter=order)

    # searchsorted returns an insertion point even for absent labels, so check
    # that each slot really holds the requested label before trusting it.
    in_range = slots < cols.size
    found = np.zeros(wanted.shape, dtype=bool)
    found[in_range] = cols[order[slots[in_range]]] == wanted[in_range]
    if not found.all():
        raise KeyError(f"columns not found: {wanted[~found].tolist()}")

    return order[slots]


# Columns at positions 1 and 2 of X are taken as the categorical features
# (later cells refer to 'hospcode' and 'agegroup' — presumably these; verify).
cat_cols = X.iloc[:, [1,2]]
# Positional indices of those columns within X, plus a plain-list copy.
categorical_indices = column_index(X, cat_cols.columns)
categorical = list(categorical_indices)

In [None]:
# Inspect the selected categorical columns.
print(cat_cols)

In [None]:
# Inspect the feature frame after the embedding columns were joined.
print(X)

In [None]:
# Group X by week, with weekly bins anchored on Wednesday (freq='W-WED').
# NOTE(review): the earlier grouping of `dataset` used 'W-FRI' — confirm which
# anchor day is intended.
weekly_groups = X.groupby(pd.Grouper(key='date', freq='W-WED'))

In [None]:
# Preview of the per-week feature preparation, run on the first weekly group.
# The frame is built here so that this cell does not rely on variables that no
# earlier cell assigns when the notebook is run top to bottom.
first_week, weekly_data = next(iter(weekly_groups))
data_for_clustering = weekly_data.drop(['date', 'cc'], axis=1)
# The two categorical features are cast to strings before being located.
data_for_clustering['hospcode'] = data_for_clustering['hospcode'].astype(str)
data_for_clustering['agegroup'] = data_for_clustering['agegroup'].astype(str)
categorical_columns = data_for_clustering.select_dtypes(include='object').columns
categorical_indices = [data_for_clustering.columns.get_loc(col) for col in categorical_columns]



In [None]:
# Feature frame for one week: drop 'date' and 'cc' (presumably the timestamp
# and the complaint text; verify).
# NOTE(review): requires `weekly_data` (one weekly frame) to already exist in
# the session when this cell runs.
data_for_clustering = weekly_data.drop(['date', 'cc'], axis=1)

## Using the elbow method to find the optimal number of clusters

In [None]:
import matplotlib.pyplot as plt

# Prepare the features the same way the weekly training loop does: X itself
# still holds the 'date' and 'cc' columns, which are not features.
elbow_features = X.drop(['date', 'cc'], axis=1)
elbow_features['hospcode'] = elbow_features['hospcode'].astype(str)
elbow_features['agegroup'] = elbow_features['agegroup'].astype(str)
elbow_features = elbow_features.dropna()
elbow_categorical = [elbow_features.columns.get_loc(col)
                     for col in ('hospcode', 'agegroup')]

# One range object shared by the loop and the plot keeps the axes in sync.
cluster_counts = range(5, 30)
wcss = []
for i in cluster_counts:
    kproto = KPrototypes(n_clusters=i, init='Cao',
                         n_jobs=1, verbose=0,
                         random_state=42)
    # Only the fitted cost is needed, so skip the extra prediction pass.
    kproto.fit(elbow_features, categorical=elbow_categorical)
    wcss.append(kproto.cost_)
plt.plot(cluster_counts, wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

## Training a K-Prototypes model on each weekly group

In [None]:

# Cluster every weekly group separately and collect the labelled rows.
n_clusters = 12
categorical_columns = ['hospcode', 'agegroup']
clustered_data = []

for week_start, weekly_data in weekly_groups:
    # Features only: drop the 'date' and 'cc' columns, cast the categorical
    # columns to strings, then discard rows with missing feature values.
    data_for_clustering = weekly_data.drop(['date', 'cc'], axis=1)
    for col in categorical_columns:
        data_for_clustering[col] = data_for_clustering[col].astype(str)
    data_for_clustering = data_for_clustering.dropna()

    # Weekly bins can be empty or hold fewer rows than clusters; the model
    # cannot be fitted to those, so report and skip them instead of crashing.
    if len(data_for_clustering) < n_clusters:
        print(f"Skipping week {week_start}: {len(data_for_clustering)} usable rows "
              f"for {n_clusters} clusters")
        continue

    # Locate the categorical columns by name rather than by inferred dtype.
    categorical_indices = [data_for_clustering.columns.get_loc(col)
                           for col in categorical_columns]

    kproto = KPrototypes(n_clusters=n_clusters, init='Cao',
                         n_jobs=4, verbose=0,
                         random_state=42)
    clusters = kproto.fit_predict(data_for_clustering, categorical=categorical_indices)

    # Keep only the rows that were clustered; copy so that adding the label
    # column does not write into a slice of the grouped frame.
    weekly_data_cleaned = weekly_data.loc[data_for_clustering.index].copy()
    weekly_data_cleaned['cluster'] = clusters
    clustered_data.append(weekly_data_cleaned)

# Concatenate all weekly clustered data into a single DataFrame
clustered_dataset = pd.concat(clustered_data)

print(clustered_dataset)

In [None]:
# Number of weekly frames collected in clustered_data.
len(clustered_data)

In [None]:
# Attach each row's cluster label to X. `clusters` only holds the labels of
# the last week that was fitted, numbered from 0, so it cannot be aligned with
# X by index. `clustered_dataset` carries the labels of every week under X's
# own row labels. The inner join keeps only rows that were actually clustered.
X = X.join(clustered_dataset['cluster'], how='inner')

In [None]:
# '0_x' / '0_y' are presumably the suffixed names a merge gives to two colliding
# columns labelled 0; keys that are not present are ignored by rename.
X.rename(columns={'0_y': 'cluster', '0_x': '0'}, inplace=True)
# Persist the labelled frame.
X.to_csv('clustered.csv')

## Visualising the clusters in 2D

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Assuming X is a pandas DataFrame, convert it to a NumPy array
#X_dense = X.values

# Reduce dimensions (here using PCA for demonstration; consider t-SNE or MDS for better handling of categorical variables)
pca = PCA(n_components=2)
# Project whatever `vectors` currently holds onto its first two principal components.
vectors_pca = pca.fit_transform(vectors)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import numpy as np

# Grid layout: one subplot per clustered week, n_cols plots per row.
n_cols = 3
n_weeks = len(clustered_data)
n_rows = max(1, int(np.ceil(n_weeks / n_cols)))

# squeeze=False always returns a 2-D array of axes, so flatten() is safe for
# any grid size.
fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
axs = axs.flatten()

# Categorical columns are label-encoded (no one-hot encoding) before PCA.
encoder = LabelEncoder()
categorical_columns = ['hospcode', 'agegroup']

for i, week_frame in enumerate(clustered_data):
    week_features = week_frame.drop(columns=['cluster', 'date', 'cc'])
    for col in categorical_columns:
        week_features[col] = encoder.fit_transform(week_features[col])

    # Week-local names: the notebook-level `vectors`, `clusters` and `pca`
    # are left untouched for the cells that still use them.
    week_labels = week_frame['cluster'].values
    week_projection = PCA(n_components=2).fit_transform(week_features.values)

    axs[i].scatter(week_projection[:, 0], week_projection[:, 1],
                   c=week_labels, cmap='viridis', label='Cluster ID')
    axs[i].set_title(f'Week {i + 1}')
    axs[i].set_xlabel('PC1')
    axs[i].set_ylabel('PC2')

# Remove the unused axes of the last row; slicing by n_weeks also works when
# there are no weeks at all.
for unused_ax in axs[n_weeks:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()


## Visualising the clusters in 3D

In [None]:
# Apply PCA to reduce dimensions to three
# NOTE(review): this projects whatever `vectors` is bound to when the cell
# runs; the plotting cell below fits its own PCA for every week — confirm this
# cell is still needed.
pca = PCA(n_components=3)
vectors_pca = pca.fit_transform(vectors)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from mpl_toolkits.mplot3d import Axes3D  # kept: older Matplotlib needs it for projection='3d'
import matplotlib.pyplot as plt
import numpy as np

# Grid layout: one 3-D subplot per clustered week, n_cols plots per row.
n_cols = 3
n_weeks = len(clustered_data)
n_rows = max(1, int(np.ceil(n_weeks / n_cols)))

fig = plt.figure(figsize=(15, 5 * n_rows))

# Categorical columns are label-encoded (no one-hot encoding) before PCA.
encoder = LabelEncoder()
categorical_columns = ['hospcode', 'agegroup']

for i, week_frame in enumerate(clustered_data):
    week_features = week_frame.drop(columns=['cluster', 'date', 'cc'])
    for col in categorical_columns:
        week_features[col] = encoder.fit_transform(week_features[col])

    # Week-local names: the notebook-level `vectors`, `clusters` and `pca`
    # are left untouched.
    week_labels = week_frame['cluster'].values
    week_projection = PCA(n_components=3).fit_transform(week_features.values)

    ax = fig.add_subplot(n_rows, n_cols, i + 1, projection='3d')
    scatter = ax.scatter(week_projection[:, 0], week_projection[:, 1], week_projection[:, 2],
                         c=week_labels, cmap='viridis', label='Cluster ID')

    ax.set_title(f'Week {i + 1}')
    ax.set_xlabel('Principal Component 1')
    ax.set_ylabel('Principal Component 2')
    ax.set_zlabel('Principal Component 3')

    # Each scatter has its own colour normalisation, so every subplot gets its
    # own colour bar; doing this inside the loop also avoids touching
    # `scatter`/`ax` when there are no weeks to plot.
    fig.colorbar(scatter, ax=ax, shrink=0.5, aspect=5)

plt.tight_layout()
plt.show()
