In [None]:
import re

In [None]:
def load_docx(file_path):
    """Collect the text of every table cell in a .docx file.

    `Document` must already be in scope when this is called (presumably
    python-docx's `Document` -- confirm; it is not imported in this cell).

    Args:
        file_path: Path passed straight through to `Document`.

    Returns:
        A list with one entry per table row, across all tables in document
        order; each entry is the list of that row's cell texts.
    """
    document = Document(file_path)
    return [
        [cell.text for cell in row.cells]
        for table in document.tables
        for row in table.rows
    ]

In [None]:
def is_tuberculosis_related(complaint):
    """Tell whether a complaint mentions tuberculosis.

    The match is case-insensitive and looks for the stem 'tuber', so
    'Tuberculosis', 'tubercular', etc. all match.

    Args:
        complaint: The complaint text. Non-string values (e.g. None, or the
            float NaN pandas uses for empty cells) are treated as unrelated.

    Returns:
        True if any keyword pattern is found in the complaint, else False.
    """
    # Guard against missing values: they have no .lower() and cannot match.
    if not isinstance(complaint, str):
        return False

    keywords = ['tuber']
    # Lowercase once instead of once per keyword.
    lowered = complaint.lower()
    return any(re.search(keyword, lowered) for keyword in keywords)


In [None]:
import pandas as pd
# Location of the clustered dataset CSV -- replace with your actual file path
CLUSTERED_CSV_PATH = '/content/clustered_DBSCAN_eps=eps_50_4_wed.csv'

# Read the CSV into a DataFrame
data = pd.read_csv(CLUSTERED_CSV_PATH)



In [None]:
# Print the loaded DataFrame as a quick sanity check
print(data)

In [None]:
# Select the cluster labels and the complaint text from the DataFrame by column name.
# The commented-out line is an older, positional variant (chief complaint in the 4th column).
#chief_complaints = [row[3] for row in data if len(row) > 1 and row[3]!= ("cc")]
cluster_labels = data['cluster']
chief_complaints = data['text']


In [None]:
# Number of entries in the 'cluster' column
len(cluster_labels)

In [None]:
# Print the 'cluster' column to inspect the assigned labels
print(cluster_labels)

# Silhouette score

In [None]:
#!pip install gower
import pandas as pd
import gower
from sklearn.metrics import silhouette_samples

# NOTE: `data_for_clustering` is bound to the same DataFrame object as `data`
# (no copy is made and no columns are dropped), so the conversion below also changes `data`.
data_for_clustering = data
# Parse 'week_start' into datetime values, then group the rows by week
data_for_clustering['week_start'] = pd.to_datetime(data_for_clustering['week_start'])
weekly_groups = data_for_clustering.groupby('week_start')
# Unused alternative, kept for reference: derive the weeks from a 'date' column instead
#data_for_clustering['date'] = pd.to_datetime(data_for_clustering['date'])

# Group by week using pandas Grouper
#weekly_groups = data_for_clustering.groupby(pd.Grouper(key='date', freq='W-MON'))


In [None]:
!pip install gower
import pandas as pd
import numpy as np
import gower
from sklearn.metrics import silhouette_samples, silhouette_score

# Second name for the loaded DataFrame (same object as `data`, not a copy)
clustered_dataset = data

def parse_vector_string(vector_str):
    """Convert the string form of a numeric vector into a 1-D numpy array.

    Accepts numpy-style reprs ('[0.1 0.2\\n 0.3]') as well as comma-separated
    list reprs ('[0.1, 0.2, 0.3]'); surrounding whitespace and square
    brackets are ignored.

    Args:
        vector_str: Text containing the vector's components.

    Returns:
        np.ndarray of floats, one entry per component (empty if no components).

    Raises:
        ValueError: If a component cannot be parsed as a float.
    """
    # Strip outer whitespace first so brackets at the edges are reachable.
    cleaned = vector_str.strip().strip('[]')
    # Treat commas like whitespace; split() with no argument also handles newlines.
    tokens = cleaned.replace(',', ' ').split()
    return np.array([float(token) for token in tokens])

# Per-week results keyed by week_start; each value maps cluster id -> info dict
# (an empty dict when the week was skipped or failed).
weekly_cc_per_cluster = {}

# Loop through each week's data
for week_start, week_data in clustered_dataset.groupby('week_start'):
    print(f"\nProcessing week: {week_start}")

    # Reset per-week state up front so the error handler can never report a
    # previous week's `vectors`, and a skipped week starts from empty results.
    vectors = None
    cc_per_cluster = {}

    try:
        # Parse vectors from strings and stack them, one row per record
        vectors = np.vstack([parse_vector_string(str(v)) for v in week_data['reduced_vector'].values])

        # Convert to DataFrame for gower distance
        vectors_df = pd.DataFrame(
            vectors,
            columns=[f'dim_{i}' for i in range(vectors.shape[1])]
        )

        # Get cluster labels
        week_cluster_labels = week_data['cluster'].values

        # Noise points (label -1) are excluded from every score
        mask = week_cluster_labels != -1
        non_noise_cluster_labels = week_cluster_labels[mask]
        non_noise_labels = set(non_noise_cluster_labels)

        # Two distinct clusters already imply at least one non-noise point
        if len(non_noise_labels) >= 2:
            # Compute Gower distance matrix for non-noise points
            print(f"Computing Gower distances for {mask.sum()} non-noise points")

            # Select only non-noise points for distance calculation
            vectors_df_filtered = vectors_df[mask].copy()
            gower_distances = gower.gower_matrix(vectors_df_filtered)

            # Per-sample silhouette values from the precomputed distance matrix
            silhouette_vals = silhouette_samples(
                gower_distances,
                non_noise_cluster_labels,
                metric="precomputed"
            )

            # Report and store each cluster in one pass; sorted for a stable output order
            cluster_silhouette_scores = {}
            for cluster in sorted(non_noise_labels):
                cluster_mask = non_noise_cluster_labels == cluster
                cluster_silhouette_vals = silhouette_vals[cluster_mask]
                mean_silhouette_score = cluster_silhouette_vals.mean()
                cluster_silhouette_scores[cluster] = mean_silhouette_score

                # Get the texts for this cluster
                cluster_texts = week_data[week_data['cluster'] == cluster]['text'].values

                print(f"\nCluster {cluster}:")
                print(f"Silhouette Score: {mean_silhouette_score:.3f}")
                print(f"Number of texts: {len(cluster_texts)}")
                print(f"Score range: {cluster_silhouette_vals.min():.3f} to {cluster_silhouette_vals.max():.3f}")

                # Mean pairwise Gower distance within the cluster (upper triangle
                # only, so each pair counts once and the diagonal is skipped)
                cluster_indices = np.where(cluster_mask)[0]
                if len(cluster_indices) > 1:
                    cluster_distances = gower_distances[cluster_indices][:, cluster_indices]
                    mean_dist = np.mean(cluster_distances[np.triu_indices_from(cluster_distances, k=1)])
                    print(f"Mean internal Gower distance: {mean_dist:.3f}")

                print("Sample texts:")
                print(cluster_texts[:3])

                # Store results for this cluster
                cc_per_cluster[cluster] = {
                    'texts': cluster_texts,
                    'silhouette_score': mean_silhouette_score,
                    'size': len(cluster_texts),
                    'sample_texts': cluster_texts[:5]
                }

            # The overall silhouette score is the mean of the per-sample values
            # already computed above, so the O(n^2) pass is not repeated.
            overall_score = silhouette_vals.mean()
            print(f"\nOverall week silhouette score: {overall_score:.3f}")

        else:
            print(f"Insufficient unique clusters or samples for week {week_start}")
            print(f"Number of non-noise points: {mask.sum()}")
            print(f"Number of unique non-noise clusters: {len(non_noise_labels)}")

    except Exception as e:
        # Best-effort: report the failure and carry on with the next week
        print(f"Error processing week {week_start}:")
        print(f"Error details: {str(e)}")
        print("Data shape:", vectors.shape if vectors is not None else "No vectors")
        # Discard anything stored before the failure
        cc_per_cluster = {}

    weekly_cc_per_cluster[week_start] = cc_per_cluster

# Report per-week summary statistics followed by a short line-up of each cluster
print("\nOverall Results Summary:")
for week, clusters in weekly_cc_per_cluster.items():
    # Weeks with no stored clusters have nothing to report
    if not clusters:
        continue

    print(f"\nWeek {week}")
    print(f"Number of clusters: {len(clusters)}")

    # Silhouette statistics across the week's clusters
    scores = [info['silhouette_score'] for info in clusters.values()]
    score_stats = (
        ("Average", np.mean(scores)),
        ("Min", min(scores)),
        ("Max", max(scores)),
    )
    for stat_name, stat_value in score_stats:
        print(f"{stat_name} silhouette score: {stat_value:.3f}")

    # Cluster size statistics
    sizes = [info['size'] for info in clusters.values()]
    print(f"Average cluster size: {np.mean(sizes):.1f}")
    print(f"Min cluster size: {min(sizes)}")
    print(f"Max cluster size: {max(sizes)}")

    # One short entry per cluster, with its first sample text when available
    for cluster_id, cluster_info in clusters.items():
        print(f"\n  Cluster {cluster_id}:")
        print(f"  Size: {cluster_info['size']}")
        print(f"  Silhouette Score: {cluster_info['silhouette_score']:.3f}")
        samples = cluster_info['sample_texts']
        if len(samples) > 0:
            print("  Sample text:", samples[0])