# Importing Libraries

In [ ]:
import os
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from Evaluation import compute_clusters, compute_purity, compute_recall, compute_f1, compute_entropy

# Reading Data from Text Files

In [ ]:
data_directory = "data"
# Per subject, the first 48 segment files (in sorted name order) are used for
# training and the remaining ones for evaluation.
TRAIN_SEGMENTS_PER_SUBJECT = 48

training_data_means = []
evaluation_data_means = []
flattened_training_data = []
flattened_evaluation_data = []
training_labels = []
evaluation_labels = []


def _read_segment(segment_file_path):
    """Load one comma-separated segment file.

    Returns:
        (mean_data, flattened_data): the mean of the segment along axis 0 and
        the whole segment flattened into a single vector.
    """
    segment_data = np.loadtxt(segment_file_path, delimiter=',')
    return np.mean(segment_data, axis=0), segment_data.flatten()


# Step 1: Accessing the Data Directory
# os.listdir returns names in arbitrary order; sorting keeps the row order and
# the train/evaluation split identical on every run and machine. Non-directory
# entries (e.g. hidden files) are skipped because they are not activity folders.
activities = sorted(
    name for name in os.listdir(data_directory)
    if os.path.isdir(os.path.join(data_directory, name))
)

# Step 2: Iterating Through Activity Folders
for activity in activities:
    activity_path = os.path.join(data_directory, activity)
    # The folder name carries the label after the letter "a"
    # (assumes names of the form a<number> - TODO confirm).
    activity_number = int(activity.split("a")[1])

    print("Activity Number:", activity_number)
    print("Activity:", activity)

    # Step 3: Iterating Through Subject Folders
    subjects = sorted(os.listdir(activity_path))
    for subject in subjects:
        subject_path = os.path.join(activity_path, subject)

        # Step 4: Reading Text Files (Segments)
        segments = sorted(os.listdir(subject_path))
        training_segments = segments[:TRAIN_SEGMENTS_PER_SUBJECT]
        evaluation_segments = segments[TRAIN_SEGMENTS_PER_SUBJECT:]

        # One label is appended per segment actually read, so the label lists
        # always line up with the data rows regardless of how many subjects
        # or segments are present on disk.
        for segment_file in training_segments:
            mean_data, flattened_data = _read_segment(os.path.join(subject_path, segment_file))
            training_data_means.append(mean_data)
            flattened_training_data.append(flattened_data)
            training_labels.append(activity_number)

        for segment_file in evaluation_segments:
            mean_data, flattened_data = _read_segment(os.path.join(subject_path, segment_file))
            evaluation_data_means.append(mean_data)
            flattened_evaluation_data.append(flattened_data)
            evaluation_labels.append(activity_number)

# Convert the data lists into numpy arrays
training_data_means = np.array(training_data_means)
evaluation_data_means = np.array(evaluation_data_means)
flattened_training_data = np.array(flattened_training_data)
flattened_evaluation_data = np.array(flattened_evaluation_data)

# The projection is fitted on the training rows only and then applied unchanged
# to the evaluation rows. A fractional n_components asks scikit-learn's PCA to
# keep enough components to explain that share of the variance (here 90%).
PCA_reduction = PCA(n_components=0.9)
PCA_training_data = PCA_reduction.fit_transform(flattened_training_data)
PCA_evaluation_data = PCA_reduction.transform(flattened_evaluation_data)


# Now, 'training_data_means' and 'evaluation_data_means' contain mean values of each column for each segment,
# and 'flattened_training_data' and 'flattened_evaluation_data' contain flattened data for each segment.
print("Shape of training labels", np.array(training_labels).shape)
print("Shape of evaluation labels", np.array(evaluation_labels).shape)
print("Shape of training data (means):", training_data_means.shape)
print("Shape of evaluation data (means):", evaluation_data_means.shape)
print("Shape of flattened training data:", flattened_training_data.shape)
print("Shape of flattened evaluation data:", flattened_evaluation_data.shape)
print("Shape of PCA training data:", PCA_training_data.shape)
print("Shape of PCA evaluation data:", PCA_evaluation_data.shape)


# Kmeans Clustering implementation

In [ ]:
class Kmeans:
    """K-means clustering using Euclidean distance.

    Initial centroids are rows of the data picked at random without
    replacement. After ``fit`` the instance exposes:

    * ``centroids`` / ``cluster_centers_`` - the final cluster centres,
    * ``clusters`` - the cluster index assigned to each fitted row,
    * ``inertia_`` - the sum of squared distances from each row to the
      centre of its assigned cluster.
    """

    def __init__(self, n_clusters, max_iter=300, tol=1e-4):
        """Store the hyper-parameters.

        Args:
            n_clusters: number of clusters to form.
            max_iter: maximum number of assignment/update rounds.
            tol: iteration stops once the norm of the centroid shift between
                two rounds drops below this value.
        """
        self.centroids = None
        self.cluster_centers_ = None
        self.clusters = None
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol
        self.inertia_ = None
        # NOTE: this reseeds NumPy's *global* generator, so a fit() issued
        # right after construction draws the same initial centroids each time.
        np.random.seed(42)

    def fit(self, data):
        """Cluster ``data`` (one sample per row) and store the result.

        Args:
            data: 2-D array-like of samples.

        Returns:
            The fitted instance, so calls can be chained.

        Raises:
            ValueError: from ``np.random.choice`` when ``n_clusters`` exceeds
                the number of rows.
        """
        data = np.asarray(data)

        # Initialize centroids randomly
        clusters = None
        centroids_idx = np.random.choice(data.shape[0], self.n_clusters, replace=False)
        self.centroids = data[centroids_idx]

        for _ in range(self.max_iter):
            # Assignment step: nearest centroid for every row.
            distances = np.linalg.norm(data[:, None] - self.centroids, axis=2)
            clusters = np.argmin(distances, axis=1)

            # Update step
            updated = []
            for k in range(self.n_clusters):
                members = data[clusters == k]
                # An empty cluster has no mean: NumPy would return NaN, which
                # then contaminates every later distance and prevents
                # convergence. Keep the previous centroid in that case.
                updated.append(members.mean(axis=0) if len(members) else self.centroids[k])
            new_centroids = np.array(updated)

            # Check convergence
            if np.linalg.norm(new_centroids - self.centroids) < self.tol:
                break

            self.centroids = new_centroids

        self.clusters = clusters
        self.cluster_centers_ = self.centroids

        # Calculate inertia
        self.inertia_ = self._calculate_inertia(data)
        return self

    def _calculate_inertia(self, data):
        """Return the sum of squared distances of rows to their assigned centroid."""
        inertia = 0
        for i in range(self.n_clusters):
            cluster_points = data[self.clusters == i]
            distances = np.linalg.norm(cluster_points - self.centroids[i], axis=1)
            inertia += np.sum(distances**2)
        return inertia

    def predict(self, data):
        """Return, for each row of ``data``, the index of the nearest fitted centroid."""
        distances = np.linalg.norm(data[:, None] - self.centroids, axis=2)
        return np.argmin(distances, axis=1)

# Applying Kmeans Clustering on the first approach

In [ ]:
# Cluster counts tried in every experiment of this notebook.
Ks = [8, 13, 19, 28, 38]

# One silhouette score and one inertia value per K, for the per-segment means.
means_score, means_inertia = [], []
for K in Ks:
    kmeans = Kmeans(n_clusters=K)
    kmeans.fit(training_data_means)

    silhouette = metrics.silhouette_score(
        training_data_means, kmeans.clusters, metric='euclidean'
    )
    means_score.append(silhouette)
    means_inertia.append(kmeans.inertia_)

    print("Silhouette Score:", silhouette)
    print("Inertia:", kmeans.inertia_)


# Applying Kmeans Clustering on the second approach

In [ ]:
# One silhouette score and one inertia value per K, for the PCA-reduced data.
reduce_score, reduce_inertia = [], []
for K in Ks:
    kmeans = Kmeans(n_clusters=K)
    kmeans.fit(PCA_training_data)

    silhouette = metrics.silhouette_score(
        PCA_training_data, kmeans.clusters, metric='euclidean'
    )
    reduce_score.append(silhouette)
    reduce_inertia.append(kmeans.inertia_)

    print("Silhouette Score:", silhouette)
    print("Inertia:", kmeans.inertia_)

# Plotting the results

## Silhouette Score vs Number of Clusters

In [ ]:
# Silhouette score of both feature representations for every K in Ks.
plt.figure(figsize=(12, 6))
# NOTE(review): only the left slot of this 1x2 grid is drawn in this figure.
plt.subplot(1, 2, 1)
plt.plot(Ks, means_score, marker='o', label='Mean Data')
plt.plot(Ks, reduce_score, marker='x', label='PCA Data')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score vs Number of Clusters')
# The label= arguments above are only rendered once a legend is requested;
# without it the two curves cannot be told apart.
plt.legend()
plt.show()

From the plot, we see that the silhouette score is higher for the mean data than for the PCA data, which indicates that the mean data is better suited to clustering. The silhouette score is highest at 38 clusters for the mean data, whereas it is highest at 8 clusters for the PCA data.

## Inertia vs Number of Clusters

In [ ]:
# Inertia of both feature representations for every K in Ks.
plt.figure(figsize=(12, 6))
# NOTE(review): only the right slot of this 1x2 grid is drawn in this figure.
plt.subplot(1, 2, 2)
plt.plot(Ks, means_inertia, marker='o', label='Mean Data')
plt.plot(Ks, reduce_inertia, marker='x', label='PCA Data')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.title('Inertia vs Number of Clusters')
# The label= arguments above are only rendered once a legend is requested;
# without it the two curves cannot be told apart.
plt.legend()
plt.show()

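From the plot, we see that the inertia is higher for the PCA data than for the mean data, which suggests that the mean data is better suited to clustering. Note, however, that inertia depends on the scale and dimensionality of the feature space, so comparing it across the two representations is only indicative. The inertia is highest for eight clusters in both cases.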

# Conclusion

We observed that the mean data is better suited to clustering than the PCA data, for the following reasons:
1. The silhouette score is higher for the mean data than for the PCA data.
2. The inertia is lower for the mean data than for the PCA data.

# Evaluation

## Purity

In [ ]:
# Purity for every (data split, feature representation) pair and every K.
# Each entry: (heading printed before the results, feature matrix, true labels).
purity_runs = (
    ("Training data + Means approach", training_data_means, training_labels),
    ("Training data + PCA approach", PCA_training_data, training_labels),
    ("Evaluation data + Means approach", evaluation_data_means, evaluation_labels),
    ("Evaluation data + PCA approach", PCA_evaluation_data, evaluation_labels),
)

for run_title, run_data, run_labels in purity_runs:
    print(run_title)
    for K in Ks:
        # A fresh model is fitted on the very data that is being scored.
        kmeans = Kmeans(n_clusters=K)
        kmeans.fit(run_data)
        cluster_summary = compute_clusters(run_labels, kmeans.clusters, K)
        print(f'# clusters = {K}:', compute_purity(cluster_summary, len(run_data)))

## Recall

In [ ]:
# Recall for every (data split, feature representation) pair and every K.
# Each entry: (heading printed before the results, feature matrix, true labels).
recall_runs = (
    ("Training data + Means approach", training_data_means, training_labels),
    ("Training data + PCA approach", PCA_training_data, training_labels),
    ("Evaluation data + Means approach", evaluation_data_means, evaluation_labels),
    ("Evaluation data + PCA approach", PCA_evaluation_data, evaluation_labels),
)

for run_title, run_data, run_labels in recall_runs:
    print(run_title)
    for K in Ks:
        # A fresh model is fitted on the very data that is being scored.
        kmeans = Kmeans(n_clusters=K)
        kmeans.fit(run_data)
        cluster_summary = compute_clusters(run_labels, kmeans.clusters, K)
        print(f'# clusters = {K}:', compute_recall(cluster_summary, len(run_data)))

## F1 Score

In [ ]:
# F1 score for every (data split, feature representation) pair and every K.
# Each entry: (heading printed before the results, feature matrix, true labels).
f1_runs = (
    ("Training data + Means approach", training_data_means, training_labels),
    ("Training data + PCA approach", PCA_training_data, training_labels),
    ("Evaluation data + Means approach", evaluation_data_means, evaluation_labels),
    ("Evaluation data + PCA approach", PCA_evaluation_data, evaluation_labels),
)

for run_title, run_data, run_labels in f1_runs:
    print(run_title)
    for K in Ks:
        # A fresh model is fitted on the very data that is being scored.
        kmeans = Kmeans(n_clusters=K)
        kmeans.fit(run_data)
        cluster_summary = compute_clusters(run_labels, kmeans.clusters, K)
        print(f'# clusters = {K}:', compute_f1(cluster_summary))

## Conditional Entropy

In [ ]:
# Conditional entropy for every (data split, feature representation) pair and every K.
# Each entry: (heading printed before the results, feature matrix, true labels).
entropy_runs = (
    ("Training data + Means approach", training_data_means, training_labels),
    ("Training data + PCA approach", PCA_training_data, training_labels),
    ("Evaluation data + Means approach", evaluation_data_means, evaluation_labels),
    ("Evaluation data + PCA approach", PCA_evaluation_data, evaluation_labels),
)

for run_title, run_data, run_labels in entropy_runs:
    print(run_title)
    for K in Ks:
        # A fresh model is fitted on the very data that is being scored.
        kmeans = Kmeans(n_clusters=K)
        kmeans.fit(run_data)
        cluster_summary = compute_clusters(run_labels, kmeans.clusters, K)
        print(f'# clusters = {K}:', compute_entropy(cluster_summary, len(run_data)))