# Reading Data from Text Files

In [1]:
import os
import numpy as np
from sklearn.decomposition import PCA

data_directory = "/home/hussein/PatternDatasets/Activities"
training_data_means = []
evaluation_data_means = []
flattened_training_data = []
flattened_evaluation_data = []
training_labels = []
evaluation_labels = []

# Step 1: Accessing the Data Directory
activities = os.listdir(data_directory)

# Step 2: Iterating Through Activity Folders
for activity in activities:
    activity_path = os.path.join(data_directory, activity)
    activity_number = int(activity.split("a")[1])
    
    print("Activity Number:", activity_number)
    print("Activity:", activity)
    training_labels.extend([activity_number] * 48)  # 48 segments for training
    evaluation_labels.extend([activity_number] * 12)  # 12 segments for evaluation
    
    
    # Step 3: Iterating Through Subject Folders
    subjects = os.listdir(activity_path)
    for subject in subjects:
        subject_path = os.path.join(activity_path, subject)
        
        # Step 4: Reading Text Files (Segments)
        segments = os.listdir(subject_path)
        training_segments = segments[:48]  # First 48 segments for training
        evaluation_segments = segments[48:]  # Rest for evaluation
        
        for segment_file in training_segments:
            segment_file_path = os.path.join(subject_path, segment_file)
            with open(segment_file_path, 'r') as file:
                segment_data = np.loadtxt(file, delimiter=',')
                mean_data = np.mean(segment_data, axis=0)  # Taking mean along columns
                training_data_means.append(mean_data)
                
                flattened_data = segment_data.flatten()  # Flattening the segment
                flattened_training_data.append(flattened_data)
        
        for segment_file in evaluation_segments:
            segment_file_path = os.path.join(subject_path, segment_file)
            with open(segment_file_path, 'r') as file:
                segment_data = np.loadtxt(file, delimiter=',')
                mean_data = np.mean(segment_data, axis=0)  # Taking mean along columns
                evaluation_data_means.append(mean_data)
                
                flattened_data = segment_data.flatten()  # Flattening the segment
                flattened_evaluation_data.append(flattened_data)

# Convert the data lists into numpy arrays
training_data_means = np.array(training_data_means)
evaluation_data_means = np.array(evaluation_data_means)
flattened_training_data = np.array(flattened_training_data)
flattened_evaluation_data = np.array(flattened_evaluation_data)
PCA_training_data = PCA(n_components=0.9).fit_transform(flattened_training_data)
PCA_evaluation_data = PCA(n_components=0.9).fit_transform(flattened_evaluation_data)


# Now, 'training_data_means' and 'evaluation_data_means' contain mean values of each column for each segment,
# and 'flattened_training_data' and 'flattened_evaluation_data' contain flattened data for each segment.
print("Shape of training data (means):", training_data_means.shape)
print("Shape of evaluation data (means):", evaluation_data_means.shape)
print("Shape of flattened training data:", flattened_training_data.shape)
print("Shape of flattened evaluation data:", flattened_evaluation_data.shape)
print("Shape of PCA training data:", PCA_training_data.shape)
print("Shape of PCA evaluation data:", PCA_evaluation_data.shape)
print("Training Labels:", training_labels)
print("Evaluation Labels:", evaluation_labels)


Activity Number: 11
Activity: a11
Activity Number: 13
Activity: a13
Activity Number: 18
Activity: a18
Activity Number: 3
Activity: a03
Activity Number: 5
Activity: a05
Activity Number: 4
Activity: a04
Activity Number: 19
Activity: a19
Activity Number: 6
Activity: a06
Activity Number: 7
Activity: a07
Activity Number: 8
Activity: a08
Activity Number: 2
Activity: a02
Activity Number: 1
Activity: a01
Activity Number: 12
Activity: a12
Activity Number: 15
Activity: a15
Activity Number: 17
Activity: a17
Activity Number: 14
Activity: a14
Activity Number: 16
Activity: a16
Activity Number: 10
Activity: a10
Activity Number: 9
Activity: a09
Shape of training data (means): (7296, 45)
Shape of evaluation data (means): (1824, 45)
Shape of flattened training data: (7296, 5625)
Shape of flattened evaluation data: (1824, 5625)
Shape of PCA training data: (7296, 346)
Shape of PCA evaluation data: (1824, 197)
Training Labels: [11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,

# Kmeans Clustering implementation

In [2]:
class Kmeans:
    def __init__(self, n_clusters, max_iter=100, tol=1e-4):
        self.centroids = None
        self.cluster_centers_ = None
        self.labels_ = None
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol
        self.inertia_ = None
        np.random.seed(42)

    def fit(self, data):
        # Initialize centroids randomly
        labels= None
        centroids_idx = np.random.choice(data.shape[0], self.n_clusters, replace=False)
        self.centroids = data[centroids_idx]

        for _ in range(self.max_iter):
            # Assignment step
            distances = np.linalg.norm(data[:, None] - self.centroids, axis=2)
            labels = np.argmin(distances, axis=1)

            # Update step
            new_centroids = np.array([data[labels == k].mean(axis=0) for k in range(self.n_clusters)])

            # Check convergence
            if np.linalg.norm(new_centroids - self.centroids) < self.tol:
                break

            self.centroids = new_centroids

        self.labels_ = labels
        self.cluster_centers_ = self.centroids

        # Calculate inertia
        self.inertia_ = self._calculate_inertia(data)

    def _calculate_inertia(self, data):
        inertia = 0
        for i in range(self.n_clusters):
            cluster_points = data[self.labels_ == i]
            distances = np.linalg.norm(cluster_points - self.centroids[i], axis=1)
            inertia += np.sum(distances**2)
        return inertia

    def predict(self, data):
        distances = np.linalg.norm(data[:, None] - self.centroids, axis=2)
        return np.argmin(distances, axis=1)

In [3]:
kmeans = Kmeans(n_clusters=6, max_iter=300, tol=1e-4)
kmeans.fit(training_data_means)
print(np.unique(kmeans.labels_, return_counts=True))
print("Clusters:", kmeans.labels_)
print("Centroids:", kmeans.cluster_centers_)
print("Inertia:", kmeans.inertia_)


(array([0, 1, 2, 3, 4, 5]), array([1736, 1844, 1242,  925, 1149,  400]))
Clusters: [4 4 4 ... 0 0 0]
Centroids: [[ 9.02138345e+00 -7.77587877e-01  3.35276679e+00 -7.70494068e-03
   1.58928155e-02 -4.30034683e-03 -6.53435642e-01 -8.37805096e-03
  -2.32824228e-01  8.86818504e+00  3.59023264e+00  1.30463426e+00
   2.10022828e-03 -1.31914934e-02 -1.35263446e-03 -5.30675811e-01
  -2.42719076e-01 -7.93707195e-02  8.86302835e+00 -3.22681177e+00
   2.17929833e+00 -1.18398046e-02 -1.06217947e-02 -4.41276360e-03
  -5.97756960e-01  2.23894125e-01 -1.81253990e-01 -9.82260559e+00
  -2.20588799e-02 -1.06611746e+00  2.62458450e-02  4.67606452e-02
  -1.75063378e-03  5.77915213e-01 -4.51030764e-02  8.34710486e-04
  -9.83436408e+00 -3.64540703e-01 -9.49394798e-01 -9.79048095e-03
   4.32269751e-02 -3.10495224e-03  6.04861449e-01  7.89422467e-02
   1.55054232e-01]
 [ 6.22424394e+00 -3.18357733e-01  4.38145124e+00  1.15763401e-03
   1.79971156e-02 -1.52216001e-03 -5.26055898e-01  2.01266193e-02
  -2.856993

In [4]:
from sklearn import metrics
silhouette_score = metrics.silhouette_score(training_data_means, kmeans.labels_, metric='euclidean')
print("Silhouette Score:", silhouette_score)

Silhouette Score: 0.20184448256879942
