# Aufgabe 1

In [1]:
import numpy as np

# Data and probabilities.
# Each entry pairs a toss sequence over the symbols "H" and "T" with a probability
# value; in every row listed here that value equals the fraction of "H" tosses in
# the sequence (e.g. 4 of 10 tosses -> 0.4).
data = [
    ("HTTTHTTHTH", 0.4),
    ("HHHHTHHHHH", 0.9),
    ("HTHHHHHTHH", 0.8),
    ("HTTTTTHHTT", 0.3),
    ("THHHTHHHTH", 0.7),
]
# Per-coin probability of an "H" toss, as consumed by compute_hidden_matrix.
theta_A = 0.4
theta_B = 0.8

def _sequence_log_likelihood(heads, tails, theta):
    """Return log P(sequence | theta) for a sequence with the given toss counts.

    Terms whose count is zero are skipped so that ``0 * log(0)`` can never turn
    into NaN; a sequence that is impossible under ``theta`` yields ``-inf``.
    """
    log_likelihood = 0.0
    if heads:
        log_likelihood += (heads * np.log(theta)) if theta > 0 else -np.inf
    if tails:
        log_likelihood += (tails * np.log(1 - theta)) if theta < 1 else -np.inf
    return log_likelihood


def compute_hidden_matrix(data, theta_A, theta_B):
    """Compute the E-step responsibilities of two coins for each toss sequence.

    The likelihoods are evaluated in log space and normalised relative to the
    larger of the two, so long sequences no longer underflow to ``0 / 0``.

    Args:
        data: Iterable of ``(sequence, value)`` pairs. ``sequence`` is an
            iterable of tosses; every toss equal to ``"H"`` counts as heads and
            every other toss as tails. The second element is ignored.
        theta_A: Probability of heads for coin A, in ``[0, 1]``.
        theta_B: Probability of heads for coin B, in ``[0, 1]``.

    Returns:
        ``numpy.ndarray`` of shape ``(len(data), 2)``. Row ``i`` holds the
        normalised weights ``(coin A, coin B)`` for sequence ``i``; each row
        sums to 1. Empty ``data`` yields an array of shape ``(0, 2)``.

    Raises:
        ValueError: If ``theta_A`` or ``theta_B`` lies outside ``[0, 1]``.
        ZeroDivisionError: If a sequence has zero likelihood under both coins,
            so its weights cannot be normalised.
    """
    for name, theta in (("theta_A", theta_A), ("theta_B", theta_B)):
        if not 0 <= theta <= 1:
            raise ValueError(f"{name} must lie in [0, 1], got {theta!r}")

    rows = []
    for sequence, _ in data:
        tosses = list(sequence)
        heads = sum(1 for toss in tosses if toss == "H")
        tails = len(tosses) - heads

        log_a = _sequence_log_likelihood(heads, tails, theta_A)
        log_b = _sequence_log_likelihood(heads, tails, theta_B)

        # Shift by the larger log-likelihood before exponentiating: the ratio of
        # the two weights is unchanged, but at least one of them is exactly 1.
        peak = max(log_a, log_b)
        if peak == -np.inf:
            raise ZeroDivisionError(
                "sequence has zero likelihood under both coins; cannot normalise"
            )
        weight_a = np.exp(log_a - peak)
        weight_b = np.exp(log_b - peak)
        total = weight_a + weight_b
        rows.append((weight_a / total, weight_b / total))

    # reshape keeps the two-column layout even when there are no sequences.
    return np.array(rows, dtype=float).reshape(-1, 2)

# Normalised per-sequence weights for coin A and coin B, shown under a heading.
hidden_matrix = compute_hidden_matrix(data, theta_A, theta_B)
print("HiddenMatrix:", hidden_matrix, sep="\n")


HiddenMatrix:
[[0.97852349 0.02147651]
 [0.00582524 0.99417476]
 [0.03396226 0.96603774]
 [0.99635535 0.00364465]
 [0.17419355 0.82580645]]


# Aufgabe 2

Mit Hilfe welches Kriteriums können Sie die Iteration beim
Expectation-Maximization-Algorithmus abbrechen?

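Wenn die Log-Likelihood konvergiert ist, d. h. wenn sie sich (bzw. die geschätzten Parameter) zwischen zwei aufeinanderfolgenden Iterationen um weniger als eine vorgegebene Toleranz ändert.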

Oder wenn die maximale Anzahl an Iterationen erreicht ist, falls vorher keine Konvergenz eintritt.


# Aufgabe 3

In [2]:
import random

# E-step: compute the hidden (responsibility) matrix
def compute_hidden_matrix_soft(data, centers, beta=1):
    """Compute soft cluster responsibilities for every data point.

    Each point receives the weight ``exp(-beta * distance)`` towards each
    center, where ``distance`` is the Euclidean norm of the difference, and the
    weights of a point are normalised to sum to 1. The exponent is shifted by
    its per-point maximum first, so that points far away from every center no
    longer underflow to an all-zero row (which previously gave ``0 / 0 = NaN``).

    Args:
        data: Array-like of points, one point per entry along the first axis.
        centers: Array-like of cluster centers with the same per-point layout.
        beta: Stiffness parameter scaling the distances in the exponent.

    Returns:
        ``numpy.ndarray`` of shape ``(n_points, n_centers)``; entry ``[i, j]``
        is the responsibility of center ``j`` for point ``i``.
    """
    points = np.asarray(data, dtype=float)
    centers = np.asarray(centers, dtype=float)
    n_points, n_centers = points.shape[0], centers.shape[0]
    if n_points == 0 or n_centers == 0:
        return np.empty((n_points, n_centers))

    # Flatten every point/center to a vector so the norm over the last axis
    # equals the norm of the whole difference, whatever the original layout.
    points = points.reshape(n_points, -1)
    centers = centers.reshape(n_centers, -1)

    # Pairwise distances via broadcasting: (n_points, 1, d) - (1, n_centers, d).
    distances = np.linalg.norm(points[:, None, :] - centers[None, :, :], axis=2)

    # Subtracting the row maximum leaves the normalised weights unchanged but
    # guarantees the largest unnormalised weight in each row is exactly 1.
    exponents = -beta * distances
    exponents -= exponents.max(axis=1, keepdims=True)
    weights = np.exp(exponents)
    return weights / weights.sum(axis=1, keepdims=True)

# M-step: update the centers
def update_centers(data, hidden_matrix):
    """Recompute every cluster center as a responsibility-weighted mean.

    Args:
        data: Array of points; the transpose is broadcast against one column of
            ``hidden_matrix``, so one point per row is expected.
        hidden_matrix: Array whose column ``j`` holds the weight of every point
            for cluster ``j``.

    Returns:
        ``numpy.ndarray`` with one row per column of ``hidden_matrix``; row
        ``j`` is the weighted sum of the points divided by the total weight of
        cluster ``j``.
    """
    n_clusters = hidden_matrix.shape[1]
    features_by_point = data.T
    return np.array([
        np.sum(features_by_point * hidden_matrix[:, j], axis=1)
        / np.sum(hidden_matrix[:, j])
        for j in range(n_clusters)
    ])

# Soft k-means algorithm
def soft_k_means(data, k, beta=1, max_iters=100, tol=1e-4, seed=None):
    """Cluster ``data`` into ``k`` soft clusters by alternating E- and M-steps.

    The initial centers are ``k`` distinct data points chosen at random. Each
    iteration computes the responsibilities for the current centers and then
    moves the centers; the loop stops once the centers move by less than
    ``tol`` (norm of the difference of the center arrays) or after
    ``max_iters`` iterations. The returned responsibilities are always
    recomputed for the returned centers, so the two results are consistent
    even when the iteration budget runs out or ``max_iters`` is 0.

    Args:
        data: Array-like of points, one point per row.
        k: Number of clusters; must satisfy ``1 <= k <= len(data)``.
        beta: Stiffness parameter forwarded to ``compute_hidden_matrix_soft``.
        max_iters: Maximum number of E/M iterations.
        tol: Convergence threshold on the movement of the centers.
        seed: Optional seed for the choice of the initial centers. With the
            default ``None`` the global ``random`` module state is used.

    Returns:
        Tuple ``(hidden_matrix, centers)``: the responsibilities for the final
        centers, and the final centers themselves.

    Raises:
        ValueError: If ``k`` is not between 1 and the number of data points.
    """
    data = np.asarray(data, dtype=float)
    n_points = len(data)
    if not 1 <= k <= n_points:
        raise ValueError(f"k must be between 1 and {n_points}, got {k!r}")

    # Initialise the centers with k distinct, randomly chosen data points.
    rng = random if seed is None else random.Random(seed)
    centers = data[rng.sample(range(n_points), k)]

    for _ in range(max_iters):
        hidden_matrix = compute_hidden_matrix_soft(data, centers, beta)
        new_centers = update_centers(data, hidden_matrix)

        # Measure the movement before adopting the new centers, then keep the
        # most recent estimate regardless of whether we stop here.
        shift = np.linalg.norm(new_centers - centers)
        centers = new_centers
        if shift < tol:
            break

    # Final E-step so the responsibilities match the centers being returned.
    hidden_matrix = compute_hidden_matrix_soft(data, centers, beta)
    return hidden_matrix, centers

# Main program
# Fixed seed so that a fresh "Restart & Run All" reproduces the same clustering.
SEED = 42
random.seed(SEED)     # soft_k_means draws its initial centers via the `random` module
np.random.seed(SEED)  # the example data come from NumPy's global generator
data = np.random.rand(231, 10)  # example data (231 points with 10 dimensions)
hidden_matrix, final_centers = soft_k_means(data, k=6, beta=1)

# Save the hidden matrix
np.savetxt("HM.txt", hidden_matrix)


# Aufgabe 4

In [3]:
# Number of genes per cluster
def genes_per_cluster(hidden_matrix):
    """Count how many rows of ``hidden_matrix`` fall into each cluster.

    Every row is assigned to the column holding its largest value. Clusters
    that receive no row do not appear in the result.

    Args:
        hidden_matrix: 2-D array with one row per item and one column per cluster.

    Returns:
        Dict mapping cluster index to the number of rows assigned to it, in
        ascending order of cluster index.
    """
    hard_labels = hidden_matrix.argmax(axis=1)
    cluster_ids, member_counts = np.unique(hard_labels, return_counts=True)
    return {cluster_id: count for cluster_id, count in zip(cluster_ids, member_counts)}

# Report how many members each cluster receives when every row of the hidden
# matrix is assigned to its highest-weight cluster.
cluster_sizes = genes_per_cluster(hidden_matrix)
for cluster, size in cluster_sizes.items():
    print(f"Cluster {cluster}: {size} members")


Cluster 0: 17 members
Cluster 1: 54 members
Cluster 2: 45 members
Cluster 3: 67 members
Cluster 4: 28 members
Cluster 5: 20 members
