**Kohonen-SOM**
- Oğuzhan Nejat Karabaş


# Gerekli Kütüphaneler

In [23]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import random
from scipy.spatial.distance import euclidean
from sklearn.metrics import accuracy_score

# Veri setimizi yükleme ve ön işleme işlemleri
- Veri setini "dataset.xlsx" dosyasından okuyacağız.
- Min-max normalizasyonu uygulayarak 0-255 arasında değerler alan 784 özniteliğin hepsini 0-1 aralığına dönüştüreceğiz.

In [24]:
# Read "dataset.xlsx" into a DataFrame for a first look at the raw data.
# header=None tells pandas not to take column names from the first
# spreadsheet row, so that row is loaded as an ordinary data row.
data = pd.read_excel("dataset.xlsx", header=None)
# Bare expression: as the last statement of a notebook cell it displays the frame.
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,774,775,776,777,778,779,780,781,782,783
0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
396,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
397,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
398,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
399,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**Normalizasyon**

In [25]:
# Re-read the spreadsheet, this time with pandas' default header handling so
# the first row supplies the column names.
data = pd.read_excel("dataset.xlsx")
# Apply min-max normalisation to bring the values into the 0-1 range.
# NOTE: MinMaxScaler rescales every column with that column's own min and max
# (scikit-learn behaviour), not with a fixed 0-255 range.
scaler = MinMaxScaler(feature_range=(0, 1))
normalized_data = scaler.fit_transform(data)

# Wrap the normalised array in a DataFrame that keeps the original column names.
normalized_df = pd.DataFrame(normalized_data, columns=data.columns)

In [26]:
# Bare expression: displays the normalised DataFrame as the cell output.
normalized_df

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
397,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
398,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Kohonen SOM Modeli
- Bu aşamada elimizdeki 4 farklı küme için bir Kohonen SOM modeli oluşturacağız. Sonrasında bu ağı, belirli bir sayıda tur (epoch) boyunca eğiteceğiz.
- SOM modelimiz için parametrelerimizi belirleyelim:


1.   Öğrenme hızı = 0.5
2.   Sigma = 1.0
3.   Tur sayısı(epoch) = 100
4.   Küme sayısı = 4 (ağ 4x4'lük bir nöron ızgarasıdır; kümeler bu ızgaranın dört çeyreğine karşılık gelir)




  

In [27]:
# Hyper-parameters of the Kohonen SOM.
# grid_size is the side length of the square neuron grid (see `weights` below).
grid_size = 4
# Starting learning rate; the training loop multiplies it by 0.99 per epoch.
learning_rate = 0.5
# Starting neighbourhood width; decayed by the training loop in the same way.
sigma = 1.0
# Number of passes over the data set during training.
num_epochs = 100

# Random initial weights: one vector per grid cell, with one component per
# column of normalized_df. np.random.rand draws uniformly from [0, 1).
weights = np.random.rand(grid_size, grid_size, normalized_df.shape[1])

# Gaussian bell function
def gaussian_bell(x, mean, sigma):
    """Evaluate an unnormalised Gaussian bell curve.

    Computes ``exp(-(x - mean)**2 / (2 * sigma**2))``: the value is 1 where
    ``x == mean`` and falls towards 0 as ``x`` moves away from ``mean``.

    Args:
        x: Scalar or NumPy array at which the curve is evaluated.
        mean: Centre of the bell.
        sigma: Width of the bell; must be non-zero.

    Returns:
        The curve value(s) as produced by ``np.exp``.
    """
    offset = x - mean
    spread = 2 * (sigma ** 2)
    return np.exp(-(offset ** 2) / spread)

# Update the weight vectors
def update_weights(weights, bmu, data_point, learning_rate, sigma):
    """Pull every SOM weight vector towards ``data_point``, in place.

    Each grid cell moves by ``h * learning_rate * (data_point - weight)``,
    where ``h = exp(-d**2 / (2 * sigma**2))`` and ``d`` is the Euclidean
    distance on the grid between that cell and the best matching unit.

    The whole grid is updated with one vectorised NumPy expression. The grid
    distances are computed here directly, so the function does not depend on
    whichever ``euclidean`` helper happens to be bound at call time.

    Args:
        weights: Float array of shape (rows, cols, n_features); modified in
            place.
        bmu: ``(row, col)`` grid coordinates of the best matching unit.
        data_point: Sample of shape (n_features,) or (1, n_features).
        learning_rate: Step size of the update.
        sigma: Width of the Gaussian neighbourhood; must be non-zero.

    Returns:
        None. ``weights`` is updated in place.
    """
    sample = np.asarray(data_point, dtype=float)

    # Squared grid distance of every cell to the BMU, shape (rows, cols).
    cell_rows, cell_cols = np.indices(weights.shape[:2])
    squared_distance = (cell_rows - bmu[0]) ** 2 + (cell_cols - bmu[1]) ** 2

    # Gaussian neighbourhood strength per cell.
    influence = np.exp(-squared_distance / (2 * (sigma ** 2)))

    # The trailing axis lets the per-cell factor broadcast over the features.
    weights += (influence * learning_rate)[..., np.newaxis] * (sample - weights)

# Find the Best Matching Unit (BMU)
def find_bmu(data_point, weights):
    """Return the grid coordinates of the weight vector closest to a sample.

    Scans the grid in row-major order and keeps the first cell whose
    Euclidean distance to ``data_point`` is strictly smaller than every
    distance seen before it.

    Args:
        data_point: 1-D sample vector.
        weights: Array of shape (rows, cols, n_features).

    Returns:
        ``(row, col)`` of the best matching unit, or ``None`` when no cell
        has a distance below infinity (for example an empty grid).
    """
    best_cell = None
    best_distance = float("inf")
    for cell in np.ndindex(*weights.shape[:2]):
        candidate = euclidean(data_point, weights[cell])
        if candidate < best_distance:
            best_cell, best_distance = cell, candidate
    return best_cell

**Modeli Eğitme**

- Modelimizi oluşturduktan sonra eğitmemiz gerekiyor.
- Her turda, veri setimizdeki her örnek için:


1.   Öklidyen uzaklığı kullanarak en yakın ağırlığı yani Best Matching Unit (BMU) buluyoruz.
2.   Gaussian Bell kullanarak h(x) topolojik komşu fonksiyonunu hesaplıyoruz.
3.   Ağırlıkları güncelliyoruz: BMU ve komşu ağırlıklarını, öğrenme hızı ve h(x) değerlerine göre güncelliyoruz.
4. Her turun sonunda öğrenme hızı ve sigma değerlerimizi 0.99 ile çarparak azaltıyoruz (decay).



In [28]:
# Convert the samples to a NumPy array once, outside the epoch loop:
# DataFrame.iterrows() would build a new Series for every row in every epoch.
# A "Cluster" column (added by the clustering step) is left out so that this
# cell can be re-run after clustering without feeding labels into the map.
training_samples = normalized_df.drop(columns="Cluster", errors="ignore").to_numpy()

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    # Present every sample to the map, in data-set order.
    for data_point in training_samples:
        # Locate the closest weight vector (best matching unit, BMU).
        bmu = find_bmu(data_point, weights)

        # Move the BMU and its grid neighbours towards the sample.
        update_weights(weights, bmu, data_point, learning_rate, sigma)

    # Decay the learning rate and the neighbourhood width after each epoch.
    # These are the module-level values, so re-running the cell continues
    # from the already decayed numbers.
    learning_rate *= 0.99
    sigma *= 0.99

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

# Kümeleme Sonuçları
- Eğitimimiz tamamlandıktan sonra, her örneği en yakın ağırlığa (BMU) göre kümeliyoruz.
- Kümeleme sonuçlarımızı "kume-sonuc.txt" dosyasına yazdırıyoruz.


In [29]:
# Gaussian bell function (re-declared in this cell)
def gaussian_bell(x, mean, sigma):
    """Return ``exp(-(x - mean)**2 / (2 * sigma**2))``.

    Args:
        x: Scalar or NumPy array at which the bell curve is evaluated.
        mean: Position of the peak, where the result is 1.
        sigma: Width of the curve; must be non-zero.

    Returns:
        The unnormalised Gaussian value(s) as produced by ``np.exp``.
    """
    deviation = x - mean
    denominator = 2 * (sigma ** 2)
    exponent = -(deviation ** 2) / denominator
    return np.exp(exponent)

# Update the weight vectors
def update_weights(weights, bmu, data_point, learning_rate, sigma):
    """Move all SOM weight vectors towards ``data_point``, in place.

    A cell at grid distance ``d`` from the best matching unit is shifted by
    ``h * learning_rate * (data_point - weight)`` with
    ``h = exp(-d**2 / (2 * sigma**2))``.

    Grid distances are computed directly with NumPy, so the function works
    with plain ``(row, col)`` tuples and does not depend on which
    ``euclidean`` helper is currently bound.

    Args:
        weights: Float array of shape (rows, cols, n_features); modified in
            place.
        bmu: ``(row, col)`` grid coordinates of the best matching unit.
        data_point: Sample of shape (n_features,) or (1, n_features).
        learning_rate: Step size of the update.
        sigma: Width of the Gaussian neighbourhood; must be non-zero.

    Returns:
        None. ``weights`` is updated in place.
    """
    sample = np.asarray(data_point, dtype=float)

    # Squared grid distance from each cell to the BMU, shape (rows, cols).
    row_idx, col_idx = np.indices(weights.shape[:2])
    squared_distance = (row_idx - bmu[0]) ** 2 + (col_idx - bmu[1]) ** 2

    # Gaussian neighbourhood factor for every cell.
    neighbourhood = np.exp(-squared_distance / (2 * (sigma ** 2)))

    # Add a trailing axis so the per-cell factor broadcasts over the features.
    weights += (neighbourhood * learning_rate)[..., np.newaxis] * (sample - weights)

# Euclidean distance
def euclidean(a, b):
    """Return the Euclidean (L2) distance between ``a`` and ``b``.

    This definition replaces the ``euclidean`` imported from
    ``scipy.spatial.distance`` at the top of the file. Inputs are converted
    to float arrays first, so tuples and lists (such as ``(row, col)`` grid
    coordinates) work as well as NumPy arrays.

    Args:
        a: Array-like of numbers.
        b: Array-like of numbers, broadcast-compatible with ``a``.

    Returns:
        The norm of ``a - b``. For 2-D differences ``np.linalg.norm`` gives
        the Frobenius norm, i.e. the Euclidean length of the flattened
        difference.
    """
    difference = np.asarray(a, dtype=float) - np.asarray(b, dtype=float)
    return np.linalg.norm(difference)

# Find the Best Matching Unit (BMU)
def find_bmu(data_point, weights):
    """Return the grid coordinates of the weight vector nearest to a sample.

    Every weight vector is reshaped to a single row of shape
    (1, n_features) before its distance to ``data_point`` is measured with
    ``euclidean``. Cells are visited in row-major order and only a strictly
    smaller distance replaces the current best, so ties go to the earliest
    cell.

    Args:
        data_point: Sample to match against the grid.
        weights: Array of shape (rows, cols, n_features).

    Returns:
        ``(row, col)`` of the best matching unit, or ``None`` when no cell
        has a distance below infinity (for example an empty grid).
    """
    best_cell = None
    best_distance = float('inf')

    for i, grid_row in enumerate(weights):
        for j, cell_weights in enumerate(grid_row):
            candidate = euclidean(data_point, cell_weights.reshape(1, -1))
            if candidate < best_distance:
                best_cell, best_distance = (i, j), candidate

    return best_cell

# Cluster assignment
# Use the feature columns only: the "Cluster" column added at the end of this
# cell is dropped first, so the cell can be re-run without feeding the labels
# back in as features. The rows are converted to NumPy once instead of going
# through iterrows().
feature_rows = normalized_df.drop(columns="Cluster", errors="ignore").to_numpy()

# The grid size does not change per sample, so it is read once up front.
grid_size = weights.shape[0]
half_grid = grid_size / 2

# Cluster name by quadrant of the grid, indexed as
# [BMU row in second half?][BMU column in second half?].
quadrant_names = (("C1", "C2"), ("C3", "C4"))

clusters = []
for feature_row in feature_rows:
    # find_bmu is called with a (1, n_features) sample, as before.
    data_point = feature_row.reshape(1, -1)
    bmu = find_bmu(data_point, weights)

    # Map the BMU's grid position to the quadrant it lies in.
    cluster = quadrant_names[int(bmu[0] >= half_grid)][int(bmu[1] >= half_grid)]
    clusters.append(cluster)

# Attach the cluster names to the DataFrame.
normalized_df["Cluster"] = clusters

# Write "<row index>,<cluster>" lines to the result file.
# NOTE(review): the index written here is the DataFrame's 0-based row index;
# confirm it matches the "instance (record_no)" numbering that the file is
# later merged on.
normalized_df["Cluster"].to_csv("kume-sonuc.txt", index=True, header=False)

# Başarı Oranlarını Hesaplama
- "index.xlsx" dosyasındaki orijinal etiketlerimizle "kume-sonuc.txt" dosyasındaki atanan kümelerimizi karşılaştırarak, her bir küme (ve o kümenin temsil ettiği etiket) için kümeleme başarı oranını hesaplıyoruz.

In [30]:
# Load the original labels from index.xlsx.
index_df = pd.read_excel('index.xlsx')

# Load the cluster assignments from kume-sonuc.txt (no header row in the file).
cluster_results = pd.read_csv('kume-sonuc.txt', header=None, names=["instance (record_no)", "Cluster"])

# Join labels and cluster assignments on the record number.
# NOTE(review): pd.merge defaults to an inner join, so records without a match
# are dropped silently, and the numbers in kume-sonuc.txt come from a 0-based
# DataFrame index. Confirm that index.xlsx uses the same numbering.
merged_df = pd.merge(index_df, cluster_results, left_on='instance (record_no)', right_on='instance (record_no)')

# Decide which label each cluster represents: its most frequent label.
# A cluster with no members has no mode and is left out of the mapping
# (indexing the empty mode() result would raise).
cluster_mapping = {}
for cluster_name in ("C1", "C2", "C3", "C4"):
    member_labels = merged_df.loc[merged_df["Cluster"] == cluster_name, "label"]
    dominant_labels = member_labels.mode()
    if not dominant_labels.empty:
        cluster_mapping[cluster_name] = dominant_labels.iloc[0]

# Translate every cluster assignment into its representative label.
predicted_labels = merged_df["Cluster"].map(cluster_mapping)

# Per-cluster success rate: the share of a cluster's members whose original
# label equals the cluster's most frequent label.
accuracy_scores = {}
for cluster in cluster_mapping:
    in_cluster = merged_df["Cluster"] == cluster
    accuracy_scores[cluster] = accuracy_score(
        merged_df.loc[in_cluster, "label"], predicted_labels[in_cluster]
    )

# Print the success rate of every non-empty cluster.
for cluster, accuracy in accuracy_scores.items():
    print(f"Küme {cluster} için başarı oranı: {accuracy}")

Küme C1 için başarı oranı: 0.3177570093457944
Küme C2 için başarı oranı: 0.2987012987012987
Küme C3 için başarı oranı: 0.2781954887218045
Küme C4 için başarı oranı: 0.2926829268292683
