In [1]:
import numpy as np
import pandas as pd
# Load the galaxy catalogue from CSV and immediately discard the `z_gal`
# column, so that it is not available to any later cell.
df = (
    pd.read_csv("z_photo_data.csv")
    .drop(columns=["z_gal"])
)

In [2]:
# Count the distinct ground-truth group ids (`iGrID`); the displayed value is
# the number of clusters the algorithms below are asked to recover.
df["iGrID"].nunique()

479

# Наша цель — кластеризовать галактики, используя только фотометрические данные: `optic_color` (a.k.a. `|dered_g - dered_r|`), `ra` и `dec`. В качестве ground truth будет выступать `iGrID`. Нам нужно получить 479 кластеров.

In [3]:
# Feature matrix handed to every clustering algorithm below: only the three
# photometric columns named in the goal statement, without the `iGrID` labels.
train = df.loc[:, ["ra", "dec", "optic_color"]]

# Попробуем перебрать несколько алгоритмов кластеризации. Для оценки будем использовать V-measure и Fowlkes-Mallows score.
## Оптимальные параметры алгоритмов были подобраны заранее

In [4]:
from sklearn import cluster
from sklearn.metrics import homogeneity_completeness_v_measure, fowlkes_mallows_score

def eval(name, pred, truth=None):
    """Print clustering quality metrics for one set of predicted labels.

    NOTE: this function intentionally keeps its original name, which shadows
    Python's built-in ``eval`` for the rest of the notebook; the name is kept
    because later cells call it.

    Parameters
    ----------
    name : str
        Label printed on the first line to identify the algorithm.
    pred : array-like
        Predicted cluster label for every sample.
    truth : array-like, optional
        Ground-truth label for every sample, aligned with ``pred``.
        When omitted, the notebook-level ``df.iGrID`` column is used,
        which is the original behaviour.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    if truth is None:
        # Default ground truth: the group ids of the loaded catalogue.
        truth = df.iGrID
    print(name)
    # The metric returns (homogeneity, completeness, v_measure) in that order,
    # which feeds the three %-placeholders directly.
    print("Homogeneity: %.3f \nCompleteness: %.3f \nV-measure: %.3f" % tuple(homogeneity_completeness_v_measure(truth, pred)))
    print("Fowlkes-Mallows score: %.3f" % fowlkes_mallows_score(truth, pred))

In [5]:
from sklearn import cluster
from sklearn.metrics import homogeneity_completeness_v_measure, fowlkes_mallows_score

# K-Means with one cluster per ground-truth group (479 distinct iGrID values).
# K-Means starts from randomly chosen centroids, so the seed is pinned here;
# without it the printed scores change from one kernel run to the next.
k_means = cluster.KMeans(n_clusters=479, random_state=0)
pred = k_means.fit_predict(train)
eval("K-Means", pred)

KMeans
Homogeneity: 0.975 
Completeness: 0.958 
V-measure: 0.966
Fowlkes-Mallows score: 0.659


In [13]:
# Mean Shift: the kernel bandwidth is estimated from the data itself
# (quantile=0.001) and then passed to the estimator; the cluster count is
# not specified up front.
bandwidth = cluster.estimate_bandwidth(train, quantile=0.001)
ms = cluster.MeanShift(bin_seeding=True, bandwidth=bandwidth)
# Fit, then read the per-sample labels stored on the fitted estimator.
pred = ms.fit(train).labels_
eval("Mean Shift", pred)

Mean Shift
Homogeneity: 0.982 
Completeness: 0.951 
V-measure: 0.966
Fowlkes-Mallows score: 0.705


In [19]:
# DBSCAN with eps=0.4; every other setting is left at its default and the
# number of clusters is not specified up front.
# NOTE(review): DBSCAN labels noise samples as -1, and eval() scores that
# label like any other cluster -- confirm this is the intended treatment.
dbscan = cluster.DBSCAN(eps=0.4)
# Fit, then read the per-sample labels stored on the fitted estimator.
pred = dbscan.fit(train).labels_
eval("DBSCAN", pred)

DBSCAN
Homogeneity: 0.958 
Completeness: 0.989 
V-measure: 0.973
Fowlkes-Mallows score: 0.814


In [20]:
# BIRCH, asked for 479 final clusters (one per distinct ground-truth group).
birch = cluster.Birch(n_clusters=479)
# Fit, then read the per-sample labels stored on the fitted estimator.
pred = birch.fit(train).labels_
eval("BIRCH", pred)

BIRCH
Homogeneity: 0.965 
Completeness: 0.962 
V-measure: 0.963
Fowlkes-Mallows score: 0.667
