# Spectral clustering
### Data
* https://video.ittensive.com/machine-learning/hacktherealty/exposition_train.basic.csv.gz

### Importing libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import SpectralClustering
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler

### Loading data

In [None]:
# Source CSV is gzip-compressed; pandas infers the compression from the ".gz" suffix.
DATA_URL = 'https://video.ittensive.com/machine-learning/hacktherealty/exposition_train.basic.csv.gz'
train_data = pd.read_csv(DATA_URL)
# Preview the first rows to check that the columns were parsed as expected.
train_data.head()

Unnamed: 0,total_area,ceiling_height,rooms,living_area,price,day_mean,doy_108,price_locality_name_median,target
0,105.0,3.0,3,50.0,95000,2.456912,0,2.261905,1
1,40.0,3.0,1,19.200001,25000,3.028689,0,1.0,2
2,37.599998,2.64,0,19.0,26000,3.091993,0,0.619048,2
3,80.0,3.0,3,49.0,35000,3.10101,0,1.25,2
4,100.0,3.0,3,49.0,80000,2.495468,0,1.904762,3


### Data normalization
Rescale every column to the interval [0, 1] using min-max scaling.

In [None]:
# Min-max scale every column (the `target` column included) to [0, 1].
# fit_transform returns a plain array, so the resulting frame carries
# positional integer column labels (0..n-1) instead of the original names.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(train_data)
train_data_mm = pd.DataFrame(scaled_values)

In [None]:
# Preview the first five scaled rows; every value should lie in [0, 1].
train_data_mm.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.097782,0.333333,0.6,0.108352,0.002078,0.390327,0.0,0.002071,0.0
1,0.032258,0.333333,0.2,0.038826,0.000522,0.599951,0.0,0.000893,0.25
2,0.029839,0.213333,0.0,0.038375,0.000544,0.62316,0.0,0.000538,0.25
3,0.072581,0.333333,0.6,0.106095,0.000744,0.626465,0.0,0.001127,0.25
4,0.092742,0.333333,0.6,0.106095,0.001745,0.404462,0.0,0.001738,0.5


### Spectral clustering
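Spectral clustering looks for eigenvectors (directions) of the Kirchhoff matrix (the graph Laplacian) of the data proximity graph. It is memory-demanding, so only the first 10,000 rows are clustered below.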

In [None]:
# Cluster only the first 10,000 normalized rows: spectral clustering is
# memory-hungry, so the full data set is not used here.
sample_mm = train_data_mm.iloc[:10000]
sp = SpectralClustering(n_clusters=100, random_state=42)  # fixed seed for reproducibility
sp.fit(sample_mm)

In [None]:
# Davies-Bouldin index: lower is better, with 0 as the minimum. It is reported
# here as 1 - DB so that higher means better; the value goes negative whenever
# DB exceeds 1.
# Score exactly the rows that were clustered: `labels_` holds one label per
# fitted row, so slicing by its length keeps this cell in sync with the fit
# cell instead of repeating the sample size as a second hard-coded number.
clustered_mm = train_data_mm.iloc[:len(sp.labels_)]
print(1 - metrics.davies_bouldin_score(clustered_mm, sp.labels_))

-0.22534629168323583


In [None]:
# List the distinct cluster ids that were actually assigned, to check how many
# of the requested clusters are non-empty.
distinct_labels = np.unique(sp.labels_)
print(distinct_labels)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
 96 97 98 99]


In [None]:
# Show the original (un-normalized) rows that fell into the same cluster as the
# first row. Label positions line up with row positions in `train_data`
# because the model was fitted on the leading rows of the data set.
first_row_cluster = sp.labels_[0]
same_cluster_positions = np.flatnonzero(sp.labels_ == first_row_cluster)
train_data.iloc[same_cluster_positions]

Unnamed: 0,total_area,ceiling_height,rooms,living_area,price,day_mean,doy_108,price_locality_name_median,target
0,105.0,3.00,3,50.0,95000,2.456912,0,2.261905,1
129,78.0,3.00,3,51.0,75000,2.830472,0,1.785714,1
178,154.0,3.00,4,78.0,200000,2.495298,0,4.761905,1
2240,92.0,2.80,3,53.0,45000,2.827214,0,1.500000,1
2450,98.0,2.78,4,66.0,80000,3.170843,0,1.904762,1
...,...,...,...,...,...,...,...,...,...
9173,120.0,3.00,3,80.0,130000,2.980952,0,3.095238,1
9190,120.0,3.00,3,90.0,70000,2.980952,0,1.666667,1
9192,80.0,3.20,3,50.0,100000,2.980952,0,2.380952,1
9636,164.0,3.00,3,90.0,19751,3.017632,0,0.470262,1
