# Classification through clustering
1. Find characteristic clusters in the data with a self-organizing map (SOM).
2. Use the most frequent class within each cluster as the predicted class for every row of that cluster.
3. Evaluate the quality of the resulting predictions.

### Data
* https://video.ittensive.com/machine-learning/hacktherealty/exposition_train.basic.csv.gz

### Importing libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn_som.som import SOM
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler

### Loading Data

In [None]:
# Read the training set directly from the course server and preview the first rows.
DATA_URL = 'https://video.ittensive.com/machine-learning/hacktherealty/exposition_train.basic.csv.gz'
train_data = pd.read_csv(DATA_URL)
train_data.head()

Unnamed: 0,total_area,ceiling_height,rooms,living_area,price,day_mean,doy_108,price_locality_name_median,target
0,105.0,3.0,3,50.0,95000,2.456912,0,2.261905,1
1,40.0,3.0,1,19.200001,25000,3.028689,0,1.0,2
2,37.599998,2.64,0,19.0,26000,3.091993,0,0.619048,2
3,80.0,3.0,3,49.0,35000,3.10101,0,1.25,2
4,100.0,3.0,3,49.0,80000,2.495468,0,1.904762,3


### Data normalization
Scale every feature to the interval [0, 1]; the target column is excluded from the scaled data.

In [None]:
# Every column except the last one (the target) is treated as a feature.
feature_columns = train_data.columns[:-1]
scaled_features = MinMaxScaler().fit_transform(train_data[feature_columns])
# Column names are not restored: the frame gets integer column labels 0..k-1.
train_data_mm = pd.DataFrame(scaled_features)

In [None]:
# Preview the first five scaled rows.
train_data_mm.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.097782,0.333333,0.6,0.108352,0.002078,0.390327,0.0,0.002071
1,0.032258,0.333333,0.2,0.038826,0.000522,0.599951,0.0,0.000893
2,0.029839,0.213333,0.0,0.038375,0.000544,0.62316,0.0,0.000538
3,0.072581,0.333333,0.6,0.106095,0.000744,0.626465,0.0,0.001127
4,0.092742,0.333333,0.6,0.106095,0.001745,0.404462,0.0,0.001738


### Self Organizing Maps
Fit the SOM on the whole dataset and obtain a cluster label for every row.

In [None]:
# Fixed seed so that Restart & Run All reproduces the same map.
SEED = 42
np.random.seed(SEED)
# sklearn_som builds its own random generator from `random_state`, so the
# global NumPy seed alone does not make the weight initialisation repeatable.
# NOTE(review): training appears to stop after `max_iter` update steps in total,
# so with max_iter=1000 and shuffle=False only the first rows would be used and
# `epochs=100` would never be reached — confirm against the installed
# sklearn_som version and raise max_iter if full-data training is intended.
som = SOM(m=40, n=40, dim=len(train_data_mm.columns), max_iter=1000, random_state=SEED)
som_labels = som.fit_predict(train_data_mm.to_numpy(), epochs=100, shuffle=False)

Attach the cluster label to every row and add the target column back.

In [None]:
# Add the SOM cluster index of each row and the true class that was left out
# before scaling; `target` is aligned with the scaled frame on the row index.
train_data_mm = train_data_mm.assign(
    label=som_labels,
    target=train_data["target"],
)

### Class Prediction
For each cluster, select its most frequent class and use it as the prediction for every row in that cluster.

In [None]:
# Number of rows for every (cluster label, target class) pair; column 0 is
# used only as a row counter.
groups = train_data_mm.groupby(["label", "target"]).count()[0]

# The lookup tables are indexed by cluster label, so they need one slot per
# possible label rather than one per data row.
n_clusters = (int(som_labels.max()) + 1) if len(som_labels) else 0
clusters_popular = [0] * n_clusters  # largest class count seen so far per cluster
clusters_class = [0] * n_clusters    # class that produced that count

# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
# groupby sorts its keys, and the strict ">" keeps the first (smallest) class on ties.
for (cluster, target), items in groups.items():
    if items > clusters_popular[cluster]:
        clusters_popular[cluster] = items
        clusters_class[cluster] = target

# Every row is predicted as the majority class of its cluster.
train_data_mm["target_pred"] = train_data_mm["label"].map(lambda label: clusters_class[label])

In [None]:
# Inspect the first 30 rows with cluster label, true class and predicted class.
train_data_mm.head(n=30)

Unnamed: 0,0,1,2,3,4,5,6,7,label,target,target_pred
0,0.097782,0.333333,0.6,0.108352,0.002078,0.390327,0.0,0.002071,1281,1,5
1,0.032258,0.333333,0.2,0.038826,0.000522,0.599951,0.0,0.000893,1127,2,2
2,0.029839,0.213333,0.0,0.038375,0.000544,0.62316,0.0,0.000538,1370,2,2
3,0.072581,0.333333,0.6,0.106095,0.000744,0.626465,0.0,0.001127,1322,2,5
4,0.092742,0.333333,0.6,0.106095,0.001745,0.404462,0.0,0.001738,1281,3,5
5,0.029234,0.213333,0.2,0.029345,0.000522,0.528112,0.0,0.000516,1290,2,1
6,0.02621,0.266667,0.2,0.038826,0.0003,0.65497,0.0,0.00096,1288,3,3
7,0.052419,0.266667,0.4,0.063205,0.000611,0.546054,0.0,0.00119,1367,3,3
8,0.052419,0.333333,0.4,0.072235,0.001478,0.643747,0.0,0.001471,1205,2,5
9,0.02621,0.25,0.2,0.038826,0.000344,0.604935,0.0,0.00065,1288,2,3


### Prediction Quality Assessment

In [None]:
# Score: mean of exp(|predicted class - true class|) over all rows.
abs_error = (train_data_mm["target_pred"] - train_data_mm["target"]).abs()
som_score = np.exp(abs_error).sum() / len(train_data_mm)
print("SOM: ", som_score)

SOM:  7.467475430187634


In [None]:
# Baseline: predict class 3 for every row and score it with the same
# mean-of-exp(|error|) formula.
constant_error = np.abs(3 - train_data_mm["target"])
benchmark_score = np.exp(constant_error).sum() / len(train_data_mm)
print("Benchmark: ", benchmark_score)

In [None]:
# Baseline: use the rounded `day_mean` feature as the predicted class and
# score it with the same mean-of-exp(|error|) formula.
day_pred = train_data["day_mean"].round()
day_error = np.abs(day_pred - train_data["target"])
print("Benchmark by day: ", np.exp(day_error).sum() / len(train_data))