### Data
* https://video.ittensive.com/machine-learning/hacktherealty/exposition_train.basic.csv.gz

### Importing Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler

### Loading data

In [None]:
# Read the training CSV (a .csv.gz file) straight from its URL,
# then preview the first rows as the cell output.
DATA_URL = 'https://video.ittensive.com/machine-learning/hacktherealty/exposition_train.basic.csv.gz'
train_data = pd.read_csv(DATA_URL)
train_data.head()

Unnamed: 0,total_area,ceiling_height,rooms,living_area,price,day_mean,doy_108,price_locality_name_median,target
0,105.0,3.0,3,50.0,95000,2.456912,0,2.261905,1
1,40.0,3.0,1,19.200001,25000,3.028689,0,1.0,2
2,37.599998,2.64,0,19.0,26000,3.091993,0,0.619048,2
3,80.0,3.0,3,49.0,35000,3.10101,0,1.25,2
4,100.0,3.0,3,49.0,80000,2.495468,0,1.904762,3


### Data normalization
Normalize all values to the interval [0, 1].

In [None]:
# Scale every column to [0, 1]. fit_transform hands back a plain array here, so
# the original column labels and row index are passed back in explicitly;
# without them the normalized frame only has anonymous integer columns (0..8)
# and it is no longer possible to tell which feature is which.
# NOTE(review): `target` is scaled (and later clustered) together with the
# features — confirm this is intended.
train_data_mm = pd.DataFrame(
    MinMaxScaler().fit_transform(train_data),
    columns=train_data.columns,
    index=train_data.index,
)

In [None]:
# Preview the first five rows of the normalized frame to sanity-check the scaling.
train_data_mm.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.097782,0.333333,0.6,0.108352,0.002078,0.390327,0.0,0.002071,0.0
1,0.032258,0.333333,0.2,0.038826,0.000522,0.599951,0.0,0.000893,0.25
2,0.029839,0.213333,0.0,0.038375,0.000544,0.62316,0.0,0.000538,0.25
3,0.072581,0.333333,0.6,0.106095,0.000744,0.626465,0.0,0.001127,0.25
4,0.092742,0.333333,0.6,0.106095,0.001745,0.404462,0.0,0.001738,0.5


### Affinity Propagation
We set a high damping factor and a generous iteration limit because the algorithm is prone to oscillations. Affinity Propagation also requires a large amount of memory, so it is fitted on only the first 10,000 rows.

In [None]:
# Configure Affinity Propagation with strong damping and a high iteration cap
# (the algorithm is prone to oscillations); the seed is fixed for repeatable runs.
aff = AffinityPropagation(
    damping=0.9,
    max_iter=5000,
    convergence_iter=15,
    copy=False,
    random_state=42,
    verbose=True,
)
# Fit on the first 10,000 rows only — presumably to keep memory use manageable.
# The fit call stays last so the estimator repr is shown as the cell output.
aff.fit(train_data_mm.iloc[:10000])

Converged after 100 iterations.


AffinityPropagation(copy=False, damping=0.9, max_iter=5000, random_state=42,
                    verbose=True)

In [None]:
# Percentage of fitted samples whose label is -1, rounded to a whole percent.
# NOTE(review): -1 is presumably what scikit-learn assigns when the fit does
# not converge — confirm against the AffinityPropagation docs.
n_unlabeled = np.count_nonzero(aff.labels_ == -1)
n_fitted = len(train_data_mm[0:10000])
print("Unlabeled data:", f"{round(100 * n_unlabeled / n_fitted)}%")

Unlabeled data: 0%


In [None]:
# Davies-Bouldin score of the clustering on the same 10,000 rows that were fitted,
# reported as 1 - score.
# NOTE(review): the Davies-Bouldin score is not bounded above by 1, so this
# value can become negative — confirm that 1 - score is the intended metric.
db_index = metrics.davies_bouldin_score(train_data_mm[0:10000], aff.labels_)
print(1 - db_index)  # 0.19

0.18486049101218083


In [None]:
# List the distinct cluster labels assigned by the fitted model.
cluster_ids = np.unique(aff.labels_)
print(cluster_ids)

[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112]


In [None]:
# Show the original (unscaled) rows that were assigned to cluster 0.
# The labels are matched to rows by position, hence the positional iloc lookup.
cluster0_positions = np.flatnonzero(aff.labels_ == 0)
train_data.iloc[cluster0_positions]

Unnamed: 0,total_area,ceiling_height,rooms,living_area,price,day_mean,doy_108,price_locality_name_median,target
52,52.0,2.74,2,30.0,43000,2.742424,0,1.02381,3
120,50.0,2.7,2,32.0,24000,2.585903,0,0.8,3
153,52.0,2.64,2,30.0,45000,2.437738,0,1.071429,3
4046,44.0,2.7,2,28.0,25000,2.491228,0,1.086957,3
4101,66.0,2.7,2,30.0,26000,2.591772,0,1.130435,3
4268,60.0,2.64,2,40.0,30000,2.722222,0,1.153846,3
4402,44.0,2.48,2,30.0,25000,2.570048,0,0.595238,3
4653,60.0,2.74,2,35.0,30000,2.662921,0,0.714286,3
5581,60.0,2.64,2,32.0,47000,2.586441,0,1.119048,3
5750,45.0,2.64,2,33.0,45000,2.721495,0,1.071429,3
