# K-Means
### Data
* https://video.ittensive.com/machine-learning/hacktherealty/exposition_train.basic.csv.gz

### Library Import


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

### Data Loading


If the data does not fit in memory, keep only the most significant features, reducing the number of attributes to 20–30.

In [None]:
# Download the gzipped training CSV from the course server into a DataFrame.
DATA_URL = 'https://video.ittensive.com/machine-learning/hacktherealty/exposition_train.basic.csv.gz'
train_data = pd.read_csv(DATA_URL)
# Preview the first rows (rendered as the cell output).
train_data.head()

Unnamed: 0,total_area,ceiling_height,rooms,living_area,price,day_mean,doy_108,price_locality_name_median,target
0,105.0,3.0,3,50.0,95000,2.456912,0,2.261905,1
1,40.0,3.0,1,19.200001,25000,3.028689,0,1.0,2
2,37.599998,2.64,0,19.0,26000,3.091993,0,0.619048,2
3,80.0,3.0,3,49.0,35000,3.10101,0,1.25,2
4,100.0,3.0,3,49.0,80000,2.495468,0,1.904762,3


### Sample Splitting


In [None]:
# Hold out 20% of the rows for evaluation. The split is seeded so that the
# purity and metric values are reproducible between runs, in line with the
# fixed random_state already passed to KMeans.
(train, test) = train_test_split(train_data, test_size=0.2, random_state=0)
# Re-wrap both parts as fresh DataFrame objects before new columns are
# assigned to them in later cells.
train = pd.DataFrame(train)
test = pd.DataFrame(test)

### K-Means


In [None]:
# Number of clusters to fit; the per-cluster tallies are sized from it too.
clusters_total = 100
# NOTE(review): `train` still contains the "target" column at this point, so
# the class label takes part in the clustering features — confirm this is
# intended before trusting the test metric.
model = KMeans(
    n_clusters=clusters_total,
    n_init=10,
    max_iter=100,
    random_state=0,
)
kmeans = model.fit(train)

Calculate the "purity" of the clustering: the share of the most frequent class within each cluster, averaged over all clusters.

In [None]:
# Label every training row with the index of its nearest centroid.
train_cluster_ids = kmeans.predict(train)
train["target_cluster"] = train_cluster_ids

In [None]:
# Tally the (cluster, class) pairs of the training sample, pick the majority
# class of every cluster and report the mean majority-class share ("purity").
# size() counts rows per pair directly instead of counting non-null values of
# one arbitrary column.
groups = train.groupby(["target_cluster", "target"]).size()
clusters_items = [0] * clusters_total    # rows per cluster
clusters_popular = [0] * clusters_total  # row count of the largest class per cluster
clusters_class = [0] * clusters_total    # label of that largest class per cluster
# Series.iteritems() was removed in pandas 2.0; items() yields the same
# ((cluster, class), count) pairs.
for (cluster, target_class), items in groups.items():
    print(((cluster, target_class), items))
    clusters_items[cluster] += items
    # Strict ">" keeps the first class encountered when two classes tie.
    if items > clusters_popular[cluster]:
        clusters_popular[cluster] = items
        clusters_class[cluster] = target_class
purity = 0
for popular_count, cluster_size in zip(clusters_popular, clusters_items):
    # A cluster without any training rows contributes nothing instead of
    # raising ZeroDivisionError.
    if cluster_size:
        purity += popular_count / cluster_size / clusters_total
print("Purity: ", purity)

((0, 1), 178)
((0, 2), 230)
((0, 3), 647)
((0, 4), 516)
((0, 5), 1016)
((1, 5), 1)
((2, 1), 3201)
((2, 2), 3173)
((2, 3), 2441)
((2, 4), 1391)
((2, 5), 1618)
((3, 5), 2)
((4, 1), 5)
((4, 2), 14)
((4, 3), 12)
((4, 4), 30)
((4, 5), 138)
((5, 1), 7)
((5, 2), 14)
((5, 3), 43)
((5, 4), 60)
((5, 5), 233)
((6, 2), 1)
((7, 1), 1)
((7, 3), 3)
((7, 4), 5)
((7, 5), 14)
((8, 1), 1041)
((8, 2), 1461)
((8, 3), 1615)
((8, 4), 1171)
((8, 5), 1284)
((9, 1), 2)
((10, 2), 1)
((10, 3), 1)
((10, 4), 1)
((10, 5), 5)
((11, 1), 59)
((11, 2), 57)
((11, 3), 127)
((11, 4), 175)
((11, 5), 478)
((12, 2), 8)
((12, 3), 11)
((12, 4), 12)
((12, 5), 57)
((13, 1), 4028)
((13, 2), 5178)
((13, 3), 4726)
((13, 4), 3162)
((13, 5), 5059)
((14, 1), 3)
((14, 2), 2)
((14, 3), 4)
((14, 4), 2)
((14, 5), 5)
((15, 1), 4)
((15, 2), 4)
((15, 3), 15)
((15, 4), 15)
((15, 5), 77)
((16, 1), 1)
((17, 1), 1)
((17, 2), 1)
((18, 1), 229)
((18, 2), 229)
((18, 3), 570)
((18, 4), 444)
((18, 5), 580)
((19, 1), 4121)
((19, 2), 4536)
((19, 3), 379

### Clustering Verification


Use the most popular class of a cluster for the classification of the test sample





In [None]:
# Assign each held-out row to its nearest centroid of the fitted model.
test_cluster_ids = kmeans.predict(test)
test["target_cluster"] = test_cluster_ids

In [None]:
# Predict, for every test row, the majority class recorded for its cluster.
assigned_clusters = test["target_cluster"]
test["target_pred"] = assigned_clusters.map(lambda cluster_id: clusters_class[cluster_id])

$$metric = \frac{1}{l} \sum_{i=1}^{l} e^{|prediction_i - target_i|}$$

In [None]:
# Mean of exp(|prediction - target|) over the test rows.
abs_error = np.abs(test["target_pred"] - test["target"])
score = np.exp(abs_error).sum() / len(test)
print("100-means: ", score)

In [None]:
# The same metric for a constant prediction of class 3.
baseline_error = np.abs(3 - test["target"])
baseline_score = np.exp(baseline_error).sum() / len(test)
print("Benchmark: ", baseline_score)