# Gaussian Mixture Model
### Data
* https://video.ittensive.com/machine-learning/hacktherealty/exposition_train.basic.csv.gz

### Library Import


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.mixture import GaussianMixture

### Data Loading


In [None]:
# Load the training sample straight from the course server; pandas infers
# gzip compression from the ".gz" extension.
# The first CSV column is the saved row index (it appears as "Unnamed: 0"
# when read as data).  Use it as the index so that a meaningless running
# row number does not end up among the features passed to the model.
train_data = pd.read_csv(
    'https://video.ittensive.com/machine-learning/hacktherealty/exposition_train.basic.csv.gz',
    index_col=0,
)
train_data.head()

Unnamed: 0,total_area,ceiling_height,rooms,living_area,price,day_mean,doy_108,price_locality_name_median,target
0,105.0,3.0,3,50.0,95000,2.456912,0,2.261905,1
1,40.0,3.0,1,19.200001,25000,3.028689,0,1.0,2
2,37.599998,2.64,0,19.0,26000,3.091993,0,0.619048,2
3,80.0,3.0,3,49.0,35000,3.10101,0,1.25,2
4,100.0,3.0,3,49.0,80000,2.495468,0,1.904762,3


### Sample Splitting


In [None]:
# Hold out 20% of the rows for verification.  The split is seeded (like the
# mixture model itself) so the purity and metric values can be reproduced.
train, test = train_test_split(train_data, test_size=0.2, random_state=0)
# Work on explicit, independent copies: new columns are added to both frames
# later on, and writing into a slice of train_data would otherwise trigger
# pandas' SettingWithCopyWarning.
train = train.copy()
test = test.copy()

### Gaussian Mixture

Identify subsets of the data that are (approximately) normally distributed; each mixture component is treated as one cluster.



In [None]:
# Number of mixture components, i.e. clusters, to fit.
clusters_total = 100
# NOTE(review): `train` still contains the "target" column at this point, so
# the label itself is used as a clustering feature (and again when the test
# sample is assigned to clusters) -- confirm this is intended.
gmm = GaussianMixture(
    n_components=clusters_total,
    n_init=5,
    max_iter=100,
    random_state=0,
    verbose=2,
)
gmm.fit(train)

Initialization 0
  Iteration 10	 time lapse 184.40539s	 ll change 1.07318
  Iteration 20	 time lapse 168.77093s	 ll change 0.09888
  Iteration 30	 time lapse 185.61171s	 ll change 0.24080
  Iteration 40	 time lapse 210.46708s	 ll change 0.01188
  Iteration 50	 time lapse 229.41799s	 ll change 0.00061
Initialization converged: True	 time lapse 978.67411s	 ll -2.11253
Initialization 1
  Iteration 10	 time lapse 205.88659s	 ll change 0.45162
  Iteration 20	 time lapse 227.02486s	 ll change 0.11236
  Iteration 30	 time lapse 262.08291s	 ll change 0.02074
  Iteration 40	 time lapse 253.25231s	 ll change 0.00186
Initialization converged: True	 time lapse 972.99748s	 ll -3.01838
Initialization 2
  Iteration 10	 time lapse 312.10462s	 ll change 0.94808
  Iteration 20	 time lapse 268.00880s	 ll change 0.11301
  Iteration 30	 time lapse 212.54573s	 ll change 0.02201
  Iteration 40	 time lapse 189.65110s	 ll change 0.01164
Initialization converged: True	 time lapse 1058.42388s	 ll -2.03476
Initia

Calculate the "purity" of the clustering: the share of the most popular class within each cluster, averaged over all clusters.

In [None]:
# Assign every training row to a mixture component.
# Pass exactly the columns the model was fitted on, so this cell can be
# re-run after "target_cluster" (or any other column) has been added to
# `train` without failing on a feature mismatch.
train["target_cluster"] = gmm.predict(train[list(gmm.feature_names_in_)])

In [None]:
# Number of training rows for every (cluster, class) pair, as a Series
# indexed by a (target_cluster, target) MultiIndex.
groups = train.groupby(["target_cluster", "target"]).count()["total_area"]
clusters_items = [0] * clusters_total    # rows assigned to each cluster
clusters_popular = [0] * clusters_total  # row count of the dominant class
# A component may receive no training rows at all.  Such clusters fall back
# to the overall majority class instead of 0: 0 is not among the class labels
# seen in the data and would be heavily penalised by the exponential metric
# if a test row lands in that cluster.
fallback_class = train["target"].mode().iloc[0]
clusters_class = [fallback_class] * clusters_total
# Series.iteritems() was removed in pandas 2.0; items() yields the same
# ((cluster, class), count) pairs.
for group in groups.items():
    print (group)
    (cluster, target_class), items = group
    clusters_items[cluster] += items
    if items > clusters_popular[cluster]:
        clusters_popular[cluster] = items
        clusters_class[cluster] = target_class
# Purity: share of the dominant class within a cluster, averaged over the
# clusters.  Empty clusters are skipped, because dividing by their zero size
# would raise ZeroDivisionError.
cluster_purities = [
    popular / items
    for popular, items in zip(clusters_popular, clusters_items)
    if items > 0
]
purity = sum(cluster_purities) / len(cluster_purities) if cluster_purities else 0.0
print("Purity: ", purity) # 0.51



((0, 1), 701)
((0, 2), 1100)
((0, 3), 1001)
((0, 4), 702)
((0, 5), 1027)
((1, 5), 1)
((2, 1), 4)
((2, 5), 2)
((3, 1), 93)
((3, 2), 103)
((3, 3), 189)
((3, 4), 260)
((4, 1), 1)
((4, 2), 1)
((5, 1), 12)
((5, 2), 20)
((5, 3), 29)
((5, 4), 67)
((5, 5), 223)
((6, 1), 795)
((6, 2), 1118)
((6, 3), 1194)
((6, 4), 820)
((6, 5), 940)
((7, 2), 3)
((7, 3), 1)
((7, 4), 4)
((7, 5), 3)
((8, 3), 1)
((9, 1), 679)
((9, 2), 849)
((9, 3), 1388)
((9, 4), 1435)
((10, 2), 1)
((10, 3), 3)
((10, 4), 7)
((10, 5), 8)
((11, 5), 1)
((12, 1), 236)
((12, 2), 258)
((12, 3), 619)
((12, 4), 548)
((12, 5), 992)
((13, 1), 2)
((13, 2), 1)
((13, 3), 6)
((13, 4), 1)
((13, 5), 11)
((14, 1), 3489)
((14, 2), 3927)
((14, 3), 3148)
((14, 4), 1813)
((14, 5), 2124)
((15, 3), 1)
((16, 1), 1)
((16, 2), 1)
((16, 3), 1)
((16, 4), 2)
((16, 5), 2)
((17, 1), 17)
((17, 2), 14)
((17, 3), 33)
((17, 4), 52)
((17, 5), 183)
((18, 1), 8300)
((18, 2), 11393)
((18, 3), 10493)
((18, 4), 6787)
((18, 5), 11164)
((19, 1), 2)
((19, 4), 1)
((19, 5), 1)

### Clustering Verification


Use the most popular class of a cluster for the classification of the test sample



In [None]:
# Assign every test row to a mixture component.
# Pass exactly the columns the model was fitted on, so this cell can be
# re-run after "target_cluster" / "target_pred" have been added to `test`
# without failing on a feature mismatch.
test["target_cluster"] = gmm.predict(test[list(gmm.feature_names_in_)])

In [None]:
# Label each test row with the class recorded for the cluster it fell into.
test_clusters = test["target_cluster"]
test["target_pred"] = test_clusters.map(lambda cluster_id: clusters_class[cluster_id])

$$\text{metric} = \frac{1}{l} \sum_{i=1}^{l} \exp\left(\left|prediction_i - target_i\right|\right)$$

In [None]:
# Mean of exp(|predicted class - true class|) over the test sample.
abs_error = (test["target_pred"] - test["target"]).abs()
gmm_metric = np.exp(abs_error).sum() / len(test)
print ("GMM: ", gmm_metric) # 7

GMM:  6.987936168218524


In [None]:
# Baseline for comparison: predict the constant class 3 for every test row
# and score it with the same exponential absolute-error metric.
baseline_error = (3 - test["target"]).abs()
benchmark_metric = np.exp(baseline_error).sum() / len(test)
print("Benchmark: ", benchmark_metric)