# Classic machine learning, part three: clustering

## Data preprocessing

In [52]:
from scipy.cluster.vq import whiten

# Toy one-dimensional sample used to demonstrate feature scaling.
data = [5, 1, 3, 3, 2, 3, 3, 8, 1, 2, 2, 3, 5]

# whiten() divides every observation by the standard deviation of the sample,
# so the result has unit variance; the mean is not subtracted.
scaled_data = whiten(data)
print(scaled_data)

[2.72733941 0.54546788 1.63640365 1.63640365 1.09093577 1.63640365
 1.63640365 4.36374306 0.54546788 1.09093577 1.09093577 1.63640365
 2.72733941]


In [55]:
import matplotlib.pyplot as plt

# Overlay the raw sample and its whitened version on the same axes so the
# change in scale (but not in shape) is visible.
plt.plot(data, label='original')
plt.plot(scaled_data, label='scaled')

# The legend maps each line to the label given above.
plt.legend()

<matplotlib.legend.Legend at 0x7fee766e84d0>

## Hierarchical clustering

In [8]:
import pandas as pd

# Raw 2-D coordinates of the 36 sample points.
x_coordinate = [17, 20, 35, 14, 37, 33, 14, 30, 35, 17, 11, 21, 13, 10, 81, 84, 87, 83,
                90, 97, 94, 88, 89, 93, 92, 82, 81, 92, 91, 22, 23, 25, 25, 27, 7, 17]
y_coordinate = [4, 6, 0, 0, 4, 3, 1, 6, 5, 4, 6, 10, 8, 10, 97, 94, 99, 95, 95, 97, 99,
                99, 94, 99, 90, 98, 100, 93, 98, 15, 10, 0, 10, 7, 17, 15]

# Start from the raw coordinates, then append the whitened
# (standard-deviation-scaled) version of each axis as extra columns.
comic_con = pd.DataFrame(
    {'x_coordinate': x_coordinate, 'y_coordinate': y_coordinate}
).assign(
    x_scaled=whiten(x_coordinate),
    y_scaled=whiten(y_coordinate),
)

comic_con.head()

Unnamed: 0,x_coordinate,y_coordinate,x_scaled,y_scaled
0,17,4,0.504652,0.09001
1,20,6,0.593708,0.135015
2,35,0,1.038989,0.0
3,14,0,0.415596,0.0
4,37,4,1.09836,0.09001


In [9]:
# Summary statistics for the raw and the whitened coordinate columns.
comic_con.describe()

Unnamed: 0,x_coordinate,y_coordinate,x_scaled,y_scaled
count,36.0,36.0,36.0,36.0
mean,49.361111,44.111111,1.465305,0.992609
std,34.164437,45.069963,1.014185,1.014185
min,7.0,0.0,0.207798,0.0
25%,19.25,5.75,0.571444,0.129389
50%,34.0,12.5,1.009304,0.281281
75%,87.25,95.5,2.590052,2.148985
max,97.0,100.0,2.879484,2.250246


**`method` is the linkage criterion: how the distance between two clusters is measured**
* ward
* single
* complete

**`metric` is how the distance between two individual samples is measured (e.g. `euclidean`)**

In [16]:
from scipy.cluster.hierarchy import linkage, fcluster
import seaborn as sns

In [17]:
# Build the cluster hierarchy with Ward linkage on Euclidean distances between
# the scaled points.
# NOTE: despite its name, `distance_matrix` holds the linkage matrix returned
# by linkage() (the merge history), not a pairwise distance matrix.
distance_matrix = linkage(
    comic_con[['x_scaled', 'y_scaled']],
    method='ward',
    metric='euclidean',
)

# Cut the hierarchy into at most two flat clusters and store each row's label.
comic_con['cluster_labels'] = fcluster(distance_matrix, 2, criterion='maxclust')

# Scatter the scaled points, coloured by their assigned cluster.
fig = plt.figure(figsize=(8, 8))
sns.scatterplot(x='x_scaled', y='y_scaled', hue='cluster_labels', data=comic_con)

<AxesSubplot:xlabel='x_scaled', ylabel='y_scaled'>

In [18]:
# dendrogram
from scipy.cluster.hierarchy import dendrogram

# Draw the merge tree encoded in the linkage matrix computed above;
# `dn` keeps dendrogram()'s return value.
dn = dendrogram(distance_matrix)

In [19]:
# time
# Measure how long Ward linkage takes on `points` random 2-D points.
import random, timeit

points = 100
# Sampling `points` values from range(points) without replacement gives a
# random permutation of 0..points-1 for each axis.
df = pd.DataFrame({
    'x': random.sample(range(0, points), points),
    'y': random.sample(range(0, points), points)
})

# NOTE(review): the `timeit` module imported above is not referenced in this
# cell; the measurement uses the %timeit line magic.
%timeit linkage(df[['x', 'y']], method='ward', metric = 'euclidean')

340 µs ± 7.93 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## FIFA 18

In [22]:
# Load the FIFA 18 sample; the path is relative to the notebook's working directory.
fifa = pd.read_csv('../resource/fifa_18_dataset.csv')
# info() prints the schema; head() is the value displayed by the cell.
fifa.info()
fifa.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17994 entries, 0 to 17993
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   sliding_tackle  17994 non-null  int64
 1   aggression      17994 non-null  int64
dtypes: int64(2)
memory usage: 281.3 KB


Unnamed: 0,sliding_tackle,aggression
0,23,63
1,26,48
2,33,56
3,38,78
4,11,29


In [23]:
# Add whitened copies of both features (each column divided by its standard
# deviation) so that neither dominates the distance computation.
# Note: this adds columns to `fifa` in place.
fifa['scaled_sliding_tackle'] = whiten(fifa['sliding_tackle'])
fifa['scaled_aggression']  = whiten(fifa['aggression'])
fifa.describe()

Unnamed: 0,sliding_tackle,aggression,scaled_sliding_tackle,scaled_aggression
count,17994.0,17994.0,17994.0,17994.0
mean,45.592086,55.828109,2.122855,3.198381
std,21.477372,17.455601,1.000028,1.000028
min,4.0,11.0,0.186248,0.630188
25%,24.0,43.0,1.117486,2.463461
50%,52.0,59.0,2.42122,3.380098
75%,64.0,69.0,2.979963,3.952996
max,91.0,96.0,4.237135,5.49982


In [24]:
# Ward linkage on the two scaled FIFA features.
# NOTE: this rebinds `distance_matrix`, replacing the linkage computed for
# comic_con earlier; the value is a linkage matrix, not pairwise distances.
distance_matrix = linkage(fifa[['scaled_sliding_tackle', 'scaled_aggression']], 'ward')

In [25]:
# dendrogram
# Plot the merge tree for the FIFA linkage computed in the previous cell.
dn = dendrogram(distance_matrix)

In [26]:
# Cut the hierarchy into at most three flat clusters and store each row's label.
fifa['cluster_labels'] = fcluster(distance_matrix, 3, criterion = 'maxclust')

In [27]:
# Characterise each cluster by the mean of its scaled features.
print(fifa[['scaled_sliding_tackle', 'scaled_aggression', 'cluster_labels']].groupby('cluster_labels').mean())

                scaled_sliding_tackle  scaled_aggression
cluster_labels                                          
1                            0.987373           1.849142
2                            3.013487           4.063492
3                            1.934455           3.210802


In [28]:
# Scatter the scaled FIFA features, coloured by hierarchical cluster label.
fig = plt.figure(figsize = (8,8))
sns.scatterplot(x = 'scaled_sliding_tackle', y = 'scaled_aggression', hue='cluster_labels', data = fifa)

<AxesSubplot:xlabel='scaled_sliding_tackle', ylabel='scaled_aggression'>

## KMeans

In [30]:
# Re-inspect comic_con; it now also carries the hierarchical `cluster_labels` column.
comic_con.head()

Unnamed: 0,x_coordinate,y_coordinate,x_scaled,y_scaled,cluster_labels
0,17,4,0.504652,0.09001,2
1,20,6,0.593708,0.135015,2
2,35,0,1.038989,0.0,2
3,14,0,0.415596,0.0,2
4,37,4,1.09836,0.09001,2


In [35]:
from scipy.cluster.vq import kmeans, vq

# Fit k-means with two centroids on the scaled coordinates; kmeans() returns
# the centroids together with the distortion of the fit.
cluster_centers, distortion = kmeans(
    comic_con[['x_scaled', 'y_scaled']],
    k_or_guess=2,
    iter=20,
    thresh=1e-05,
)

# Assign every point to its nearest centroid; vq() returns the labels and the
# distance from each point to its assigned centroid.
comic_con['cluster_labels_kmeans'], distortion_list = vq(
    comic_con[['x_scaled', 'y_scaled']],
    cluster_centers,
)

# Scatter the scaled points, coloured by k-means label.
fig = plt.figure(figsize=(8, 8))
sns.scatterplot(x='x_scaled', y='y_scaled', hue='cluster_labels_kmeans', data=comic_con)

<AxesSubplot:xlabel='x_scaled', ylabel='y_scaled'>

In [37]:
# Show the overall distortion from kmeans() next to the per-point distances from vq().
distortion, distortion_list

(0.20563643185482766,
 array([0.14881634, 0.04933993, 0.42630446, 0.27082289, 0.46205804,
        0.34940557, 0.25894343, 0.25072129, 0.4004947 , 0.14881634,
        0.31422848, 0.0758576 , 0.25608587, 0.35136963, 0.21604753,
        0.13828631, 0.06829044, 0.15978865, 0.06112957, 0.25953016,
        0.17948955, 0.05755324, 0.05962236, 0.15163452, 0.18291303,
        0.18920128, 0.22990027, 0.13552721, 0.08817158, 0.18688251,
        0.08523523, 0.18217145, 0.12579944, 0.16127754, 0.49058941,
        0.23060569]))

In [38]:
# The mean of vq()'s per-point distances matches the distortion reported by kmeans().
distortion_list.mean()

0.20563643185482766

In [39]:
# Time hierarchical (Ward) clustering on the full FIFA frame, for comparison with k-means.
%timeit linkage(fifa[['scaled_sliding_tackle', 'scaled_aggression']], 'ward')

5.23 s ± 348 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [40]:
# Time scipy's k-means (3 centroids) on the same FIFA features.
%timeit kmeans(fifa[['scaled_sliding_tackle', 'scaled_aggression']], k_or_guess = 3)

206 ms ± 12.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [41]:
# elbow
# Fit scipy's k-means for k = 1..6 on the scaled coordinates and record the
# distortion of each fit; the "elbow" of the resulting curve suggests a
# reasonable number of clusters.
num_clusters = range(1, 7)

# Only the distortion (second element of kmeans()'s return value) is kept.
# The sweep deliberately does not rebind `cluster_centers` / `distortion`, so
# the k=2 fit inspected in earlier cells is not silently overwritten.
distortions = [
    kmeans(comic_con[['x_scaled', 'y_scaled']], k)[1]
    for k in num_clusters
]

elbow_plot = pd.DataFrame({'num_clusters': num_clusters, 'distortions': distortions})

fig = plt.figure(figsize = (8, 8))
sns.lineplot(x = 'num_clusters', y = 'distortions', data = elbow_plot)

<AxesSubplot:xlabel='num_clusters', ylabel='distortions'>

## DBSCAN

In [43]:
from sklearn.datasets import make_blobs
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from the plotting libraries.
warnings.filterwarnings('ignore')

# Three blob centres with different spreads; random_state fixes the sample.
centers = [[1, 0.5], [2, 2], [1, -1]]
stds = [0.1, 0.4, 0.3]

X, labels_true = make_blobs(n_samples = 1000, centers = centers, cluster_std = stds, random_state = 0)

# Plot the generated points coloured by their true blob.
# x and y are passed by keyword: positional data arguments are deprecated in
# seaborn 0.11 and rejected by later releases.
fig = plt.figure(figsize = (10, 10))
sns.scatterplot(x = X[:, 0], y = X[:, 1], hue = ['cluster - {}'.format(label) for label in labels_true])

<AxesSubplot:>

In [44]:
from sklearn.cluster import DBSCAN
# Density-based clustering of the blobs with eps = 0.5 and min_samples = 10.
db = DBSCAN(eps = 0.5, min_samples = 10).fit(X)

# labels_ holds one cluster id per sample.
labels = db.labels_
fig = plt.figure(figsize = (10, 10))
# x and y are passed by keyword: positional data arguments are deprecated in
# seaborn 0.11 and rejected by later releases.
sns.scatterplot(x = X[:, 0], y = X[:, 1], hue = ['cluster - {}'.format(label) for label in labels])

<AxesSubplot:>

In [45]:
fig = plt.figure(figsize = (20, 10))
fig.subplots_adjust(hspace = .5, wspace=.2)

# Sweep eps over 1/1, 1/2, ..., 1/10 (largest first) and draw one panel per
# value on a 2 x 5 grid to show how the neighbourhood radius changes the result.
for i, divisor in enumerate(range(1, 11), start = 1):
    eps = 1 / divisor
    db = DBSCAN(eps = eps, min_samples = 10).fit(X)
    labels = db.labels_

    print(eps)

    ax = fig.add_subplot(2, 5, i)
    ax.text(1, 4, 'eps = {}'.format(round(eps, 3)), fontsize = 25, ha = 'center')
    # Keyword x/y (positional data arguments are rejected by recent seaborn)
    # and an explicit target axes instead of relying on the current axes.
    sns.scatterplot(x = X[:, 0], y = X[:, 1],
                    hue = ['cluster - {}'.format(label) for label in labels],
                    ax = ax)

1.0
0.5
0.3333333333333333
0.25
0.2
0.16666666666666666
0.14285714285714285
0.125
0.1111111111111111
0.1


## KMeans and DBSCAN

In [46]:
from sklearn.cluster import KMeans

# Generate 600 points in 5 blobs (fixed seed).
X, y = make_blobs(random_state= 170, n_samples=600, centers = 5)

rng = np.random.RandomState(74)

# Apply a random 2x2 linear map so the blobs become stretched and skewed.
transformation = rng.normal(size = (2, 2))
X = np.dot(X, transformation)

# plot
# x and y are passed by keyword: positional data arguments are deprecated in
# seaborn 0.11 and rejected by later releases.
fig = plt.figure(figsize = (10, 10))
sns.scatterplot(x = X[:, 0], y = X[:, 1], hue = ['cluster - {}'.format(label) for label in y])





<AxesSubplot:>

In [47]:
# kmeans
# Fit scikit-learn's KMeans with 5 clusters on the transformed blobs.
# random_state is fixed so that the labels, and the ARI computed from them
# later, are reproducible across runs.
# NOTE(review): binding the estimator to `kmeans` shadows the
# scipy.cluster.vq.kmeans function imported earlier in this notebook; the
# earlier scipy cells cannot be re-run after this one without re-importing it.
kmeans = KMeans(n_clusters = 5, random_state = 0)
kmeans.fit(X)

# Predicted cluster id for every training point.
y_pred = kmeans.predict(X)

In [49]:
# Plot the points coloured by their KMeans label, then overlay the fitted
# cluster centres as large triangles.
# x and y are passed by keyword in both calls: positional data arguments are
# deprecated in seaborn 0.11 and rejected by later releases.
fig = plt.figure(figsize = (10, 10))
sns.scatterplot(x = X[:, 0], y = X[:, 1], hue = ['cluster - {}'.format(label) for label in y_pred], palette='Set2')
sns.scatterplot(x = kmeans.cluster_centers_[:, 0],
                y = kmeans.cluster_centers_[:, 1],
                marker = '^',
                hue = [0, 1, 2, 3, 4],
                s = 500,
                palette='Set2')

<AxesSubplot:>

In [50]:
# DBSCAN
from sklearn.preprocessing import StandardScaler
# Standardise the features before clustering; DBSCAN runs on the scaled copy.
ss = StandardScaler()
X_scaled = ss.fit_transform(X)

# NOTE(review): eps = 0.123 / min_samples = 2 look hand-tuned for this data set — confirm.
dbscan = DBSCAN(eps = 0.123, min_samples = 2)
clusters = dbscan.fit_predict(X_scaled)

# The plot uses the original (unscaled) coordinates, coloured by DBSCAN label.
# x and y are passed by keyword: positional data arguments are deprecated in
# seaborn 0.11 and rejected by later releases.
fig = plt.figure(figsize = (10, 10))
sns.scatterplot(x = X[:, 0], y = X[:, 1], hue = ['cluster - {}'.format(label) for label in clusters], palette = 'Set2')

<AxesSubplot:>

In [51]:
from sklearn.metrics.cluster import adjusted_rand_score

# Compare each clustering with the true blob labels `y` using the adjusted
# Rand index, rounded to two decimals. Both lines print the same 'ARI = '
# prefix: the first is k-means, the second is DBSCAN.

# k-means performance:
print('ARI = ', round(adjusted_rand_score(y, y_pred), 2))

# DBSCAN performance:
print('ARI = ', round(adjusted_rand_score(y, clusters), 2))

ARI =  0.76
ARI =  0.99
