# K-Means Clustering

In diesem Notebook besprechen wir das Vorgehen von K-Means Clustering

# Loading Packages and Data

In [27]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
import plotly.express as px

In [28]:
X_train = pd.read_csv('Xtrain_feature_sel.csv')
X_test = pd.read_csv('Xtest_feature_sel.csv')
y_train = pd.read_csv('ytrain_mod.csv')
y_test = pd.read_csv('ytest_mod.csv')
print("Shape of X Train: {}".format(X_train.shape))
print("Shape of X Test: {}".format(X_test.shape))
print("Shape of y Train: {}".format(y_train.shape))
print("Shape of y Test: {}".format(y_test.shape))

Shape of X Train: (8672, 19)
Shape of X Test: (2168, 19)
Shape of y Train: (8672, 1)
Shape of y Test: (2168, 1)


In [29]:
# loading the data
data = pd.read_csv("googleplaystore.csv")
X = data.drop(columns = ['Installs'])
y = data['Installs']
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0)

# Binding Target value with explaining variables

In [30]:
train = pd.concat([X_train.reset_index(drop=True), y_train], axis=1)
train.head()

Unnamed: 0,Reviews,Rating,days_since_update,Price,Category_FAMILY,Category_GAME,Category_Rare,Category_TOOLS,Size_Varies with device,Size_0.1-10MB,Size_10.1-20MB,Size_30.1-40MB,Size_20.1-30MB,Type_Free,Content Rating_Everyone,Content Rating_Teen,Genres_Entertainment,Genres_Rare,Genres_Education,Installs
0,-0.144827,0.403271,-0.603588,0.0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,1000000
1,-0.150937,-1.053274,0.637297,0.0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,50000
2,-0.151044,1.651738,3.887354,0.0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,100
3,-0.150943,1.027505,-0.613697,0.0,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,10000
4,-0.122954,0.611349,-0.570733,0.0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,5000000


In [31]:
train.shape

(8672, 20)

In [32]:
test = pd.concat([X_test.reset_index(drop=True), y_test], axis=1)
test.shape

(2168, 20)

In [33]:
train_orig = pd.concat([X_train_orig.reset_index(drop=True), y_train_orig], axis=1)
train_orig.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Installs
0,Extreme- Personal Voice Assistant,PRODUCTIVITY,4.4,25627,9.8M,Free,0,Everyone,Productivity,"August 5, 2018",132,5.0 and up,"10,000+"
1,PRIMATURE.CD,NEWS_AND_MAGAZINES,,6,4.8M,Free,0,Everyone,News & Magazines,"October 30, 2017",6.3.7.1,4.1 and up,"500,000+"
2,Flights,TRAVEL_AND_LOCAL,4.4,18039,3.1M,Free,0,Everyone,Travel & Local,"July 26, 2018",3.6.6,4.1 and up,"5,000,000+"
3,Doodle Jump,GAME,4.3,1083571,Varies with device,Free,0,Everyone,Arcade,"July 20, 2018",Varies with device,Varies with device,"50,000,000+"
4,BL 1-Click Camera - Free,PHOTOGRAPHY,3.5,52,1.8M,Free,0,Everyone,Photography,"August 18, 2016",2.0.160818,2.3 and up,"100,000+"


# Model

In [34]:
km = KMeans(
    n_clusters=3, init='random',
    n_init=10, max_iter=300,
    tol=1e-04, random_state=0
)
km.fit(train)

# Cluster Centers

In [35]:
cluster_centers = pd.DataFrame(km.cluster_centers_, columns=train.columns)
cluster_centers

Unnamed: 0,Reviews,Rating,days_since_update,Price,Category_FAMILY,Category_GAME,Category_Rare,Category_TOOLS,Size_Varies with device,Size_0.1-10MB,Size_10.1-20MB,Size_30.1-40MB,Size_20.1-30MB,Type_Free,Content Rating_Everyone,Content Rating_Teen,Genres_Entertainment,Genres_Rare,Genres_Education,Installs
0,4.608411,0.204826,-0.617464,3.330669e-16,0.037037,0.138889,1.387779e-17,0.092593,0.703704,5.5511150000000004e-17,0.092593,0.027778,0.027778,1.0,0.703704,0.212963,0.018519,-2.775558e-17,6.938894e-18,717592600.0
1,0.97032,0.372758,-0.480676,4.440892e-16,0.097173,0.318021,0.008833922,0.067138,0.507067,0.0459364,0.063604,0.060071,0.045936,1.0,0.70318,0.180212,0.042403,0.1484099,-5.5511150000000004e-17,79505300.0
2,-0.130896,-0.029145,0.042354,0.9982783,0.190548,0.089522,0.06576644,0.08052,0.125656,0.4034759,0.164166,0.059515,0.115029,0.91848,0.810203,0.106527,0.06139,0.1570393,0.05676419,1828860.0


# Predict

In [36]:
train_pred = train
train_pred['cluster_predict'] = km.predict(train)

In [37]:
train_pred.shape

(8672, 21)

In [38]:
train_pred.head()

Unnamed: 0,Reviews,Rating,days_since_update,Price,Category_FAMILY,Category_GAME,Category_Rare,Category_TOOLS,Size_Varies with device,Size_0.1-10MB,...,Size_30.1-40MB,Size_20.1-30MB,Type_Free,Content Rating_Everyone,Content Rating_Teen,Genres_Entertainment,Genres_Rare,Genres_Education,Installs,cluster_predict
0,-0.144827,0.403271,-0.603588,0.0,0,0,0,0,1,0,...,0,0,1,1,0,0,0,0,1000000,2
1,-0.150937,-1.053274,0.637297,0.0,0,0,0,0,0,1,...,0,0,1,1,0,0,0,0,50000,2
2,-0.151044,1.651738,3.887354,0.0,0,0,0,0,0,1,...,0,0,1,1,0,0,0,0,100,2
3,-0.150943,1.027505,-0.613697,0.0,1,0,0,0,0,0,...,0,0,1,1,0,1,0,0,10000,2
4,-0.122954,0.611349,-0.570733,0.0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,5000000,2


In [39]:
test_pred = test
test_pred['cluster_predict'] = km.predict(test)
test_pred.shape

(2168, 21)

In [40]:
train_orig_pred = train_orig
train_orig_pred = pd.concat([train_orig_pred.reset_index(drop=True),train_pred[['cluster_predict']]], axis=1)

In [41]:
train[['cluster_predict']]

Unnamed: 0,cluster_predict
0,2
1,2
2,2
3,2
4,2
...,...
8667,2
8668,2
8669,2
8670,2


## Interpreting Results

In [42]:
train[['cluster_predict']].groupby('cluster_predict').value_counts()

cluster_predict
0     108
1     566
2    7998
dtype: int64

In [43]:
test[['cluster_predict']].groupby('cluster_predict').value_counts()

cluster_predict
0      22
1     132
2    2014
dtype: int64

In [44]:
train.groupby('cluster_predict').describe()

Unnamed: 0_level_0,Reviews,Reviews,Reviews,Reviews,Reviews,Reviews,Reviews,Reviews,Rating,Rating,...,Genres_Education,Genres_Education,Installs,Installs,Installs,Installs,Installs,Installs,Installs,Installs
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
cluster_predict,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,108.0,4.608411,6.373268,-0.052963,0.797163,2.668269,5.0184,26.98958,108.0,0.204826,...,0.0,0.0,108.0,717592600.0,249046300.0,500000000.0,500000000.0,500000000.0,1000000000.0,1000000000.0
1,566.0,0.97032,1.535016,-0.147714,0.167233,0.440598,1.393313,15.438445,566.0,0.372758,...,0.0,0.0,566.0,79505300.0,24612450.0,50000000.0,50000000.0,100000000.0,100000000.0,100000000.0
2,7998.0,-0.130896,0.062199,-0.151048,-0.151038,-0.150646,-0.140755,0.923326,7998.0,-0.029145,...,0.0,1.0,7998.0,1828860.0,3349557.0,0.0,1000.0,100000.0,1000000.0,10000000.0


In [45]:
res = train.groupby('cluster_predict').describe()
res = pd.DataFrame(res)
res.to_csv("Results_Clustering.csv",index=False)

In [46]:
train_orig_pred.groupby('cluster_predict').describe()

Unnamed: 0_level_0,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
cluster_predict,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0.0,82.0,4.113415,0.494604,2.6,4.0,4.2,4.4,5.0
1.0,447.0,4.216107,0.498392,1.5,4.1,4.3,4.5,5.0
2.0,6039.0,4.186471,0.522347,1.0,4.0,4.3,4.5,5.0


In [47]:
def my_boxplot_fct(data,variable):
    fig = px.box(data, x="cluster_predict", y=variable)
    fig.show()

In [48]:
my_boxplot_fct(train_orig_pred, "Price")

In [49]:
my_boxplot_fct(train_orig_pred, "Rating")

In [50]:
my_boxplot_fct(train_orig_pred, "Content Rating")

# Visualizing Model


Für die Resultate zu visualisieren müssen wir die vielen Variablen auf eine 2-dimensionale Darstellung bringen. Dafür gibt es so zu sagen Dimension Reduktionsverfahren. Hier verwenden wir das t-SNE Verfahren. Wir gehen nun nicht weiter in die Details ein.

In [51]:
#execudting the tsne method, set random state that you can get the same results as me
tsne = TSNE(random_state=42)
# use fit_transform instead of fit, as TSNE has no transform method
train_tsne = tsne.fit_transform(train)
# saving data as pandas data frame
df_tsne = pd.DataFrame(train_tsne, columns = ['TSNE_DIM1', 'TSNE_DIM2'])
#df_tsne.describe()
# joining the tsne transformed data to the original data
train_pred = pd.concat([train_pred,df_tsne], axis = 1)
train_pred.describe()

Unnamed: 0,Reviews,Rating,days_since_update,Price,Category_FAMILY,Category_GAME,Category_Rare,Category_TOOLS,Size_Varies with device,Size_0.1-10MB,...,Type_Free,Content Rating_Everyone,Content Rating_Teen,Genres_Entertainment,Genres_Rare,Genres_Education,Installs,cluster_predict,TSNE_DIM1,TSNE_DIM2
count,8672.0,8672.0,8672.0,8672.0,8672.0,8672.0,8672.0,8672.0,8672.0,8672.0,...,8672.0,8672.0,8672.0,8672.0,8672.0,8672.0,8672.0,8672.0,8672.0,8672.0
mean,3.6666040000000004e-17,2.041827e-15,-3.5949100000000005e-17,0.920691,0.182542,0.105051,0.061232,0.079797,0.157749,0.375115,...,0.924815,0.801891,0.112661,0.059617,0.15452,0.052352,15812640.0,1.909825,-1.221263,-0.392644
std,1.000058,1.000058,1.000058,14.406743,0.386312,0.306636,0.239768,0.270994,0.364527,0.484181,...,0.263704,0.398598,0.316197,0.23679,0.361468,0.22275,85991560.0,0.327053,48.35215,40.112862
min,-0.151048,-6.671377,-0.6566602,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-111.749687,-93.995682
25%,-0.1510348,-0.2209628,-0.6086422,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,5000.0,2.0,-37.825469,-29.280224
50%,-0.1502967,0.195193,-0.4645884,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,100000.0,2.0,-0.387628,-0.572252
75%,-0.1316782,0.6113488,0.162804,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,5000000.0,2.0,34.511871,24.655109
max,26.98958,1.651738,6.927648,399.99,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1000000000.0,2.0,102.789688,92.023438


Unnamed: 0,Reviews,Rating,days_since_update,Price,Category_FAMILY,Category_GAME,Category_Rare,Category_TOOLS,Size_Varies with device,Size_0.1-10MB,...,Type_Free,Content Rating_Everyone,Content Rating_Teen,Genres_Entertainment,Genres_Rare,Genres_Education,Installs,cluster_predict,TSNE_DIM1,TSNE_DIM2
count,8672.0,8672.0,8672.0,8672.0,8672.0,8672.0,8672.0,8672.0,8672.0,8672.0,...,8672.0,8672.0,8672.0,8672.0,8672.0,8672.0,8672.0,8672.0,8672.0,8672.0
mean,3.6666040000000004e-17,2.041827e-15,-3.5949100000000005e-17,0.920691,0.182542,0.105051,0.061232,0.079797,0.157749,0.375115,...,0.924815,0.801891,0.112661,0.059617,0.15452,0.052352,15812640.0,1.909825,-1.221263,-0.392644
std,1.000058,1.000058,1.000058,14.406743,0.386312,0.306636,0.239768,0.270994,0.364527,0.484181,...,0.263704,0.398598,0.316197,0.23679,0.361468,0.22275,85991560.0,0.327053,48.35215,40.112862
min,-0.151048,-6.671377,-0.6566602,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-111.749687,-93.995682
25%,-0.1510348,-0.2209628,-0.6086422,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,5000.0,2.0,-37.825469,-29.280224
50%,-0.1502967,0.195193,-0.4645884,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,100000.0,2.0,-0.387628,-0.572252
75%,-0.1316782,0.6113488,0.162804,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,5000000.0,2.0,34.511871,24.655109
max,26.98958,1.651738,6.927648,399.99,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1000000000.0,2.0,102.789688,92.023438


In [52]:
# representing the data
fig = px.scatter(train_pred, x="TSNE_DIM1", y="TSNE_DIM2",color="cluster_predict",
                 labels= {
                     "TSNE_DIM1": "Dimension 1",
                     "TSNE_DIM2": "Dimension 2",
                     "cluster_predict": "Number of associated cluster"
                 },
                 title= "Representation of the dimension reduction by t-SNE")
fig.show()