In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.cluster import KMeans

In [2]:
# Load datasets
occupancy_df_training = pd.read_csv("occupancy_data/datatraining.txt", header=0)

print(occupancy_df_training.head())

occupancy_df_test1 = pd.read_csv("occupancy_data/datatest.txt", header=0)
occupancy_df_test2 = pd.read_csv("occupancy_data/datatest2.txt", header=0)

spambase_df = pd.read_csv("spambase/spambase.data", header=None)

print(spambase_df.head())

                  date  Temperature  Humidity  Light     CO2  HumidityRatio  \
1  2015-02-04 17:51:00        23.18   27.2720  426.0  721.25       0.004793   
2  2015-02-04 17:51:59        23.15   27.2675  429.5  714.00       0.004783   
3  2015-02-04 17:53:00        23.15   27.2450  426.0  713.50       0.004779   
4  2015-02-04 17:54:00        23.15   27.2000  426.0  708.25       0.004772   
5  2015-02-04 17:55:00        23.10   27.2000  426.0  704.50       0.004757   

   Occupancy  
1          1  
2          1  
3          1  
4          1  
5          1  
     0     1     2    3     4     5     6     7     8     9  ...    48     49  \
0  0.00  0.64  0.64  0.0  0.32  0.00  0.00  0.00  0.00  0.00 ...  0.00  0.000   
1  0.21  0.28  0.50  0.0  0.14  0.28  0.21  0.07  0.00  0.94 ...  0.00  0.132   
2  0.06  0.00  0.71  0.0  1.23  0.19  0.19  0.12  0.64  0.25 ...  0.01  0.143   
3  0.00  0.00  0.00  0.0  0.63  0.00  0.31  0.63  0.31  0.63 ...  0.00  0.137   
4  0.00  0.00  0.00  0.0  0.63

In [3]:
occupancy_y = np.array(occupancy_df_training.Occupancy)

occupancy_df_training = occupancy_df_training.drop("date", axis=1)
occupancy_df_training = np.array(occupancy_df_training.drop("Occupancy", axis=1))

spambase_y = np.array(spambase_df[57])
spambase_df = np.array(spambase_df.drop(57, axis=1))

print(occupancy_y)
print(occupancy_df_training)
print(spambase_y)

[1 1 1 ... 1 1 1]
[[2.31800000e+01 2.72720000e+01 4.26000000e+02 7.21250000e+02
  4.79298818e-03]
 [2.31500000e+01 2.72675000e+01 4.29500000e+02 7.14000000e+02
  4.78344095e-03]
 [2.31500000e+01 2.72450000e+01 4.26000000e+02 7.13500000e+02
  4.77946352e-03]
 ...
 [2.11000000e+01 3.60950000e+01 4.33000000e+02 7.98500000e+02
  5.59563902e-03]
 [2.11000000e+01 3.62600000e+01 4.33000000e+02 8.20333333e+02
  5.62144937e-03]
 [2.11000000e+01 3.62000000e+01 4.47000000e+02 8.21000000e+02
  5.61206354e-03]]
[1 1 1 ... 0 0 0]


In [4]:
def purity_score(y_true, y_pred):
    # compute contingency matrix (also called confusion matrix)
    contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
    # return purity
    return float(np.sum(np.amax(contingency_matrix, axis=0))) / float(np.sum(contingency_matrix))

In [5]:
kmeans = KMeans(n_clusters=2)

X_train, X_test, y_train, y_test = train_test_split(occupancy_df_training, occupancy_y, test_size=0.33, random_state=53)

kmeans.fit(X_train)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=2, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [6]:
kmeans = KMeans(n_clusters=4)

X_train, X_test, y_train, y_test = train_test_split(occupancy_df_training, occupancy_y, test_size=0.33, random_state=53)

kmeans.fit(X_train)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=4, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [13]:
kmeans = KMeans(n_clusters=8)

X_train, X_test, y_train, y_test = train_test_split(occupancy_df_training, occupancy_y, test_size=0.33, random_state=53)

kmeans.fit(X_train)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=8, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [None]:
correct = 0
for i in range(len(X_train)):
    predict_me = np.array(X_train[i].astype(float))
    predict_me = predict_me.reshape(-1, len(predict_me))
    prediction = kmeans.predict(predict_me)
    if prediction[0] == y_train[i]:
        correct += 1

print(float(correct)/float(len(X_train)))

In [14]:
y_pred = kmeans.predict(X_test)
print(metrics.f1_score(y_test, y_pred, average='micro'))

0.03869047619047619


In [15]:
print(purity_score(y_test, y_pred))

0.971726190476


In [16]:
kmeans = KMeans(n_clusters=2)

X_train, X_test, y_train, y_test = train_test_split(spambase_df, spambase_y, test_size=0.33, random_state=53)

kmeans.fit(X_train)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=2, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [17]:
kmeans = KMeans(n_clusters=4)

X_train, X_test, y_train, y_test = train_test_split(spambase_df, spambase_y, test_size=0.33, random_state=53)

kmeans.fit(X_train)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=4, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [20]:
kmeans = KMeans(n_clusters=8)

X_train, X_test, y_train, y_test = train_test_split(spambase_df, spambase_y, test_size=0.33, random_state=53)

kmeans.fit(X_train)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=8, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [None]:
correct = 0
for i in range(len(X_train)):
    predict_me = np.array(X_train[i].astype(float))
    predict_me = predict_me.reshape(-1, len(predict_me))
    prediction = kmeans.predict(predict_me)
    if prediction[0] == y_train[i]:
        correct += 1

print(float(correct)/float(len(X_train)))

In [21]:
y_pred = kmeans.predict(X_test)
print(metrics.f1_score(y_test, y_pred, average='micro'))

0.5444371296905859


In [22]:
print(purity_score(y_test, y_pred))

0.706385780118
