In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [18]:
data = pd.read_csv("1601384279_9602122_iris_new_data.txt", sep=' ', names=['sepal_len','sepal_wid','petal_len','petal_wid'])
data

Unnamed: 0,sepal_len,sepal_wid,petal_len,petal_wid
0,5.7,4.4,1.5,0.4
1,5.5,4.2,1.4,0.2
2,5.2,4.1,1.5,0.1
3,5.8,4.0,1.2,0.2
4,5.4,3.9,1.7,0.4
...,...,...,...,...
145,4.5,2.3,1.3,0.3
146,6.0,2.2,5.0,1.5
147,6.2,2.2,4.5,1.5
148,6.0,2.2,4.0,1.0


In [19]:
data.isnull().sum()

sepal_len    0
sepal_wid    0
petal_len    0
petal_wid    0
dtype: int64

In [20]:
# normalization
def normalize(df):
    result = df.copy()
    for feature_name in df.columns:
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()
        result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
    return result

In [21]:
data = normalize(data)
data.iloc[3]

sepal_len    0.416667
sepal_wid    0.833333
petal_len    0.033898
petal_wid    0.041667
Name: 3, dtype: float64

In [22]:
from sklearn.metrics.pairwise import euclidean_distances

In [23]:
def initialize_centroids(data, k):
    # no of attributes
    n = np.shape(data)[1]    
    # initialize centroids as zero matrices    
    centroids = np.mat(np.zeros((k,n)))    
    # choose random centroids
#     for j in range(k):
#         centroids[j,:] = data.iloc[np.random.randint(np.shape(data)[0])]

#     initialize one random centroid
    centroids[0,:] = data.iloc[np.random.randint(np.shape(data)[0])]

    dist_euc = euclidean_distances(centroids[0,:], data)
    indexes = np.argsort(dist_euc)
    print(type(indexes))
    
    print(data.iloc[indexes[0,-1]])
    centroids[1,:] = data.iloc[indexes[0,-1]]
    centroids[2,:] = data.iloc[indexes[0,-2]]
        
    return centroids

In [24]:
init_centroids = initialize_centroids(data, k=3)
init_centroids

<class 'numpy.ndarray'>
sepal_len    0.000000
sepal_wid    0.416667
petal_len    0.016949
petal_wid    0.000000
Name: 92, dtype: float64


matrix([[0.72222222, 0.45833333, 0.69491525, 0.91666667],
        [0.        , 0.41666667, 0.01694915, 0.        ],
        [0.08333333, 0.66666667, 0.        , 0.04166667]])

In [25]:
import scipy.spatial.distance as metric

In [26]:
def euclidean_dist(A, B):
    
    return metric.euclidean(A, B)

In [27]:
euclidean_dist(init_centroids[0], init_centroids[1])

1.350280290238991

In [28]:
def cluster(data, k):

    # Number of rows in dataset
    m = np.shape(data)[0]

    # Hold the instance cluster assignments
    cluster_assignments = np.mat(np.zeros((m, 2)))

    # Initialize centroids
    cents = initialize_centroids(data, k)
    
    # Preserve original centroids
    cents_orig = cents.copy()
    
    changed = True
    num_iter = 0

    # Loop until no changes to cluster assignments
    while changed:

        changed = False

        # For every instance (row in dataset)
        for i in range(m):

            # Track minimum distance, and vector index of associated cluster
            min_dist = np.inf
            min_index = -1

            # Calculate distances
            for j in range(k):

                dist_ji = euclidean_dist(cents[j,:], data.iloc[i,:])
                if dist_ji < min_dist:
                    min_dist = dist_ji
                    min_index = j

            # Check if cluster assignment of instance has changed
            if cluster_assignments[i, 0] != min_index: 
                changed = True

            # Assign instance to appropriate cluster
            cluster_assignments[i, :] = min_index, min_dist**2

        # Update centroid location
        for p in range(k):
            indices = [i for i, x in enumerate(cluster_assignments) if x[0,0] == int(p)]
            cents[p] = np.mean(data.iloc[indices], axis=0)
            
        
        # Count iterations
        num_iter += 1
        print(cents)
    
#     print(type(cluster_assignments[0,0]))
#     print(cluster_assignments.shape)
#     print(cluster_assignments[:,0])

    # Return important stuff when done
    return cents, cluster_assignments, num_iter, cents_orig

In [29]:
final_centroids, cluster_ass, iterno, original_centroids = cluster(data, k=3)
array = np.squeeze(np.asarray(cluster_ass[:,0])).astype(np.int64) + 1
array

<class 'numpy.ndarray'>
sepal_len    0.944444
sepal_wid    0.250000
petal_len    1.000000
petal_wid    0.916667
Name: 126, dtype: float64
[[0.24603175 0.4745671  0.22650231 0.20238095]
 [0.60309829 0.34375    0.71414602 0.70512821]
 [0.66666667 0.54563492 0.7409201  0.78174603]]
[[0.21296296 0.525      0.13813559 0.11666667]
 [0.50268817 0.32997312 0.63504647 0.61155914]
 [0.72718254 0.49702381 0.80266344 0.84821429]]
[[0.19611111 0.59083333 0.07864407 0.06      ]
 [0.45996732 0.31372549 0.58973081 0.56495098]
 [0.72569444 0.46875    0.81567797 0.8515625 ]]
[[0.19611111 0.59083333 0.07864407 0.06      ]
 [0.45269097 0.3125     0.58024364 0.55273438]
 [0.70910494 0.4537037  0.80743879 0.84143519]]
[[0.19611111 0.59083333 0.07864407 0.06      ]
 [0.44623656 0.31048387 0.57654456 0.54905914]
 [0.70614035 0.4495614  0.8015165  0.83223684]]
[[0.19611111 0.59083333 0.07864407 0.06      ]
 [0.44125683 0.30737705 0.57571548 0.54918033]
 [0.70726496 0.4508547  0.79704476 0.82478632]]
[[0.196111

array([1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1,
       1, 1, 3, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 2, 1, 1, 3, 3,
       3, 3, 3, 2, 3, 2, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1,
       1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1,
       1, 1, 1, 1, 1, 3, 3, 2, 2, 2, 2, 2, 2, 2, 1, 3, 3, 3, 3, 3, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 3,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2])

In [30]:
ans = array.copy()

In [31]:
#             print(indices)
#             centroid_test = np.mean(data[indices], axis = 1)
#             print(centroid_test)

In [32]:
ans

array([1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1,
       1, 1, 3, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 2, 1, 1, 3, 3,
       3, 3, 3, 2, 3, 2, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1,
       1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1,
       1, 1, 1, 1, 1, 3, 3, 2, 2, 2, 2, 2, 2, 2, 1, 3, 3, 3, 3, 3, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 3,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2])

In [None]:
from sklearn.cluster import KMeans
model = KMeans(n_clusters=3, random_state=101, algorithm='full')
ans = model.fit_predict(data)
print(ans)

In [None]:
model.inertia_

In [None]:
# to adjust the label size
ans = ans + 1

In [33]:
ans

array([1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1,
       1, 1, 3, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 2, 1, 1, 3, 3,
       3, 3, 3, 2, 3, 2, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1,
       1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1,
       1, 1, 1, 1, 1, 3, 3, 2, 2, 2, 2, 2, 2, 2, 1, 3, 3, 3, 3, 3, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 3,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2])

In [34]:
len(ans)

150

In [36]:
with open('iris_pred_values.txt', 'w') as filehandle:
    for listitem in ans:
        filehandle.write('%s\n' % listitem)
    print("Wrote",len(ans),"items successfully to iris_pred_values.txt")

Wrote 150 items successfully to iris_pred_values.txt
