# LAB 6 YH 

In [2]:
import pandas as pd
import numpy as np

# Generate random sample data for the income and age columns
np.random.seed(42)

# Number of samples
num_samples = 100

# Generate random 'income' and 'age' data
income = np.random.randint(30000, 100000, num_samples)  # Random income between 30k and 100k
age = np.random.randint(18, 70, num_samples)  # Random age between 18 and 70

# Create DataFrame
df = pd.DataFrame({'income': income, 'age': age})

# Save DataFrame as CSV
df.to_csv('income.csv', index=False)

# Show the first few rows of the dataset
print(df.head())


   income  age
0   45795   68
1   30860   49
2   84886   56
3   36265   66
4   67194   69


In [4]:
import pandas as pd
from sklearn.cluster import KMeans

# Load the dataset
data = pd.read_csv('income.csv')  # Ensure that income.csv is in the same directory

# Select relevant features for clustering
X = data[['income', 'age']]  # Adjust this based on your actual columns

# Apply K-Means clustering with 3 clusters
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X)

# Get the cluster labels and centroids
labels = kmeans.labels_
centroids = kmeans.cluster_centers_

# Add the cluster labels to the dataset
data['Cluster'] = labels

# Calculate new centroids based on the mean values of each cluster
new_centroids = data.groupby('Cluster')[['income', 'age']].mean()

# Output the original and new centroids
print("Original Centroids based on KMeans algorithm:")
print(centroids)

print("\nNew Centroids based on mean values of data instances in each cluster:")
print(new_centroids)




Original Centroids based on KMeans algorithm:
[[6.36207188e+04 4.80000000e+01]
 [3.85517097e+04 4.52903226e+01]
 [9.04600811e+04 4.21891892e+01]]

New Centroids based on mean values of data instances in each cluster:
               income        age
Cluster                         
0        63620.718750  48.000000
1        38551.709677  45.290323
2        90460.081081  42.189189


In [5]:
# lab 6 TASK 2
import numpy as np

# Data (Objects with their X, Y, Z coordinates)
data = {
    'OB-1': [1, 4, 1],
    'OB-2': [1, 2, 2],
    'OB-3': [1, 4, 2],
    'OB-4': [2, 1, 2],
    'OB-5': [1, 1, 1],
    'OB-6': [2, 4, 2],
    'OB-7': [1, 1, 2],
    'OB-8': [2, 1, 1]
}

# Convert data into a list of coordinates
coordinates = list(data.values())

# Step 1: Initialize centroids randomly (let's choose two points manually)
# Initial centroids (we can select points manually for simplicity)
centroid_1 = coordinates[0]  # Initial centroid 1 (OB-1)
centroid_2 = coordinates[3]  # Initial centroid 2 (OB-4)

# Function to calculate Euclidean distance between two points
def euclidean_distance(point1, point2):
    return np.sqrt(sum((np.array(point1) - np.array(point2)) ** 2))

# Function to update the centroid by calculating the mean of points in the cluster
def calculate_new_centroid(cluster_points):
    return np.mean(cluster_points, axis=0).tolist()

# Step 2: Assign objects to the nearest centroid
def assign_clusters(centroid_1, centroid_2, coordinates):
    clusters = {0: [], 1: []}  # Two clusters (0 and 1)
    for i, point in enumerate(coordinates):
        dist_to_c1 = euclidean_distance(point, centroid_1)
        dist_to_c2 = euclidean_distance(point, centroid_2)
        if dist_to_c1 < dist_to_c2:
            clusters[0].append(point)
        else:
            clusters[1].append(point)
    return clusters

# Step 3: K-Means Clustering (Iterate until convergence)
def kmeans(coordinates, centroid_1, centroid_2, max_iterations=100):
    for _ in range(max_iterations):
        # Step 2: Assign points to clusters
        clusters = assign_clusters(centroid_1, centroid_2, coordinates)
        
        # Step 3: Calculate new centroids
        new_centroid_1 = calculate_new_centroid(clusters[0])
        new_centroid_2 = calculate_new_centroid(clusters[1])
        
        # Check for convergence (if centroids don't change)
        if new_centroid_1 == centroid_1 and new_centroid_2 == centroid_2:
            break
        
        # Update centroids
        centroid_1 = new_centroid_1
        centroid_2 = new_centroid_2
        
    return clusters, centroid_1, centroid_2

# Run K-Means algorithm
clusters, final_centroid_1, final_centroid_2 = kmeans(coordinates, centroid_1, centroid_2)

# Display results
print("Final Centroids:")
print("Centroid 1:", final_centroid_1)
print("Centroid 2:", final_centroid_2)

print("\nCluster 1 (Points assigned to Centroid 1):")
for idx in range(len(clusters[0])):
    print("OB-" + str(clusters[0].index(clusters[0][idx]) + 1), clusters[0][idx])

print("\nCluster 2 (Points assigned to Centroid 2):")
for idx in range(len(clusters[1])):
    print("OB-" + str(clusters[1].index(clusters[1][idx]) + 1), clusters[1][idx])


Final Centroids:
Centroid 1: [1.3333333333333333, 4.0, 1.6666666666666667]
Centroid 2: [1.4, 1.2, 1.6]

Cluster 1 (Points assigned to Centroid 1):
OB-1 [1, 4, 1]
OB-2 [1, 4, 2]
OB-3 [2, 4, 2]

Cluster 2 (Points assigned to Centroid 2):
OB-1 [1, 2, 2]
OB-2 [2, 1, 2]
OB-3 [1, 1, 1]
OB-4 [1, 1, 2]
OB-5 [2, 1, 1]
