## Task no 1. 
Create a function named KNNClassifier that takes the value of K as input. Use the Euclidean 
distance measure to find the distance between two data points. (Create a separate 
function that takes two instances and returns the distance.) Create a function Predict and 
make predictions using the function we created for KNN classification. Call the function 
with different values of K and print the actual and predicted values.


In [2]:
import numpy as np

# Function to calculate Euclidean Distance between two instances
def euclidean_distance(instance1, instance2):
    """Return the Euclidean (L2) distance between two instances.

    Args:
        instance1: Numeric feature vector (NumPy array, list or tuple).
        instance2: Numeric feature vector whose shape is compatible with
            ``instance1`` under NumPy broadcasting.

    Returns:
        The distance as a NumPy floating-point scalar.
    """
    # Convert to float arrays so that plain Python sequences are accepted and
    # unsigned-integer inputs cannot wrap around during the subtraction.
    first = np.asarray(instance1, dtype=float)
    second = np.asarray(instance2, dtype=float)
    return np.sqrt(np.sum((first - second) ** 2))


In [3]:
from collections import Counter

class KNNClassifier:
    """K-nearest-neighbours classifier with Euclidean distance and majority vote.

    The classifier memorises the training data in ``fit`` and, for every test
    instance, lets the ``k`` closest training instances vote on the label.
    Distances are computed with the module-level ``euclidean_distance`` function.
    """

    def __init__(self, k=3):
        """Create a classifier.

        Args:
            k: Number of neighbours that vote on each prediction (>= 1).

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        # A zero k leaves no voters and a negative k would silently drop
        # neighbours through slicing, so reject both up front.
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k}")
        self.k = k  # Number of neighbors

    def fit(self, X_train, y_train):
        """Store the training data.

        Args:
            X_train: Training instances, one row per instance.
            y_train: Class label of each training instance.

        Raises:
            ValueError: If the inputs are empty or differ in length.
        """
        # Store positional arrays: a pandas Series with a shuffled index would
        # otherwise be looked up by label (not by position) in ``_predict``.
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        if len(X_train) != len(y_train):
            raise ValueError(
                f"X_train and y_train differ in length: {len(X_train)} != {len(y_train)}"
            )
        if len(X_train) == 0:
            raise ValueError("training data must not be empty")
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        """Predict the class label of every instance in ``X_test``.

        Args:
            X_test: Test instances, one row per instance.

        Returns:
            A list with one predicted label per test instance.
        """
        # Iterate over rows of an array; iterating a DataFrame directly would
        # yield its column names instead of its rows.
        return [self._predict(instance) for instance in np.asarray(X_test)]

    def _predict(self, instance):
        """Return the majority label among the ``k`` nearest training instances.

        If ``k`` exceeds the number of training instances, all of them vote.
        """
        # Calculate distances between the test instance and all training instances
        distances = [
            euclidean_distance(instance, train_instance)
            for train_instance in self.X_train
        ]

        # Get the indices of the K nearest neighbors
        k_indices = np.argsort(distances)[:self.k]

        # Get the labels of the K nearest neighbors
        k_nearest_labels = [self.y_train[i] for i in k_indices]

        # Return the most common class label among the K neighbors
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]


In [8]:
import seaborn as sns
from sklearn.model_selection import train_test_split

# Load the 'tips' example dataset through seaborn
tips = sns.load_dataset('tips')

# Features: bill amount and party size.
# Target: a binary flag derived from 'tip' (1 when the tip exceeds 5, otherwise 0).
feature_columns = ['total_bill', 'size']
X = tips[feature_columns].to_numpy()
y = tips['tip'].gt(5).astype(int).to_numpy()

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=21
)

# Build a 3-nearest-neighbour classifier and hand it the training split
knn = KNNClassifier(k=3)
knn.fit(X_train, y_train)

# Classify the held-out rows
y_pred = knn.predict(X_test)

# Show every true label next to the model's prediction
for actual, predicted in zip(y_test, y_pred):
    print(f"Actual: {actual}, Predicted: {predicted}")


Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predicted: 0
Actual: 0, Predi

In [12]:
# Preview the first few rows of the tips DataFrame loaded earlier
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


## Task no 2. 
Implement the K-means clustering algorithm. Use different values of K to evaluate your code.

In [9]:
class KMeansClustering:
    """K-means clustering: alternate point assignment and centroid averaging.

    After ``fit`` the instance exposes ``centroids`` (array of shape
    ``(k, n_features)``) and ``labels`` (cluster index of every input row).
    """

    def __init__(self, k=3, max_iters=100, tol=1e-4, random_state=42):
        """Configure the clustering.

        Args:
            k: Number of clusters (>= 1).
            max_iters: Maximum number of centroid updates.
            tol: Convergence threshold; iteration stops once every centroid
                coordinate moves by less than this amount.
            random_state: Seed for the initial centroid draw. ``None`` gives a
                different initialisation on every call.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k}")
        self.k = k
        self.max_iters = max_iters
        self.tol = tol  # Tolerance for convergence
        self.random_state = random_state

    def fit(self, X):
        """Cluster the rows of ``X`` and store ``centroids`` and ``labels``.

        Args:
            X: Two-dimensional numeric data, one row per point.

        Raises:
            ValueError: If ``X`` is not two-dimensional or has fewer rows than ``k``.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError(f"X must be two-dimensional, got {X.ndim} dimension(s)")
        if self.k > X.shape[0]:
            raise ValueError(
                f"k={self.k} exceeds the number of data points ({X.shape[0]})"
            )

        # Use a private generator so fitting does not overwrite the caller's
        # global NumPy random state; the same seed yields the same draw.
        rng = np.random.RandomState(self.random_state)
        centroids = X[rng.choice(X.shape[0], self.k, replace=False)]

        # Assign before the loop so labels exist even when max_iters is 0.
        labels = self._assign_labels(X, centroids)

        for _ in range(self.max_iters):
            # Step 1: Calculate new centroids from the current assignment
            new_centroids = self._update_centroids(X, labels, centroids)

            # Step 2: Check for convergence (if centroids don't change significantly)
            if np.all(np.abs(new_centroids - centroids) < self.tol):
                break

            # Step 3: Adopt the new centroids and reassign, so the stored
            # labels always correspond to the stored centroids.
            centroids = new_centroids
            labels = self._assign_labels(X, centroids)

        self.centroids = centroids
        self.labels = labels

    def _assign_labels(self, X, centroids):
        """Return, for every row of ``X``, the index of its closest centroid."""
        # distances has one row per centroid and one column per point
        distances = np.array([np.linalg.norm(X - centroid, axis=1) for centroid in centroids])
        return np.argmin(distances, axis=0)

    def _update_centroids(self, X, labels, centroids=None):
        """Return the mean of the points assigned to each cluster.

        Args:
            X: Data points, one row per point.
            labels: Cluster index of every row of ``X``.
            centroids: Current centroids. When given, a cluster with no
                assigned points keeps its current centroid instead of
                becoming NaN.
        """
        updated = []
        for i in range(self.k):
            members = X[labels == i]
            if len(members) == 0 and centroids is not None:
                # Averaging an empty cluster would produce NaN
                updated.append(centroids[i])
            else:
                updated.append(members.mean(axis=0))
        return np.array(updated)


In [10]:
# Use 'total_bill' and 'tip' as features
X = tips[['total_bill', 'tip']].values

# Try K-means with different values of K
k_values = [2, 3, 4]

for k in k_values:
    kmeans = KMeansClustering(k=k)
    kmeans.fit(X)
    
    print(f"\nK-Means Clustering with K={k}:")
    print(f"Centroids:\n{kmeans.centroids}")
    print(f"Labels: {kmeans.labels[:10]}")  # Display first 10 labels for brevity



K-Means Clustering with K=2:
Centroids:
[[31.45132353  4.19147059]
 [15.27886364  2.53727273]]
Labels: [1 1 1 0 0 0 1 0 1 1]

K-Means Clustering with K=3:
Centroids:
[[22.22193182  3.37988636]
 [13.11710744  2.2568595 ]
 [36.71628571  4.602     ]]
Labels: [1 1 0 0 0 0 1 0 1 1]

K-Means Clustering with K=4:
Centroids:
[[17.9532967   2.90747253]
 [11.35181818  2.02038961]
 [26.748       3.73490909]
 [40.41857143  5.04809524]]
Labels: [0 1 0 2 2 2 1 2 0 0]
