# Q1. Write a Python code to implement the KNN classifier algorithm on load_iris dataset in sklearn.datasets.

In [13]:
# Import necessary libraries
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load the Iris dataset
iris = load_iris()
X = iris.data  # Features
y = iris.target  # Target labels

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a KNN classifier
k = 3  # Choose the number of neighbors (you can adjust this value)
knn_classifier = KNeighborsClassifier(n_neighbors=k)

# Fit the classifier to the training data
knn_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = knn_classifier.predict(X_test)

# Calculate and print the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 1.00


# Q2. Write a Python code to implement the KNN regressor algorithm on load_boston dataset in sklearn.datasets.

In [12]:
# Import necessary libraries
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# Load the Boston Housing dataset
boston = load_boston()
X = boston.data  # Features
y = boston.target  # Target values (house prices)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a KNN regressor
k = 3  # Choose the number of neighbors (you can adjust this value)
knn_regressor = KNeighborsRegressor(n_neighbors=k)

# Fit the regressor to the training data
knn_regressor.fit(X_train, y_train)

# Make predictions on the test set
y_pred = knn_regressor.predict(X_test)

# Calculate and print the Mean Squared Error (MSE) of the regressor
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")

ImportError: 
`load_boston` has been removed from scikit-learn since version 1.2.

The Boston housing prices dataset has an ethical problem: as
investigated in [1], the authors of this dataset engineered a
non-invertible variable "B" assuming that racial self-segregation had a
positive impact on house prices [2]. Furthermore the goal of the
research that led to the creation of this dataset was to study the
impact of air quality but it did not give adequate demonstration of the
validity of this assumption.

The scikit-learn maintainers therefore strongly discourage the use of
this dataset unless the purpose of the code is to study and educate
about ethical issues in data science and machine learning.

In this special case, you can fetch the dataset from the original
source::

    import pandas as pd
    import numpy as np

    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
    data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    target = raw_df.values[1::2, 2]

Alternative datasets include the California housing dataset and the
Ames housing dataset. You can load the datasets as follows::

    from sklearn.datasets import fetch_california_housing
    housing = fetch_california_housing()

for the California housing dataset and::

    from sklearn.datasets import fetch_openml
    housing = fetch_openml(name="house_prices", as_frame=True)

for the Ames housing dataset.

[1] M Carlisle.
"Racist data destruction?"
<https://medium.com/@docintangible/racist-data-destruction-113e3eff54a8>

[2] Harrison Jr, David, and Daniel L. Rubinfeld.
"Hedonic housing prices and the demand for clean air."
Journal of environmental economics and management 5.1 (1978): 81-102.
<https://www.researchgate.net/publication/4974606_Hedonic_housing_prices_and_the_demand_for_clean_air>


# Q3. Write a Python code snippet to find the optimal value of K for the KNN classifier algorithm using cross-validation on load_iris dataset in sklearn.datasets.

In [7]:
# Import necessary libraries
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# Load the Iris dataset
iris = load_iris()
X = iris.data  # Features
y = iris.target  # Target labels

# Create a list of K values to experiment with
k_values = list(range(1, 21))  # You can adjust the range as needed

# Create an empty dictionary to store K-value and corresponding mean cross-validation scores
k_scores = {}

# Perform cross-validation for each K value
for k in k_values:
    knn_classifier = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn_classifier, X, y, cv=5, scoring='accuracy')  # 5-fold cross-validation
    k_scores[k] = scores.mean()

# Find the K value with the highest mean cross-validation score
best_k = max(k_scores, key=k_scores.get)
best_score = k_scores[best_k]

print(f"Best K: {best_k}")
print(f"Best Mean Cross-Validation Score: {best_score:.4f}")

Best K: 6
Best Mean Cross-Validation Score: 0.9800


# Q4. Implement the KNN regressor algorithm with feature scaling on load_boston dataset in sklearn.datasets.

In [11]:
# Import necessary libraries
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Load the Boston Housing dataset
boston = load_boston()
X = boston.data  # Features
y = boston.target  # Target values (house prices)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a StandardScaler to scale features
scaler = StandardScaler()

# Fit the scaler to the training data and transform both training and testing data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create a KNN regressor
k = 3  # Choose the number of neighbors (you can adjust this value)
knn_regressor = KNeighborsRegressor(n_neighbors=k)

# Fit the regressor to the scaled training data
knn_regressor.fit(X_train_scaled, y_train)

# Make predictions on the scaled test set
y_pred = knn_regressor.predict(X_test_scaled)

# Calculate and print the Mean Squared Error (MSE) of the regressor
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")

ImportError: 
`load_boston` has been removed from scikit-learn since version 1.2.

The Boston housing prices dataset has an ethical problem: as
investigated in [1], the authors of this dataset engineered a
non-invertible variable "B" assuming that racial self-segregation had a
positive impact on house prices [2]. Furthermore the goal of the
research that led to the creation of this dataset was to study the
impact of air quality but it did not give adequate demonstration of the
validity of this assumption.

The scikit-learn maintainers therefore strongly discourage the use of
this dataset unless the purpose of the code is to study and educate
about ethical issues in data science and machine learning.

In this special case, you can fetch the dataset from the original
source::

    import pandas as pd
    import numpy as np

    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
    data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    target = raw_df.values[1::2, 2]

Alternative datasets include the California housing dataset and the
Ames housing dataset. You can load the datasets as follows::

    from sklearn.datasets import fetch_california_housing
    housing = fetch_california_housing()

for the California housing dataset and::

    from sklearn.datasets import fetch_openml
    housing = fetch_openml(name="house_prices", as_frame=True)

for the Ames housing dataset.

[1] M Carlisle.
"Racist data destruction?"
<https://medium.com/@docintangible/racist-data-destruction-113e3eff54a8>

[2] Harrison Jr, David, and Daniel L. Rubinfeld.
"Hedonic housing prices and the demand for clean air."
Journal of environmental economics and management 5.1 (1978): 81-102.
<https://www.researchgate.net/publication/4974606_Hedonic_housing_prices_and_the_demand_for_clean_air>


# Q5. Write a Python code snippet to implement the KNN classifier algorithm with weighted voting on load_iris dataset in sklearn.datasets.

In [10]:
# Import necessary libraries
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load the Iris dataset
iris = load_iris()
X = iris.data  # Features
y = iris.target  # Target labels

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a KNN classifier with weighted voting
k = 3  # Choose the number of neighbors (you can adjust this value)
weights = 'distance'  # Use distance-based weighting

knn_classifier = KNeighborsClassifier(n_neighbors=k, weights=weights)

# Fit the classifier to the training data
knn_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = knn_classifier.predict(X_test)

# Calculate and print the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 1.00


# Q6. Implement a function to standardise the features before applying KNN classifier.

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def knn_classifier_with_standardization(X, y, test_size=0.2, k=3):
    """
    Apply K-Nearest Neighbors (KNN) classifier with feature standardization.
    
    Parameters:
    - X: Features (numpy array or DataFrame)
    - y: Target labels (numpy array or Series)
    - test_size: Proportion of the dataset to include in the test split (default is 0.2)
    - k: Number of neighbors to consider (default is 3)
    
    Returns:
    - Accuracy score on the test set
    """
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    # Create a StandardScaler to standardize the features
    scaler = StandardScaler()
    
    # Fit the scaler to the training data and transform both training and testing data
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Create a KNN classifier
    knn_classifier = KNeighborsClassifier(n_neighbors=k)

    # Fit the classifier to the scaled training data
    knn_classifier.fit(X_train_scaled, y_train)

    # Make predictions on the scaled test set
    y_pred = knn_classifier.predict(X_test_scaled)

    # Calculate and return the accuracy of the classifier
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

# Example usage:
accuracy = knn_classifier_with_standardization(X, y, test_size=0.2, k=3)
print(f"Accuracy: {accuracy:.2f}")

This function takes the features X, target labels y, optional arguments for test set size (test_size) and the number of neighbors (k). It performs feature standardization using StandardScaler, fits a KNN classifier to the standardized training data, makes predictions on the standardized test data, and returns the accuracy score of the classifier on the test set.

You can use this function by passing your feature matrix X and target vector y along with optional arguments to specify the test set size and the number of neighbors (k). Uncomment the example usage at the end of the code to see how to use the function.

# Q7. Write a Python function to calculate the euclidean distance between two points.

In [None]:
import numpy as np

def euclidean_distance(point1, point2):
    """
    Calculate the Euclidean distance between two points (vectors).

    Parameters:
    - point1: First point (numpy array or list)
    - point2: Second point (numpy array or list)

    Returns:
    - Euclidean distance between the two points
    """
    # Convert the points to numpy arrays (if they are not already)
    point1 = np.array(point1)
    point2 = np.array(point2)

    # Ensure that the points have the same dimensionality
    if point1.shape != point2.shape:
        raise ValueError("Both points must have the same dimensionality")

    # Calculate the Euclidean distance
    distance = np.sqrt(np.sum((point1 - point2) ** 2))
    return distance

# Example usage:
point1 = [1, 2, 3]
point2 = [4, 5, 6]
distance = euclidean_distance(point1, point2)
print(f"Euclidean Distance: {distance:.2f}")

# Q8. Write a Python function to calculate the manhattan distance between two points.

In [None]:
import numpy as np

def manhattan_distance(point1, point2):
    """
    Calculate the Manhattan distance between two points (vectors).

    Parameters:
    - point1: First point (numpy array or list)
    - point2: Second point (numpy array or list)

    Returns:
    - Manhattan distance between the two points
    """
    # Convert the points to numpy arrays (if they are not already)
    point1 = np.array(point1)
    point2 = np.array(point2)

    # Ensure that the points have the same dimensionality
    if point1.shape != point2.shape:
        raise ValueError("Both points must have the same dimensionality")

    # Calculate the Manhattan distance
    distance = np.sum(np.abs(point1 - point2))
    return distance

# Example usage:
point1 = [1, 2, 3]
point2 = [4, 5, 6]
distance = manhattan_distance(point1, point2)
print(f"Manhattan Distance: {distance:.2f}")