# W2 Practicals - Supervised Learning
### (k-NN and Decision Trees)

### Aims:
* To apply k-NN and decision trees as examples of supervised learning models for classification and regression problems.
* To get some insight into the trained models, including the influence of a hyperparameter in an ML model.
* To produce some assessable work for this subject.



In [None]:
# Common Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# ANSI escape sequences used to highlight printed notebook output
class color:
  """Namespace of ANSI escape codes for styling text written with print()."""
  END = '\033[0m'             # reset every attribute back to the default
  YELLOW_BOLD = '\033[1;33m'  # SGR 1 (bold) combined with SGR 33 (yellow foreground)

### Q1 Make scatterplots of each dataset so you can see what they look like.

In [None]:
# Sub-Plot Setup: one row with two panels (W3Classif on the left, W3Regr on the right)
fig, axes = plt.subplots(1, 2, figsize=(12, 10))
# Vertical divider at the horizontal centre of the figure, given in figure
# coordinates. add_artist() registers the line with the figure instead of
# pushing it into the fig.lines list by hand.
fig.add_artist(plt.Line2D([0.5, 0.5], [0, 1], transform=fig.transFigure, color='black', linewidth=1))

# 1st Sub-Plot for W3Classif
CSV_FILE_1 = 'w3classif.csv'
# The file is read without a header row, so the columns are named explicitly.
df1 = pd.read_csv(CSV_FILE_1, header=None)
df1.columns = ["Feature1", "Feature2", "Label"]

# Colour each point by its class label.
sns.scatterplot(data=df1,
                x="Feature1",
                y="Feature2",
                hue="Label",
                palette="coolwarm",
                alpha=0.7,
                ax=axes[0])
axes[0].set_title("Scatterplot of W3Classif")
axes[0].set_xlabel("Feature 1")
axes[0].set_ylabel("Feature 2")

# 2nd Sub-Plot for W3Regr
CSV_FILE_2 = 'w3regr.csv'
df2 = pd.read_csv(CSV_FILE_2, header=None)
# NOTE(review): for a regression dataset the second column is presumably the
# target rather than a second feature - confirm before relabelling.
df2.columns = ["Feature1", "Feature2"]

sns.scatterplot(data=df2,
                x="Feature1",
                y="Feature2",
                alpha=0.7,
                ax=axes[1])
axes[1].set_title("Scatterplot of W3Regr")
axes[1].set_xlabel("Feature 1")
axes[1].set_ylabel("Feature 2")

# tight_layout() recomputes the spacing between the panels itself, so no
# manual subplots_adjust(wspace=...) is applied beforehand.
plt.tight_layout()

### Q2 Randomly shuffle the datasets (i.e. the order of the rows) and split them each into 70% (for training) and 30% (for testing).

In [None]:
# Specific Imports
from sklearn.model_selection import train_test_split

In [None]:
# W3Classif: every column except the last is a feature, the last column is the label.
X = df1.iloc[:, :-1].values
y = df1.iloc[:, -1].values

# train_test_split shuffles the rows by default; the fixed random_state makes
# the 70% / 30% partition reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

### Q3

In [None]:
# Specific Imports
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from matplotlib.colors import ListedColormap

#### a. Build a k-NN classifier with k = 3 for dataset w3classif.csv and find the training and test loss (i.e. misclassification rate).

In [None]:
def train_and_predict_knn(X_train, y_train, X_test, y_test, n_neighbors=3):
  """Fit a k-NN classifier and predict labels for the training and test sets.

  Args:
    X_train: Feature matrix the classifier is fitted on.
    y_train: Labels matching ``X_train``.
    X_test: Feature matrix of the held-out samples.
    y_test: Labels matching ``X_test``. Unused here; kept so existing
      callers do not have to change.
    n_neighbors: Number of neighbours (k) used by the classifier.

  Returns:
    A tuple ``(knn, y_train_pred, y_test_pred)``: the fitted classifier and
    its predictions on ``X_train`` and ``X_test``.
  """
  # Use the requested k; a hard-coded 3 here would silently ignore the argument.
  knn = KNeighborsClassifier(n_neighbors=n_neighbors)
  knn.fit(X_train, y_train)

  # Making Predictions
  y_train_pred = knn.predict(X_train)
  y_test_pred = knn.predict(X_test)
  return knn, y_train_pred, y_test_pred

def evaluate_knn(y_train, y_train_pred, y_test, y_test_pred):
  """Print the training and test misclassification rates as percentages.

  Args:
    y_train: True labels of the training set.
    y_train_pred: Predicted labels for the training set.
    y_test: True labels of the test set.
    y_test_pred: Predicted labels for the test set.
  """
  # Misclassification rate is the complement of accuracy.
  losses = [
    1 - accuracy_score(truth, predicted)
    for truth, predicted in ((y_train, y_train_pred), (y_test, y_test_pred))
  ]
  train_pct, test_pct = (loss * 100 for loss in losses)

  # Assembled piecewise so the emitted whitespace does not depend on how this
  # function body is indented.
  report = (
    "\n"
    "  Misclassification Rate (in %):\n"
    f"    Train Loss: {train_pct}%\n"
    f"    Test Loss: {test_pct}%\n"
    "  "
  )
  print(report)

# Q3a: fit the k = 3 classifier on the training split and report both losses.
knn, y_train_pred, y_test_pred = train_and_predict_knn(
    X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, n_neighbors=3
)
evaluate_knn(
    y_train=y_train, y_train_pred=y_train_pred, y_test=y_test, y_test_pred=y_test_pred
)

#### b. Plot the decision regions for your classifier together with the training and/or test data points.

In [None]:
def plot_decision_regions(X, y, X_train, y_train, X_test, y_test):
  knn, _, _ = train_and_predict_knn(X_train, y_train, X_test, y_test)

  # Finding Ranges of Feature1 (f1) and Feature2 (f2), and a feature space
  f1_min, f1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
  f2_min, f2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
  f1r, f2r = np.meshgrid(np.arange(f1_min, f1_max, 0.1),
                         np.arange(f2_min, f2_max, 0.1))

  # Predicting class labels for grid points
  Z = knn.predict(np.c_[f1r.ravel(), f2r.ravel()])
  Z = Z.reshape(f1r.shape)

  # Plotting Decision Boundaries
  plt.figure(figsize=(8, 6))
  plt.contourf(f1r, f2r, Z, alpha=0.3,
               cmap=ListedColormap(['red', 'blue']))
  plt.scatter(X_train[:, 0], X_train[:, 1],
              c=y_train, cmap=ListedColormap(['darkred', 'darkgreen', 'darkblue']),
              edgecolor='k', label='Train')
  plt.scatter(X_test[:, 0], X_test[:, 1],
              c=y_test, cmap=ListedColormap(['pink', 'lightgreen', 'lightblue']),
              edgecolor='k', marker='s', label='Test')
  plt.title('k-NN Decision Regions')
  plt.xlabel('Feature 1')
  plt.ylabel('Feature 2')
  plt.legend()
  plt.show()

plot_decision_regions(X, y, X_train, y_train, X_test, y_test)

### c. Experiment with different k values and see how it affects the loss values and the decision regions.

In [None]:
for i in range(1,3):
  knn, y_train_pred, y_test_pred = train_and_predict_knn(X_train, y_train, X_test, y_test, n_neighbors=i)
  print(f"{color.YELLOW_BOLD}Evaluation and Decision Regions for {i}NN{color.END}")
  evaluate_knn(y_train, y_train_pred, y_test, y_test_pred)
  plot_decision_regions(X, y, X_train, y_train, X_test, y_test)

## **References**:
1. “matplotlib.pyplot.subplots — Matplotlib 3.6.0 documentation,” matplotlib.org. https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.subplots.html
2. scikit-learn, “sklearn.neighbors.KNeighborsClassifier — scikit-learn 0.22.1 documentation,” Scikit-learn.org, 2019. https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html