# Probabilistic k-NN Model (Using Only NumPy)

In this notebook, we will:

1. Load `dataset.csv` (containing `launch_speed, launch_angle, target`).
2. Build a k-NN model from scratch, using only NumPy.
3. Output a probability distribution across 8 possible classes:
   - single
   - double
   - triple
   - home run
   - groundoutable
   - flyoutable
   - lineoutable

In [1]:
import numpy as np
from collections import Counter
import warnings

# We will assume the following 8 classes:
ALL_CLASSES = [
    'Single',
    'Double',
    'Triple',
    'Home Run',
    'Groundoutable',
    'Flyoutable',
    'Lineoutable']

# Load numeric columns (launch_speed, launch_angle)
# from dataset.csv using NumPy.

data_numeric = np.genfromtxt(
    'dataset.csv',       # CSV filename/path
    delimiter=',',
    skip_header=1,       # skip header row
    usecols=(0, 1),
    dtype=float
)

# Load the target (third column) as strings:
data_labels = np.genfromtxt(
    'dataset.csv',
    delimiter=',',
    skip_header=1,
    usecols=(2),
    dtype=str
)

def knn_predict_probabilities(query_point, features, labels, k=3):
    """
    Given:
      query_point: (launch_speed, launch_angle)
      features:    Nx2 array of numeric data
      labels:      Nx1 array of string labels
      k:           number of neighbors to consider

    Return:
      A dictionary mapping each class in ALL_CLASSES to a probability,
      with special handling:
        - If features is empty (N=0), returns all 0 probabilities and warns.
        - If N < k, uses N as k and warns.
    """
    N = len(features)
    if N == 0:
        warnings.warn("No data points available. Returning all probabilities = 0.", UserWarning)
        # Return a dict of 0.0 for all classes
        return {cls: 0.0 for cls in ALL_CLASSES}

    if N < k:
        warnings.warn(
            f"Number of data points (N={N}) is less than k={k}. Using N={N} instead.",
            UserWarning
        )
        k = N

    # Convert query_point to an array for vectorized math
    query_arr = np.array(query_point)

    # 1. Compute Euclidean distances to all points in 'features'
    #    features has shape (N, 2), query_arr is shape (2,)
    diffs = features - query_arr  # shape (N, 2)
    squared_diffs = diffs ** 2    # shape (N, 2)
    dist_array = np.sqrt(np.sum(squared_diffs, axis=1))  # shape (N,)

    # 2. Sort by distance and pick the indices of the k nearest
    sorted_indices = np.argsort(dist_array)
    k_nearest_indices = sorted_indices[:k]

    # 3. Retrieve the labels of the k nearest neighbors
    k_labels = labels[k_nearest_indices]

    # 4. Count occurrences of each label in the k nearest
    counts = Counter(k_labels)

    # 5. Convert counts to probabilities
    probabilities = {}
    for cls in ALL_CLASSES:
        probabilities[cls] = counts[cls] / k

    return probabilities

print("Data and function loaded successfully.")

Data and function loaded successfully.


### Example Usage

We'll query the model with a hypothetical `launch_speed` and `launch_angle`, then print the probability distribution for each of the 8 classes.

In [2]:
# Example query:
query_point = (92.4,-13.0)
k_value = 10000

# Compute probabilities:
probs = knn_predict_probabilities(query_point, data_numeric, data_labels, k=k_value)

print(f"Probabilities for query = {query_point} (k={k_value}):")
for cls in ALL_CLASSES:
    print(f"  {cls:<15}: {probs[cls]:.3f}")


Probabilities for query = (92.4, -13.0) (k=10000):
  Single         : 0.142
  Double         : 0.041
  Triple         : 0.004
  Home Run       : 0.000
  Groundoutable  : 0.814
  Flyoutable     : 0.000
  Lineoutable    : 0.000


In [3]:
import numpy as np

# Let's define an empty dataset (N=0).
features_empty = np.array([]).reshape(0, 2)  # shape (0, 2)
labels_empty   = np.array([], dtype=str)     # shape (0,)

# Trigger case 1: N=0
print("=== EXAMPLE 1: N=0 dataset ===")
probs_empty = knn_predict_probabilities(query_point=(100.0, 5.0),
                                        features=features_empty,
                                        labels=labels_empty,
                                        k=3)
print("Returned probabilities:", probs_empty)
print()


# A small dataset with N=2 rows.
features_small = np.array([
    [85.0, 12.0],
    [99.0,  1.0],
])
labels_small   = np.array([
    "single",
    "double"
])

# Trigger case 2: N < k (here N=2, we set k=5)
print("=== EXAMPLE 2: N < k ===")
probs_small = knn_predict_probabilities(query_point=(100.0, 5.0),
                                        features=features_small,
                                        labels=labels_small,
                                        k=5)
print("Returned probabilities:", probs_small)


=== EXAMPLE 1: N=0 dataset ===
Returned probabilities: {'Single': 0.0, 'Double': 0.0, 'Triple': 0.0, 'Home Run': 0.0, 'Groundoutable': 0.0, 'Flyoutable': 0.0, 'Lineoutable': 0.0}

=== EXAMPLE 2: N < k ===
Returned probabilities: {'Single': 0.0, 'Double': 0.0, 'Triple': 0.0, 'Home Run': 0.0, 'Groundoutable': 0.0, 'Flyoutable': 0.0, 'Lineoutable': 0.0}


