In [1]:
import csv
import random
import math
from collections import Counter, defaultdict
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

In [2]:
# NOTE(review): machine-specific absolute path; consider a configurable data directory.
filename = 'D:/University/Data Analysis & Machine Learning 4 course/data_lab2,3/mushroom/agaricus-lepiota.data'

# Load data: one list of string fields per non-empty CSV record.
# newline='' hands line-ending handling to the csv module, as its documentation
# asks for file objects passed to csv.reader.
with open(filename, 'r', newline='') as f:
    data = [row for row in csv.reader(f) if row]

In [3]:
print("Number of instances loaded:", len(data))
print("Example rows:")
# Slice rather than index 0..4 so the preview also works when fewer than five
# rows were loaded (indexing would raise IndexError).
for row in data[:5]:
    print(row)

Number of instances loaded: 8124
Example rows:
['p', 'x', 's', 'n', 't', 'p', 'f', 'c', 'n', 'k', 'e', 'e', 's', 's', 'w', 'w', 'p', 'w', 'o', 'p', 'k', 's', 'u']
['e', 'x', 's', 'y', 't', 'a', 'f', 'c', 'b', 'k', 'e', 'c', 's', 's', 'w', 'w', 'p', 'w', 'o', 'p', 'n', 'n', 'g']
['e', 'b', 's', 'w', 't', 'l', 'f', 'c', 'b', 'n', 'e', 'c', 's', 's', 'w', 'w', 'p', 'w', 'o', 'p', 'n', 'n', 'm']
['p', 'x', 'y', 'w', 't', 'p', 'f', 'c', 'n', 'n', 'e', 'e', 's', 's', 'w', 'w', 'p', 'w', 'o', 'p', 'k', 's', 'u']
['e', 'x', 's', 'g', 'f', 'n', 'f', 'w', 'b', 'k', 't', 'e', 's', 's', 'w', 'w', 'p', 'w', 'o', 'e', 'n', 'a', 'g']


In [4]:
# Drop every record that contains a '?' field (the dataset's missing-value marker).
data = list(filter(lambda record: '?' not in record, data))

In [5]:
# Column 0 of each record is the class letter; the remaining columns are the attributes.
classes = list(map(lambda record: record[0], data))
features = list(map(lambda record: record[1:], data))

In [6]:
# Numeric target: 'e' -> 0, 'p' -> 1. An unexpected class letter raises KeyError.
class_map = {'e': 0, 'p': 1}
y = list(map(class_map.__getitem__, classes))

In [7]:
# For every attribute column, the sorted list of distinct category letters seen in
# the data. A value's position in this list later serves as its integer code.
unique_values_per_feature = [
    sorted({record[col_idx] for record in features})
    for col_idx in range(len(features[0]))
]

In [8]:
# List the categories observed for each attribute column.
for i in range(len(unique_values_per_feature)):
    print(f"Feature {i}: {unique_values_per_feature[i]}")

Feature 0: ['b', 'c', 'f', 'k', 's', 'x']
Feature 1: ['f', 'g', 's', 'y']
Feature 2: ['b', 'c', 'e', 'g', 'n', 'p', 'w', 'y']
Feature 3: ['f', 't']
Feature 4: ['a', 'c', 'f', 'l', 'm', 'n', 'p']
Feature 5: ['a', 'f']
Feature 6: ['c', 'w']
Feature 7: ['b', 'n']
Feature 8: ['g', 'h', 'k', 'n', 'p', 'r', 'u', 'w', 'y']
Feature 9: ['e', 't']
Feature 10: ['b', 'c', 'e', 'r']
Feature 11: ['f', 'k', 's', 'y']
Feature 12: ['f', 'k', 's', 'y']
Feature 13: ['b', 'c', 'g', 'n', 'p', 'w', 'y']
Feature 14: ['b', 'c', 'g', 'n', 'p', 'w', 'y']
Feature 15: ['p']
Feature 16: ['w', 'y']
Feature 17: ['n', 'o', 't']
Feature 18: ['e', 'l', 'n', 'p']
Feature 19: ['h', 'k', 'n', 'r', 'u', 'w']
Feature 20: ['a', 'c', 'n', 's', 'v', 'y']
Feature 21: ['d', 'g', 'l', 'm', 'p', 'u']


In [9]:
# Ordinal-encode every attribute: a category letter becomes its position in that
# column's sorted list of observed values.
# NOTE(review): these codes impose an arbitrary order on nominal categories, which a
# distance-based model will treat as meaningful; one-hot encoding may be worth trying.

# One value -> code dictionary per column, so each cell is a constant-time lookup
# instead of a linear list.index() scan.
value_to_index = [
    {val: idx for idx, val in enumerate(vals)}
    for vals in unique_values_per_feature
]

X = [
    [value_to_index[col_idx][val] for col_idx, val in enumerate(row)]
    for row in features
]

In [10]:
# Reproducible 80/20 train/validation split over shuffled row indices.
random.seed(42)
indices = list(range(len(X)))
random.shuffle(indices)
split_point = int(0.8 * len(X))

train_idx, val_idx = indices[:split_point], indices[split_point:]

# Gather the rows of each split and store them as NumPy arrays.
X_train = np.array([X[i] for i in train_idx])
y_train = np.array([y[i] for i in train_idx])
X_val = np.array([X[i] for i in val_idx])
y_val = np.array([y[i] for i in val_idx])

In [11]:
# Show the first five encoded samples of each split, training split first.
for heading, labels, rows in (
    ("A few training samples with labels:", y_train, X_train),
    ("\nA few validation samples with labels:", y_val, X_val),
):
    print(heading)
    for i in range(5):
        print(f"№{i} Class: {labels[i]} Features: {rows[i]}")

A few training samples with labels:
№0 Class: 0 Features: [5 0 4 1 5 1 0 0 7 1 0 2 2 4 4 0 0 1 3 1 5 0]
№1 Class: 1 Features: [5 2 0 1 2 1 0 0 1 1 0 0 2 5 5 0 0 1 3 0 4 5]
№2 Class: 1 Features: [5 0 3 0 2 1 0 0 4 0 0 1 1 0 3 0 0 1 1 0 5 0]
№3 Class: 1 Features: [5 0 3 0 1 1 1 1 6 0 0 2 2 5 5 0 0 1 3 2 4 0]
№4 Class: 1 Features: [5 3 6 1 6 1 0 1 2 0 2 2 2 5 5 0 0 1 3 2 3 1]

A few validation samples with labels:
№0 Class: 0 Features: [5 2 6 1 0 1 0 0 7 0 1 2 2 5 5 0 0 1 3 1 3 1]
№1 Class: 1 Features: [5 3 4 1 6 1 0 1 3 0 2 2 2 5 5 0 0 1 3 2 3 5]
№2 Class: 0 Features: [5 0 6 0 5 1 1 0 4 1 2 0 2 5 5 0 0 1 0 1 3 1]
№3 Class: 0 Features: [2 2 6 0 5 1 1 0 4 1 2 2 2 5 5 0 0 1 0 2 0 1]
№4 Class: 1 Features: [5 2 0 1 2 1 0 0 1 1 0 0 0 5 5 0 0 1 3 0 3 1]


In [12]:
def chi_square_feature(X, y, feature_index):
    """Pearson chi-squared statistic between one feature column and the labels.

    Builds the contingency table of (feature value, class) counts and sums
    ``(observed - expected)**2 / expected`` over every cell, where
    ``expected = value_count * class_count / total``.

    Args:
        X: Sequence of samples; each sample is indexable by ``feature_index``.
        y: Sequence of class labels, one per sample in ``X``.
        feature_index: Position of the feature to score inside each sample.

    Returns:
        The chi-squared statistic as a float; ``0.0`` when ``X`` is empty.

    Raises:
        ValueError: If ``X`` and ``y`` have different lengths. The pairs would
            otherwise be truncated silently while the total still counted every
            row of ``X``, producing a wrong statistic.
    """
    col = [x[feature_index] for x in X]
    total = len(col)
    if total != len(y):
        raise ValueError(
            f"X and y must have the same length, got {total} and {len(y)}"
        )

    # Marginal and joint frequencies. Counter keeps first-seen order, so cells
    # are accumulated in the same order as the samples introduce them.
    val_counts = Counter(col)
    cls_counts = Counter(y)
    counts = Counter(zip(col, y))

    chi2 = 0.0
    for v, n_v in val_counts.items():
        for c, n_c in cls_counts.items():
            # Both marginals are at least 1 here, so expected is strictly positive.
            expected = (n_v * n_c) / total
            # Counter yields 0 for (value, class) pairs that never co-occurred.
            chi2 += (counts[(v, c)] - expected) ** 2 / expected

    return chi2

In [13]:
# Score every feature on the training split and rank the (feature index, score)
# pairs by chi-squared score, highest first. The sort is stable, so equal scores
# keep ascending feature-index order.
chi2_scores = sorted(
    (
        (f_idx, chi_square_feature(X_train, y_train, f_idx))
        for f_idx in range(len(X_train[0]))
    ),
    key=lambda pair: pair[1],
    reverse=True,
)

In [14]:
# Report the ten highest-ranked (feature index, score) pairs.
print("Top 10 features by Chi-Squared score:")
for entry in chi2_scores[:10]:
    print(f"Feature {entry[0]}: {entry[1]}")

Top 10 features by Chi-Squared score:
Feature 4: 4216.231587517972
Feature 19: 3066.755544614285
Feature 18: 2368.9379968954386
Feature 11: 2273.2428804854344
Feature 12: 2177.760791727101
Feature 13: 1571.7964100729591
Feature 9: 1562.7992228158053
Feature 14: 1461.0214147272734
Feature 8: 1224.2251270707259
Feature 2: 1027.012391729265


In [15]:
class KNN:
    """Minimal wrapper around scikit-learn's ``KNeighborsClassifier``.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, k=5):
        # k: number of neighbours consulted for each prediction.
        self.k = k
        # model: the underlying scikit-learn estimator, created unfitted.
        self.model = KNeighborsClassifier(n_neighbors=k)

    def fit(self, X, y):
        """Fit the wrapped estimator on samples ``X`` with labels ``y``."""
        self.model.fit(X, y)

    def predict(self, X):
        """Return the wrapped estimator's predictions for all rows of ``X`` at once."""
        predictions = self.model.predict(X)
        return predictions

In [16]:
def accuracy(y_true, y_pred):
    """Fraction of positions where ``y_pred`` equals ``y_true``.

    Labels are compared pairwise. The number of matches is divided by
    ``len(y_true)``, so an empty ``y_true`` raises ``ZeroDivisionError``.
    """
    matches = 0
    for a, b in zip(y_true, y_pred):
        matches += int(a == b)
    return matches / len(y_true)

In [17]:
def forward_selection(X_train, y_train, X_val, y_val, max_features=None):
    """Greedy forward feature selection scored by 5-NN validation accuracy.

    Starting from no features, each round tries adding every remaining feature
    to the current selection, fits a ``KNN(k=5)`` on the training columns and
    scores it on the validation columns. The feature giving the highest accuracy
    is kept only if it strictly beats the best accuracy seen so far. Selection
    stops when a round yields no improvement or ``max_features`` is reached.

    Note that the validation split drives the selection, so accuracy reported
    later on that same split for the chosen subset is an optimistic estimate.

    Args:
        X_train: 2-D training samples (rows are samples, columns are features).
        y_train: Training labels, one per row of ``X_train``.
        X_val: 2-D validation samples with the same columns as ``X_train``.
        y_val: Validation labels, one per row of ``X_val``.
        max_features: Optional cap on the number of selected features;
            ``None`` means no cap.

    Returns:
        List of selected feature indices, in the order they were added.
    """
    # Convert once so each trial slices columns directly instead of rebuilding
    # nested Python lists for every candidate feature.
    X_train_arr = np.asarray(X_train)
    X_val_arr = np.asarray(X_val)

    available_features = set(range(len(X_train[0])))
    selected_features = []
    best_acc = 0.0
    last_improvement = True

    # We can stop either when no improvement or if we reached max_features
    while last_improvement and (max_features is None or len(selected_features) < max_features):
        last_improvement = False
        candidate_feature = None

        # Explicit ascending order: with the strict '>' below, ties always go to
        # the lowest feature index rather than depending on set iteration order.
        for f in sorted(available_features):
            trial_features = selected_features + [f]

            model = KNN(k=5)
            model.fit(X_train_arr[:, trial_features], y_train)
            y_val_pred = model.predict(X_val_arr[:, trial_features])
            val_acc = accuracy(y_val, y_val_pred)
            if val_acc > best_acc:
                best_acc = val_acc
                candidate_feature = f

        if candidate_feature is not None:
            selected_features.append(candidate_feature)
            available_features.remove(candidate_feature)
            last_improvement = True

    return selected_features

In [18]:
# Wrapper method: greedy forward selection with no cap on the number of features.
wrapper_selected_features = forward_selection(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
)

In [19]:
def evaluate_top_n_features_chi2(n):
    """Validation accuracy of a 5-NN model using the ``n`` best chi-squared features.

    Reads the notebook-level ``chi2_scores`` ranking and the ``X_train``,
    ``y_train``, ``X_val`` and ``y_val`` splits.
    """
    top_features = [pair[0] for pair in chi2_scores[:n]]

    def keep_top(rows):
        # Project every sample onto the chosen columns, in ranking order.
        return [[sample[j] for j in top_features] for sample in rows]

    train_subset = keep_top(X_train)
    val_subset = keep_top(X_val)

    knn = KNN(k=5)
    knn.fit(train_subset, y_train)
    return accuracy(y_val, knn.predict(val_subset))

In [20]:
# Filter method: try the top-n chi-squared features for every n from 1 to the full
# feature count, and keep the smallest n that reaches the highest validation accuracy.
best_chi2_n, best_chi2_acc = None, 0.0
for n in range(1, len(X_train[0]) + 1):
    val_acc = evaluate_top_n_features_chi2(n)
    if not val_acc > best_chi2_acc:
        continue
    best_chi2_n, best_chi2_acc = n, val_acc

chi2_selected_features = [pair[0] for pair in chi2_scores[:best_chi2_n]]

In [21]:
def _fit_and_score(train_rows, val_rows):
    """Fit a 5-NN model on ``train_rows``/``y_train`` and score it on ``val_rows``/``y_val``.

    Returns the fitted model, its validation predictions and its validation accuracy.
    """
    knn = KNN(k=5)
    knn.fit(train_rows, y_train)
    predictions = knn.predict(val_rows)
    return knn, predictions, accuracy(y_val, predictions)


def _project(rows, columns):
    """Keep only ``columns`` (in the given order) from every sample in ``rows``."""
    return [[sample[j] for j in columns] for sample in rows]


# Baseline: all features
model_all, y_val_pred_all, acc_all = _fit_and_score(X_train, X_val)

# Filter method features
X_train_chi2 = _project(X_train, chi2_selected_features)
X_val_chi2 = _project(X_val, chi2_selected_features)
model_chi2, y_val_pred_chi2, acc_chi2 = _fit_and_score(X_train_chi2, X_val_chi2)

# Wrapper method features
X_train_wrapper = _project(X_train, wrapper_selected_features)
X_val_wrapper = _project(X_val, wrapper_selected_features)
model_wrapper, y_val_pred_wrapper, acc_wrapper = _fit_and_score(X_train_wrapper, X_val_wrapper)

In [23]:
def _show_rows(heading, rows):
    """Print ``heading`` followed by each entry of ``rows`` on its own line."""
    print(heading)
    for entry in rows:
        print(entry)


def _show_predictions(heading, predicted):
    """Print ``heading`` and the true vs. predicted label of the first five validation samples."""
    print(heading)
    for i in range(5):
        print(f"Validation sample {i}: True label = {y_val[i]}, Predicted label = {predicted[i]}")


def _count_matches(predicted):
    """Number of validation samples whose prediction equals the true label."""
    return len([None for yt, yp in zip(y_val, predicted) if yt == yp])


# Sizes of the two selected feature subsets
print("Number of features selected by filter method:", best_chi2_n)
print("Number of features selected by wrapper method:", len(wrapper_selected_features))

# First five rows of each split, reduced to the chi2-selected features
X_train_chi2_sample, X_val_chi2_sample = X_train_chi2[:5], X_val_chi2[:5]
_show_rows("Sample of training data (chi2 selected features, first 5 rows):", X_train_chi2_sample)
_show_rows("Sample of validation data (chi2 selected features, first 5 rows):", X_val_chi2_sample)

# The same preview for the wrapper-selected features
X_train_wrapper_sample, X_val_wrapper_sample = X_train_wrapper[:5], X_val_wrapper[:5]
_show_rows("Sample of training data (wrapper selected features, first 5 rows):", X_train_wrapper_sample)
_show_rows("Sample of validation data (wrapper selected features, first 5 rows):", X_val_wrapper_sample)

# Spot-check a few predictions against the actual labels
_show_predictions("Checking a few predictions vs. actual labels for chi2 selected features:", y_val_pred_chi2)
_show_predictions("Checking a few predictions vs. actual labels for wrapper selected features:", y_val_pred_wrapper)

# Correct-prediction counts next to the accuracies computed earlier
correct_predictions_chi2 = _count_matches(y_val_pred_chi2)
print(f"Chi2 correct predictions: {correct_predictions_chi2} out of {len(y_val)}, accuracy: {acc_chi2}")

correct_predictions_wrapper = _count_matches(y_val_pred_wrapper)
print(f"Wrapper correct predictions: {correct_predictions_wrapper} out of {len(y_val)}, accuracy: {acc_wrapper}")

# One training instance before and after feature selection
sample_index = 0
chosen_sample = X_train[sample_index]
print("Original features for sample 0:", chosen_sample)
print("Label:", y_train[sample_index])
print("Reduced features (chi2):", [chosen_sample[i] for i in chi2_selected_features])
print("Reduced features (wrapper):", [chosen_sample[i] for i in wrapper_selected_features])


# Finally, all accuracies once more for clarity
print("Validation Accuracy (All features):", acc_all)
print("Validation Accuracy (Filter method):", acc_chi2)
print("Validation Accuracy (Wrapper method):", acc_wrapper)

Number of features selected by filter method: 10
Number of features selected by wrapper method: 3
Sample of training data (chi2 selected features, first 5 rows):
[5, 1, 3, 2, 2, 4, 1, 4, 7, 4]
[2, 0, 3, 0, 2, 5, 1, 5, 1, 0]
[2, 0, 1, 1, 1, 0, 0, 3, 4, 3]
[1, 2, 3, 2, 2, 5, 0, 5, 6, 3]
[6, 2, 3, 2, 2, 5, 0, 5, 2, 6]
Sample of validation data (chi2 selected features, first 5 rows):
[0, 1, 3, 2, 2, 5, 0, 5, 7, 6]
[6, 2, 3, 2, 2, 5, 0, 5, 3, 4]
[5, 1, 0, 0, 2, 5, 1, 5, 4, 6]
[5, 2, 0, 2, 2, 5, 1, 5, 4, 6]
[2, 0, 3, 0, 0, 5, 1, 5, 1, 0]
Sample of training data (wrapper selected features, first 5 rows):
[5, 1, 4]
[2, 0, 0]
[2, 0, 3]
[1, 2, 3]
[6, 2, 6]
Sample of validation data (wrapper selected features, first 5 rows):
[0, 1, 6]
[6, 2, 4]
[5, 1, 6]
[5, 2, 6]
[2, 0, 0]
Checking a few predictions vs. actual labels for chi2 selected features:
Validation sample 0: True label = 0, Predicted label = 0
Validation sample 1: True label = 1, Predicted label = 1
Validation sample 2: True label = 0, Pr

In [24]:
# One {code: original category} dictionary per feature. The codes are the positions
# in that feature's entry of unique_values_per_feature.
inverse_mapping = [dict(enumerate(vals)) for vals in unique_values_per_feature]

# Inverse class map
inv_class_map = {0: 'e', 1: 'p'}

def decode_full_sample(numeric_sample):
    """
    Turn a fully encoded sample back into category letters.

    The code at position i is looked up in inverse_mapping[i]. The result is a
    list with one category per element of numeric_sample.
    """
    decoded = []
    for position, code in enumerate(numeric_sample):
        decoded.append(inverse_mapping[position][code])
    return decoded

def decode_sample_with_subset(numeric_sample, subset_features):
    """
    Decode only the features listed in subset_features.

    numeric_sample is the full encoded sample, and each index in subset_features
    selects both the code and its lookup table. Categories are returned in the
    order of subset_features.
    """
    decoded = []
    for feat_idx in subset_features:
        lookup = inverse_mapping[feat_idx]
        decoded.append(lookup[numeric_sample[feat_idx]])
    return decoded

def _show_decoded_predictions(heading, subset_features, predictions):
    """Print the first five validation samples, decoded on ``subset_features``, with actual and predicted class."""
    print(heading)
    for i in range(5):
        decoded = decode_sample_with_subset(X_val[i], subset_features)
        original_label = inv_class_map[y_val[i]]
        predicted_label = inv_class_map[predictions[i]]
        feat_str = ", ".join(f"{f_idx}:{val}" for f_idx, val in zip(subset_features, decoded))
        print(f"№{i} Actual: {original_label} Predicted: {predicted_label} Features: [{feat_str}]")


# Full validation samples decoded back to category letters
print("\nA few validation samples with original categories and labels:")
for i in range(5):
    original_features = decode_full_sample(X_val[i])
    original_label = inv_class_map[y_val[i]]
    numbered = enumerate(original_features)
    feat_str = ", ".join(f"{f_idx}:{val}" for f_idx, val in numbered)
    print(f"№{i} Class: {original_label} Features: [{feat_str}]")

# Chi-Squared selected features (validation) with predictions
_show_decoded_predictions(
    "\nA few validation samples (Chi-Squared selected features) with predictions in original categories:",
    chi2_selected_features,
    y_val_pred_chi2,
)

# Wrapper selected features (validation) with predictions
_show_decoded_predictions(
    "\nA few validation samples (Wrapper selected features) with predictions in original categories:",
    wrapper_selected_features,
    y_val_pred_wrapper,
)


A few validation samples with original categories and labels:
№0 Class: e Features: [0:x, 1:s, 2:w, 3:t, 4:a, 5:f, 6:c, 7:b, 8:w, 9:e, 10:c, 11:s, 12:s, 13:w, 14:w, 15:p, 16:w, 17:o, 18:p, 19:k, 20:s, 21:g]
№1 Class: p Features: [0:x, 1:y, 2:n, 3:t, 4:p, 5:f, 6:c, 7:n, 8:n, 9:e, 10:e, 11:s, 12:s, 13:w, 14:w, 15:p, 16:w, 17:o, 18:p, 19:n, 20:s, 21:u]
№2 Class: e Features: [0:x, 1:f, 2:w, 3:f, 4:n, 5:f, 6:w, 7:b, 8:p, 9:t, 10:e, 11:f, 12:s, 13:w, 14:w, 15:p, 16:w, 17:o, 18:e, 19:k, 20:s, 21:g]
№3 Class: e Features: [0:f, 1:s, 2:w, 3:f, 4:n, 5:f, 6:w, 7:b, 8:p, 9:t, 10:e, 11:s, 12:s, 13:w, 14:w, 15:p, 16:w, 17:o, 18:e, 19:n, 20:a, 21:g]
№4 Class: p Features: [0:x, 1:s, 2:b, 3:t, 4:f, 5:f, 6:c, 7:b, 8:h, 9:t, 10:b, 11:f, 12:f, 13:w, 14:w, 15:p, 16:w, 17:o, 18:p, 19:h, 20:s, 21:g]

A few validation samples (Chi-Squared selected features) with predictions in original categories:
№0 Actual: e Predicted: e Features: [4:a, 19:k, 18:p, 11:s, 12:s, 13:w, 9:e, 14:w, 8:w, 2:w]
№1 Actual: p Predict

In [25]:
# Final summary: subset sizes followed by the validation accuracy of each feature set.
for caption, value in (
    ("Number of features selected by filter method:", best_chi2_n),
    ("Number of features selected by wrapper method:", len(wrapper_selected_features)),
    ("Validation Accuracy (All features):", acc_all),
    ("Validation Accuracy (Filter method):", acc_chi2),
    ("Validation Accuracy (Wrapper method):", acc_wrapper),
):
    print(caption, value)

Number of features selected by filter method: 10
Number of features selected by wrapper method: 3
Validation Accuracy (All features): 1.0
Validation Accuracy (Filter method): 1.0
Validation Accuracy (Wrapper method): 1.0
