In [902]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [903]:
cleaned_data_path = (
    "C:\\Users\\htoll\\Desktop\\Uni\\Y3S1\\COMP4702\\report\\Cleaned_data.csv"
)
data = pd.read_csv(cleaned_data_path)

In [904]:
# Separate features and target
X = data.drop(columns=["Species_Population"])
y = data["Species_Population"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)



In [905]:
# Function to add noise to data
def add_noise(X, noise_percent):
    std_devs = X.std(axis=0)
    noise = np.random.normal(0, std_devs, X.shape) * (noise_percent / 100)
    X_noisy = X + noise
    return X_noisy

# Function to add noise to data
noise_levels = [0.01, 0.025, 0.05, 0.1, 0.2]

In [906]:
# Adding noise to the training data before scaling
X_train_noisy_list = [add_noise(X_train, noise) for noise in noise_levels]

# Standardize the data (including noisy versions)
scaler = StandardScaler()
X_train_scaled_list = [scaler.fit_transform(X_noisy) for X_noisy in X_train_noisy_list]
X_test_scaled = scaler.transform(X_test)

In [907]:
# Define the best KNN model parameters
best_knn = KNeighborsClassifier(n_neighbors=9, weights="distance", metric="manhattan")

In [908]:
# Train and evaluate the model on each noisy dataset
results = []
for i, X_train_scaled in enumerate(X_train_scaled_list):
    # Train the model
    best_knn.fit(X_train_scaled, y_train)
    # Predict the labels for the test data
    y_pred = best_knn.predict(X_test_scaled)
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    noise_level = noise_levels[i] * 100
    results.append((f"{noise_level}%", accuracy))
    print(f"Trained on {noise_level}% noise, Accuracy: {accuracy:.4f}")

# Convert results to a DataFrame for better visualization
results_df = pd.DataFrame(results, columns=["Noise_Level", "Accuracy"])
print(results_df)

Trained on 1.0% noise, Accuracy: 0.7139
Trained on 2.5% noise, Accuracy: 0.7139
Trained on 5.0% noise, Accuracy: 0.7139
Trained on 10.0% noise, Accuracy: 0.7110
Trained on 20.0% noise, Accuracy: 0.7225
  Noise_Level  Accuracy
0        1.0%  0.713873
1        2.5%  0.713873
2        5.0%  0.713873
3       10.0%  0.710983
4       20.0%  0.722543
