KNN

1)

In [6]:
import pandas as pd
import numpy as np

def euclidean_distance(row1, row2):
    """Calculate the Euclidean distance between two vectors."""
    distance = np.sqrt(np.sum((row1 - row2) ** 2))
    return distance

def get_neighbors(train, test_row, num_neighbors):
    """Get the nearest neighbors for a test instance."""
    distances = []
    for index, train_row in train.iterrows():
        dist = euclidean_distance(test_row, train_row[:-1])
        distances.append((train_row, dist))
    distances.sort(key=lambda tup: tup[1])
    neighbors = distances[:num_neighbors]
    return [neighbor[0] for neighbor in neighbors]

def predict_regression(train, test_row, num_neighbors):
    """Make a prediction with neighbors."""
    neighbors = get_neighbors(train, test_row, num_neighbors)
    output_values = [neighbor[-1] for neighbor in neighbors]
    prediction = np.mean(output_values)
    return prediction

# Load dataset
df = pd.read_csv('C:/MyFiles/Notes/4th Semester/UCS411 - Artificial Intelligence/Lab Assignments/lab/Salary_Prediction_Dataset.csv')

# Define features and target
X = df[['Experience', 'Written_Score', 'Interview_Score']]
y = df['Salary']
train_data = pd.concat([X, y], axis=1)

# Example test data
test_data = np.array([5, 8, 10])  # Candidate (a)
test_data2 = np.array([8, 7, 6])  # Candidate (b)

# Predict salary using a K value
K = 5
prediction1 = predict_regression(train_data, test_data, K)
prediction2 = predict_regression(train_data, test_data2, K)

print("Predicted Salary for Candidate (a):", prediction1)
print("Predicted Salary for Candidate (b):", prediction2)


Predicted Salary for Candidate (a): 89812.2
Predicted Salary for Candidate (b): 120024.4


2)

In [7]:
import pandas as pd
import numpy as np
from math import sqrt, pi, exp

def split_data(data, test_size=0.2):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_size)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

def calculate_prior(data):
    classes = np.unique(data['Selection'])
    total_count = len(data)
    priors = {}
    for cls in classes:
        priors[cls] = len(data[data['Selection'] == cls]) / total_count
    return priors

def calculate_likelihood(data, feature_name, feature_val, label):
    filtered_data = data[data['Selection'] == label]
    mean, std = filtered_data[feature_name].mean(), filtered_data[feature_name].std()
    exponent = exp(-((feature_val - mean) ** 2 / (2 * std ** 2)))
    return (1 / (sqrt(2 * pi) * std)) * exponent

def classify(data, priors, row):
    classes = np.unique(data['Selection'])
    probabilities = {}
    for cls in classes:
        probabilities[cls] = priors[cls]
        for index in range(len(row)-1):  # last index is the target class
            probabilities[cls] *= calculate_likelihood(data, data.columns[index], row[index], cls)
    return max(probabilities, key=probabilities.get)

# Load data
data = pd.read_csv(r'C:\MyFiles\Notes\4th Semester\UCS411 - Artificial Intelligence\Lab Assignments\lab\HR_Selection_Dataset.csv')

# Split data
train_data, test_data = split_data(data, test_size=0.2)

# Calculate prior probabilities
priors = calculate_prior(train_data)

# Classify unseen data
unseen_data = [
    [90, 5, 8, 10],  # Candidate (a)
    [75, 8, 7, 6]   # Candidate (b)
]

predictions = [classify(train_data, priors, row) for row in unseen_data]
print("Predictions for unseen data:", predictions)


Predictions for unseen data: [1, 1]


3)

In [8]:
import numpy as np

class DecisionNode:
    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

def entropy(y):
    classes, counts = np.unique(y, return_counts=True)
    probabilities = counts / counts.sum()
    return -np.sum(probabilities * np.log2(probabilities + 1e-15))

def information_gain(X, y, feature_index, threshold):
    parent_entropy = entropy(y)
    left_indices = X[:, feature_index] < threshold
    right_indices = ~left_indices
    n = len(y)
    n_left, n_right = np.sum(left_indices), np.sum(right_indices)
    if n_left == 0 or n_right == 0:
        return 0
    e_left, e_right = entropy(y[left_indices]), entropy(y[right_indices])
    n_left, n_right = len(y[left_indices]), len(y[right_indices])
    weighted_entropy = (n_left / n) * e_left + (n_right / n) * e_right
    return parent_entropy - weighted_entropy

def best_split(X, y):
    best_gain = 0
    split = None
    n_features = X.shape[1]
    for feature_index in range(n_features):
        thresholds = np.unique(X[:, feature_index])
        for threshold in thresholds:
            gain = information_gain(X, y, feature_index, threshold)
            if gain > best_gain:
                best_gain = gain
                split = (feature_index, threshold)
    return split

def build_tree(X, y, depth=0, max_depth=None, min_samples_split=2):
    num_samples, num_features = X.shape
    if num_samples >= min_samples_split and (max_depth is None or depth < max_depth):
        split = best_split(X, y)
        if split is not None:
            feature, threshold = split
            left_indices = X[:, feature] < threshold
            right_indices = ~left_indices
            left_subtree = build_tree(X[left_indices], y[left_indices], depth+1, max_depth, min_samples_split)
            right_subtree = build_tree(X[right_indices], y[right_indices], depth+1, max_depth, min_samples_split)
            return DecisionNode(feature, threshold, left_subtree, right_subtree)
    return DecisionNode(value=np.bincount(y).argmax())

def predict(node, x):
    while node.value is None:
        if x[node.feature] < node.threshold:
            node = node.left
        else:
            node = node.right
    return node.value

def accuracy(y_true, y_pred):
    return np.mean(y_true == y_pred)

# Example usage with the Iris dataset
from sklearn.datasets import load_iris
iris = load_iris()
X, y = iris.data, iris.target

# Split data manually for simplicity in this example
np.random.seed(0)
indices = np.random.permutation(len(X))
split_index = int(len(X) * 0.8)
train_indices, test_indices = indices[:split_index], indices[split_index:]
X_train, X_test = X[train_indices], X[test_indices]
y_train, y_test = y[train_indices], y[test_indices]

# Build the decision tree
tree = build_tree(X_train, y_train, max_depth=3, min_samples_split=4)

# Predictions
y_pred = [predict(tree, xi) for xi in X_test]
print("Accuracy:", accuracy(y_test, y_pred))


Accuracy: 0.9333333333333333


4)

In [12]:
import numpy as np
import pandas as pd

class KNN:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        y_pred = np.array([self._predict(x) for x in X])
        return y_pred

    def _predict(self, x):
        # Compute the Euclidean distance between x and all examples in the training set
        distances = np.linalg.norm(self.X_train - x, axis=1)
        # Get the indices of the k nearest samples
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = self.y_train[k_indices]
        # Majority vote, most common class label
        most_common = np.bincount(k_nearest_labels).argmax()
        return most_common

class DecisionTree:
    def __init__(self, max_depth=5):
        self.max_depth = max_depth

    def fit(self, X, y):
        self.tree_ = self._grow_tree(X, y)

    def predict(self, X):
        return np.array([self._predict(inputs, self.tree_) for inputs in X])

    def _grow_tree(self, X, y, depth=0):
        num_samples_per_class = [np.sum(y == i) for i in np.unique(y)]
        predicted_class = np.argmax(num_samples_per_class)
        node = {'type': 'leaf', 'class': predicted_class}

        if depth < self.max_depth:
            idx, thr = self._best_split(X, y)
            if idx is not None:
                indices_left = X[:, idx] < thr
                X_left, y_left = X[indices_left], y[indices_left]
                X_right, y_right = X[~indices_left], y[~indices_left]
                node['type'] = 'node'
                node['index'] = idx
                node['threshold'] = thr
                node['left'] = self._grow_tree(X_left, y_left, depth + 1)
                node['right'] = self._grow_tree(X_right, y_right, depth + 1)
        return node

    def _best_split(self, X, y):
        m, n = X.shape
        if m <= 1:
            return None, None

        num_parent = [np.sum(y == c) for c in range(len(set(y)))]
        best_gini = 1.0 - sum((np.sum(y == c) / m) ** 2 for c in range(len(set(y))))
        best_idx, best_thr = None, None

        for idx in range(n):
            thresholds, classes = zip(*sorted(zip(X[:, idx], y)))
            num_left = [0] * len(num_parent)
            num_right = num_parent.copy()
            for i in range(1, m):
                c = classes[i - 1]
                num_left[c] += 1
                num_right[c] -= 1
                gini_left = 1.0 - sum((num_left[x] / i) ** 2 for x in range(len(num_parent)))
                gini_right = 1.0 - sum((num_right[x] / (m - i)) ** 2 for x in range(len(num_parent)))
                gini = (i * gini_left + (m - i) * gini_right) / m
                if thresholds[i] == thresholds[i - 1]:
                    continue
                if gini < best_gini:
                    best_gini = gini
                    best_idx, best_thr = idx, (thresholds[i] + thresholds[i - 1]) / 2
        return best_idx, best_thr

    def _predict(self, inputs, node):
        if node['type'] == 'leaf':
            return node['class']
        if inputs[node['index']] < node['threshold']:
            return self._predict(inputs, node['left'])
        return self._predict(inputs, node['right'])

# Add GaussianNaiveBayes class and data loading here as previously defined

# Function to evaluate models
def evaluate_models(X_train, X_test, y_train, y_test):
    results = []
    
    # Define models and parameters
    knn_params = [1, 3, 5]
    nb_params = [GaussianNaiveBayes()]  # Assuming only one configuration for simplicity
    dt_params = [None, 3, 5]  # Decision tree depths

    # KNN Model Evaluation
    for k in knn_params:
        model = KNN(k=k)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = (y_pred == y_test).mean()
        results.append(['KNN', k, accuracy])

    # Naive Bayes Model Evaluation
    for model in nb_params:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = (y_pred == y_test).mean()
        results.append(['Naive Bayes', 'default', accuracy])

    # Decision Tree Model Evaluation
    for max_depth in dt_params:
        model = DecisionTree(max_depth=max_depth)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = (y_pred == y_test).mean()
        results.append(['Decision Tree', max_depth, accuracy])

    return results

# Example data loading
# X_train, X_test, y_train, y_test = your_data_loading_function()

# Run evaluation
results = evaluate_models(X_train, X_test, y_train, y_test)

# Save results to CSV
results_df = pd.DataFrame(results, columns=['Model', 'Parameter', 'Accuracy'])
results_df.to_csv('model_comparisons.csv', index=False)


TypeError: '<' not supported between instances of 'int' and 'NoneType'