In [3]:

#CSE-6363-002-MACHINE LEARNING
#Gowtham Kumar Kanchi
#1002044003 



# Naive Bayes On The Iris Dataset
from csv import reader
from random import seed
from random import randrange
from math import sqrt
from math import exp
from math import pi
import csv
from collections import defaultdict
import numpy as np
from typing import List, Tuple
def load_data(filename):
    with open(filename, 'r') as file:
        return [row for row in csv.reader(file) if row]


def convert_column_to_float(data_list, col_idx):
    for sample in data_list:
        value_str = sample[col_idx].strip()
        sample[col_idx] = float(value_str)

def convert_column_to_integer(data, column_index):
    class_mapping = defaultdict(int)
    for sample in data:
        value = sample[column_index]
        if value not in class_mapping:
            class_mapping[value] = len(class_mapping)
        sample[column_index] = class_mapping[value]
    return dict(class_mapping)


def convert_col_to_int_car(dataset, col_idx):
    for row in dataset:
        value = row[col_idx].strip()
        if value == 'more' or value == '5more':
            row[col_idx] = 5
        else:
            row[col_idx] = int(value)
    return dataset




def split_data_into_folds(data, num_folds):
    np.random.shuffle(data)
    fold_size = len(data) // num_folds
    folds_list = [data[i*fold_size:(i+1)*fold_size] for i in range(num_folds)]
    return folds_list


def calculate_accuracy(true_labels, predicted_labels):
    num_correct = sum([1 for true, pred in zip(true_labels, predicted_labels) if true == pred])
    return num_correct / len(true_labels) * 100




def evaluate_algorithm(data: List[List], algorithm: callable, num_folds: int, *args) -> List[float]:
    def split_data_into_folds(data: List[List], num_folds: int) -> List[Tuple[List[List], List[List]]]:
        n_rows = len(data)
        fold_size = n_rows // num_folds
        folds = []
        for i in range(num_folds):
            start = i * fold_size
            end = start + fold_size
            fold = (data[:start] + data[end:], data[start:end])
            folds.append(fold)
        return folds

    def calculate_accuracy(actual: List, predicted: List) -> float:
        num_correct = sum(1 for a, p in zip(actual, predicted) if a == p)
        return num_correct / len(actual)

    scores = []
    for train_data, test_data in split_data_into_folds(data, num_folds):
        predicted = algorithm(train_data, test_data, *args)
        actual = [row[-1] for row in test_data]
        accuracy = calculate_accuracy(actual, predicted)
        scores.append(accuracy)
    return scores




# Split the dataset by class values, returns a dictionary
def separate_by_class(dataset):
	separated = dict()
	for i in range(len(dataset)):
		vector = dataset[i]
		class_value = vector[-1]
		if (class_value not in separated):
			separated[class_value] = list()
		separated[class_value].append(vector)
	return separated

# Calculate the mean of a list of numbers
def calculate_mean(numbers):
	return sum(numbers)/float(len(numbers))

# Calculate the standard deviation of a list of numbers
def calculate_std_dev(numbers):
    avg_num = calculate_mean(numbers)
    var_sum = sum([(x - avg_num) ** 2 for x in numbers]) / float(len(numbers) - 1)
    return sqrt(var_sum)

# Calculate the mean, standard deviation and count for each column in a dataset
def summarize_data(dataset):
    data_summary = [(calculate_mean(column), calculate_std_dev(column), len(column)) for column in zip(*dataset)]
    del data_summary[-1]
    return data_summary

# Split dataset by class then calculate statistics for each row
def summarize_by_class(dataset):
    class_dict = separate_by_class(dataset)
    class_summary = {}
    for class_val, rows in class_dict.items():
        class_summary[class_val] = summarize_data(rows)
    return class_summary

# Calculate the Gaussian probability distribution function for x
def calculate_prob_distribution(x, mean, std_dev):
    exponent = exp(-((x - mean) ** 2 / (2 * std_dev ** 2)))
    return (1 / (sqrt(2 * pi) * std_dev)) * exponent

def calculate_class_probabilities(class_summary, row):
    total_rows = 0
    class_count = {}
    for class_val in class_summary:
        class_count[class_val] = 0
        for i in range(len(class_summary[class_val])):
            class_count[class_val] += class_summary[class_val][i][2]
            total_rows += class_summary[class_val][i][2]
    class_probabilities = {}
    for class_val in class_summary:
        class_probability = 1
        for i in range(len(class_summary[class_val])):
            mean_val, std_dev_val, _ = class_summary[class_val][i]
            x = row[i]
            if std_dev_val == 0:
                if x != mean_val:
                    class_probability = 0
                    break
            else:
                numerator = 1
                denominator = sqrt(2*pi)*std_dev_val
                exponent = -1/2 * ((x-mean_val)/std_dev_val)**2
                probability_density = numerator/denominator * exp(exponent)
                class_probability *= probability_density
        class_probability *= class_count[class_val]/total_rows
        class_probabilities[class_val] = class_probability
    return class_probabilities



def predict_class(summaries, row):
    class_probabilities = calculate_class_probabilities(summaries, row)
    best_label = max(class_probabilities, key=class_probabilities.get)
    return best_label


def naive_bayes(train_set, test_set):
    summaries = summarize_by_class(train_set)
    predictions = [predict_class(summaries, row) for row in test_set]
    return predictions

seed(1)
filename = 'car_evaluation.csv'
dataset = load_data(filename)


if filename == 'breast_cancer.csv':
    for column_index in range(len(dataset[0])):
        class_mapping = convert_column_to_integer(dataset, column_index)
if filename == 'car_evaluation.csv':
    for sample in dataset:
        del sample[5]
    for column_index in range(2, 4):
        convert_col_to_int_car(dataset, column_index)
    for column_index in range(len(dataset[0])):
        if column_index not in (2, 3):
            class_mapping = convert_column_to_integer(dataset, column_index)
if filename == 'hayes_r.csv':
    for column_index in range(len(dataset[0])-1):
        convert_column_to_float(dataset, column_index)
    class_mapping = convert_column_to_integer(dataset, len(dataset[0])-1)

# evaluate algorithm
n_folds = 10
scores = evaluate_algorithm(dataset, naive_bayes, n_folds)
print("car_evaluation Dataset:")
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))*100))

car_evaluation Dataset:
Scores: [0.7732558139534884, 0.7732558139534884, 0.7441860465116279, 0.7616279069767442, 0.6976744186046512, 0.7093023255813954, 0.6337209302325582, 0.5232558139534884, 0.622093023255814, 0.46511627906976744]
Mean Accuracy: 67.035%


In [2]:
#CSE-6363-002-MACHINE LEARNING
#Gowtham Kumar Kanchi
#1002044003 

# Naive Bayes On The Iris Dataset
from csv import reader
from random import seed
from random import randrange
from math import sqrt
from math import exp
from math import pi
import csv
from collections import defaultdict
import numpy as np
from typing import List, Tuple
def load_data(filename):
    with open(filename, 'r') as file:
        return [row for row in csv.reader(file) if row]


def convert_column_to_float(data_list, col_idx):
    for sample in data_list:
        value_str = sample[col_idx].strip()
        sample[col_idx] = float(value_str)

def convert_column_to_integer(data, column_index):
    class_mapping = defaultdict(int)
    for sample in data:
        value = sample[column_index]
        if value not in class_mapping:
            class_mapping[value] = len(class_mapping)
        sample[column_index] = class_mapping[value]
    return dict(class_mapping)


def convert_col_to_int_car(dataset, col_idx):
    for row in dataset:
        value = row[col_idx].strip()
        if value == 'more' or value == '5more':
            row[col_idx] = 5
        else:
            row[col_idx] = int(value)
    return dataset




def split_data_into_folds(data, num_folds):
    np.random.shuffle(data)
    fold_size = len(data) // num_folds
    folds_list = [data[i*fold_size:(i+1)*fold_size] for i in range(num_folds)]
    return folds_list


def calculate_accuracy(true_labels, predicted_labels):
    num_correct = sum([1 for true, pred in zip(true_labels, predicted_labels) if true == pred])
    return num_correct / len(true_labels) * 100




def evaluate_algorithm(data: List[List], algorithm: callable, num_folds: int, *args) -> List[float]:
    def split_data_into_folds(data: List[List], num_folds: int) -> List[Tuple[List[List], List[List]]]:
        n_rows = len(data)
        fold_size = n_rows // num_folds
        folds = []
        for i in range(num_folds):
            start = i * fold_size
            end = start + fold_size
            fold = (data[:start] + data[end:], data[start:end])
            folds.append(fold)
        return folds

    def calculate_accuracy(actual: List, predicted: List) -> float:
        num_correct = sum(1 for a, p in zip(actual, predicted) if a == p)
        return num_correct / len(actual)

    scores = []
    for train_data, test_data in split_data_into_folds(data, num_folds):
        predicted = algorithm(train_data, test_data, *args)
        actual = [row[-1] for row in test_data]
        accuracy = calculate_accuracy(actual, predicted)
        scores.append(accuracy)
    return scores




# Split the dataset by class values, returns a dictionary
def separate_by_class(dataset):
	separated = dict()
	for i in range(len(dataset)):
		vector = dataset[i]
		class_value = vector[-1]
		if (class_value not in separated):
			separated[class_value] = list()
		separated[class_value].append(vector)
	return separated

# Calculate the mean of a list of numbers
def calculate_mean(numbers):
	return sum(numbers)/float(len(numbers))

# Calculate the standard deviation of a list of numbers
def calculate_std_dev(numbers):
    avg_num = calculate_mean(numbers)
    var_sum = sum([(x - avg_num) ** 2 for x in numbers]) / float(len(numbers) - 1)
    return sqrt(var_sum)

# Calculate the mean, standard deviation and count for each column in a dataset
def summarize_data(dataset):
    data_summary = [(calculate_mean(column), calculate_std_dev(column), len(column)) for column in zip(*dataset)]
    del data_summary[-1]
    return data_summary

# Split dataset by class then calculate statistics for each row
def summarize_by_class(dataset):
    class_dict = separate_by_class(dataset)
    class_summary = {}
    for class_val, rows in class_dict.items():
        class_summary[class_val] = summarize_data(rows)
    return class_summary

# Calculate the Gaussian probability distribution function for x
def calculate_prob_distribution(x, mean, std_dev):
    exponent = exp(-((x - mean) ** 2 / (2 * std_dev ** 2)))
    return (1 / (sqrt(2 * pi) * std_dev)) * exponent

def calculate_class_probabilities(class_summary, row):
    total_rows = 0
    class_count = {}
    for class_val in class_summary:
        class_count[class_val] = 0
        for i in range(len(class_summary[class_val])):
            class_count[class_val] += class_summary[class_val][i][2]
            total_rows += class_summary[class_val][i][2]
    class_probabilities = {}
    for class_val in class_summary:
        class_probability = 1
        for i in range(len(class_summary[class_val])):
            mean_val, std_dev_val, _ = class_summary[class_val][i]
            x = row[i]
            if std_dev_val == 0:
                if x != mean_val:
                    class_probability = 0
                    break
            else:
                numerator = 1
                denominator = sqrt(2*pi)*std_dev_val
                exponent = -1/2 * ((x-mean_val)/std_dev_val)**2
                probability_density = numerator/denominator * exp(exponent)
                class_probability *= probability_density
        class_probability *= class_count[class_val]/total_rows
        class_probabilities[class_val] = class_probability
    return class_probabilities



def predict_class(summaries, row):
    class_probabilities = calculate_class_probabilities(summaries, row)
    best_label = max(class_probabilities, key=class_probabilities.get)
    return best_label


def naive_bayes(train_set, test_set):
    summaries = summarize_by_class(train_set)
    predictions = [predict_class(summaries, row) for row in test_set]
    return predictions

seed(1)
filename = 'breast_cancer.csv'
dataset = load_data(filename)


if filename == 'breast_cancer.csv':
    for column_index in range(len(dataset[0])):
        class_mapping = convert_column_to_integer(dataset, column_index)
if filename == 'car_evaluation.csv':
    for sample in dataset:
        del sample[5]
    for column_index in range(2, 4):
        convert_col_to_int_car(dataset, column_index)
    for column_index in range(len(dataset[0])):
        if column_index not in (2, 3):
            class_mapping = convert_column_to_integer(dataset, column_index)
if filename == 'hayes_r.csv':
    for column_index in range(len(dataset[0])-1):
        convert_column_to_float(dataset, column_index)
    class_mapping = convert_column_to_integer(dataset, len(dataset[0])-1)

# evaluate algorithm
n_folds = 10
scores = evaluate_algorithm(dataset, naive_bayes, n_folds)
print("Breast_Cancer Dataset:")
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))*100))

Breast_Cancer Dataset:
Scores: [1.0, 1.0, 1.0, 1.0, 0.8571428571428571, 0.4642857142857143, 0.6428571428571429, 0.8214285714285714, 0.5, 0.4642857142857143]
Mean Accuracy: 77.500%


In [4]:
#CSE-6363-002-MACHINE LEARNING
#Gowtham Kumar Kanchi
#1002044003 



# Naive Bayes On The Iris Dataset
from csv import reader
from random import seed
from random import randrange
from math import sqrt
from math import exp
from math import pi
import csv
from collections import defaultdict
import numpy as np
from typing import List, Tuple
def load_data(filename):
    with open(filename, 'r') as file:
        return [row for row in csv.reader(file) if row]


def convert_column_to_float(data_list, col_idx):
    for sample in data_list:
        value_str = sample[col_idx].strip()
        sample[col_idx] = float(value_str)

def convert_column_to_integer(data, column_index):
    class_mapping = defaultdict(int)
    for sample in data:
        value = sample[column_index]
        if value not in class_mapping:
            class_mapping[value] = len(class_mapping)
        sample[column_index] = class_mapping[value]
    return dict(class_mapping)


def convert_col_to_int_car(dataset, col_idx):
    for row in dataset:
        value = row[col_idx].strip()
        if value == 'more' or value == '5more':
            row[col_idx] = 5
        else:
            row[col_idx] = int(value)
    return dataset




def split_data_into_folds(data, num_folds):
    np.random.shuffle(data)
    fold_size = len(data) // num_folds
    folds_list = [data[i*fold_size:(i+1)*fold_size] for i in range(num_folds)]
    return folds_list


def calculate_accuracy(true_labels, predicted_labels):
    num_correct = sum([1 for true, pred in zip(true_labels, predicted_labels) if true == pred])
    return num_correct / len(true_labels) * 100




def evaluate_algorithm(data: List[List], algorithm: callable, num_folds: int, *args) -> List[float]:
    def split_data_into_folds(data: List[List], num_folds: int) -> List[Tuple[List[List], List[List]]]:
        n_rows = len(data)
        fold_size = n_rows // num_folds
        folds = []
        for i in range(num_folds):
            start = i * fold_size
            end = start + fold_size
            fold = (data[:start] + data[end:], data[start:end])
            folds.append(fold)
        return folds

    def calculate_accuracy(actual: List, predicted: List) -> float:
        num_correct = sum(1 for a, p in zip(actual, predicted) if a == p)
        return num_correct / len(actual)

    scores = []
    for train_data, test_data in split_data_into_folds(data, num_folds):
        predicted = algorithm(train_data, test_data, *args)
        actual = [row[-1] for row in test_data]
        accuracy = calculate_accuracy(actual, predicted)
        scores.append(accuracy)
    return scores




# Split the dataset by class values, returns a dictionary
def separate_by_class(dataset):
	separated = dict()
	for i in range(len(dataset)):
		vector = dataset[i]
		class_value = vector[-1]
		if (class_value not in separated):
			separated[class_value] = list()
		separated[class_value].append(vector)
	return separated

# Calculate the mean of a list of numbers
def calculate_mean(numbers):
	return sum(numbers)/float(len(numbers))

# Calculate the standard deviation of a list of numbers
def calculate_std_dev(numbers):
    avg_num = calculate_mean(numbers)
    var_sum = sum([(x - avg_num) ** 2 for x in numbers]) / float(len(numbers) - 1)
    return sqrt(var_sum)

# Calculate the mean, standard deviation and count for each column in a dataset
def summarize_data(dataset):
    data_summary = [(calculate_mean(column), calculate_std_dev(column), len(column)) for column in zip(*dataset)]
    del data_summary[-1]
    return data_summary

# Split dataset by class then calculate statistics for each row
def summarize_by_class(dataset):
    class_dict = separate_by_class(dataset)
    class_summary = {}
    for class_val, rows in class_dict.items():
        class_summary[class_val] = summarize_data(rows)
    return class_summary

# Calculate the Gaussian probability distribution function for x
def calculate_prob_distribution(x, mean, std_dev):
    exponent = exp(-((x - mean) ** 2 / (2 * std_dev ** 2)))
    return (1 / (sqrt(2 * pi) * std_dev)) * exponent

def calculate_class_probabilities(class_summary, row):
    total_rows = 0
    class_count = {}
    for class_val in class_summary:
        class_count[class_val] = 0
        for i in range(len(class_summary[class_val])):
            class_count[class_val] += class_summary[class_val][i][2]
            total_rows += class_summary[class_val][i][2]
    class_probabilities = {}
    for class_val in class_summary:
        class_probability = 1
        for i in range(len(class_summary[class_val])):
            mean_val, std_dev_val, _ = class_summary[class_val][i]
            x = row[i]
            if std_dev_val == 0:
                if x != mean_val:
                    class_probability = 0
                    break
            else:
                numerator = 1
                denominator = sqrt(2*pi)*std_dev_val
                exponent = -1/2 * ((x-mean_val)/std_dev_val)**2
                probability_density = numerator/denominator * exp(exponent)
                class_probability *= probability_density
        class_probability *= class_count[class_val]/total_rows
        class_probabilities[class_val] = class_probability
    return class_probabilities



def predict_class(summaries, row):
    class_probabilities = calculate_class_probabilities(summaries, row)
    best_label = max(class_probabilities, key=class_probabilities.get)
    return best_label


def naive_bayes(train_set, test_set):
    summaries = summarize_by_class(train_set)
    predictions = [predict_class(summaries, row) for row in test_set]
    return predictions

seed(1)
filename = 'hayes_r.csv'
dataset = load_data(filename)


if filename == 'breast_cancer.csv':
    for column_index in range(len(dataset[0])):
        class_mapping = convert_column_to_integer(dataset, column_index)
if filename == 'car_evaluation.csv':
    for sample in dataset:
        del sample[5]
    for column_index in range(2, 4):
        convert_col_to_int_car(dataset, column_index)
    for column_index in range(len(dataset[0])):
        if column_index not in (2, 3):
            class_mapping = convert_column_to_integer(dataset, column_index)
if filename == 'hayes_r.csv':
    for column_index in range(len(dataset[0])-1):
        convert_column_to_float(dataset, column_index)
    class_mapping = convert_column_to_integer(dataset, len(dataset[0])-1)

# evaluate algorithm
n_folds = 10
scores = evaluate_algorithm(dataset, naive_bayes, n_folds)
print("Hayes_r Dataset:")
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))*100))

Hayes_r Dataset:
Scores: [0.6923076923076923, 0.5384615384615384, 0.7692307692307693, 0.6153846153846154, 0.7692307692307693, 0.6153846153846154, 0.6923076923076923, 0.6923076923076923, 0.9230769230769231, 0.7692307692307693]
Mean Accuracy: 70.769%
