<a href="https://colab.research.google.com/github/krishatuladhar/Data-Warehousing-and-Data-Mining/blob/main/lab3dwm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lab 3
* Write a program to implement ID3.
 (use the attached laptop_buy_data.csv)
* Write a program to implement Naive Bayesian algorithm.
 (use the attached laptop_buy_data.csv)
* Write a program to implement classification by backpropagation on the following data.

       X1  X2   t
      -1  -1  -1
      -1   1   1
       1  -1   1
       1   1  -1





Qno 1:

In [1]:
import pandas as pd
import math
from collections import Counter

# Load data from a CSV file
def load_data(file_path):
    """Parse the CSV at ``file_path`` and return it as a pandas DataFrame.

    ``file_path`` is handed to ``pd.read_csv`` unchanged, so anything that
    function accepts (a path string, a file-like object, ...) works here.
    """
    frame = pd.read_csv(file_path)
    return frame

# Calculate entropy of the target attribute
def calculate_entropy(dataset, target):
    """Return the base-2 Shannon entropy of column ``target`` in ``dataset``.

    Each distinct value in the column contributes ``-p * log2(p)``, where
    ``p`` is that value's share of the rows. A dataset with no rows yields 0.0.
    """
    row_total = len(dataset)
    result = 0.0
    for occurrences in Counter(dataset[target]).values():
        share = occurrences / row_total
        result -= share * math.log2(share)
    return result

# Compute information gain of a specific attribute
def compute_info_gain(dataset, attribute, target):
    """Return the information gain obtained by splitting on ``attribute``.

    The gain is the entropy of ``target`` over the whole ``dataset`` minus
    the size-weighted entropy of ``target`` inside each partition induced by
    the distinct values of ``attribute``.
    """
    parent_entropy = calculate_entropy(dataset, target)
    row_total = len(dataset)

    remainder = 0.0
    for level in dataset[attribute].unique():
        partition = dataset[dataset[attribute] == level]
        share = len(partition) / row_total
        remainder += share * calculate_entropy(partition, target)

    return parent_entropy - remainder

# Select the attribute with the highest information gain
def select_best_attribute(dataset, attributes, target):
    """Return the entry of ``attributes`` with the largest information gain.

    Ties go to the attribute that appears first, because a later candidate
    must be strictly better to replace the current winner. Returns ``None``
    when ``attributes`` is empty.
    """
    winner = None
    top_score = float('-inf')

    for candidate in attributes:
        score = compute_info_gain(dataset, candidate, target)
        if not score > top_score:
            continue
        winner, top_score = candidate, score

    return winner

# Return the most frequent class value
def get_majority_class(dataset, target):
    """Return the most frequent value of column ``target`` in ``dataset``.

    When several values share the top frequency, the first entry of the
    result of ``Series.mode()`` is returned.
    """
    most_common = dataset[target].mode()
    return most_common[0]

# ID3 algorithm implementation
def build_decision_tree(dataset, attributes, target):
    """Recursively build an ID3 decision tree.

    Returns either a bare class label (leaf) or a nested dict of the form
    ``{attribute: {value: subtree_or_label, ...}}``.

    dataset    -- DataFrame holding the rows that reach this node
    attributes -- list of column names still available for splitting
    target     -- name of the class column
    """
    labels_here = dataset[target].unique()

    # Pure node: every row carries the same label
    if len(labels_here) == 1:
        return labels_here[0]

    # Nothing left to split on: fall back to the most common label
    if not attributes:
        return get_majority_class(dataset, target)

    split_on = select_best_attribute(dataset, attributes, target)
    branches = {}

    for level in dataset[split_on].unique():
        partition = dataset[dataset[split_on] == level]
        if not partition.empty:
            # The chosen attribute is not reused further down this branch
            leftover = [name for name in attributes if name != split_on]
            branches[level] = build_decision_tree(partition, leftover, target)
        else:
            branches[level] = get_majority_class(dataset, target)

    return {split_on: branches}

# Pretty-print the decision tree
def display_tree(tree, indent=""):
    """Print ``tree`` as an indented outline.

    A dict node prints one ``[attribute = value]`` line per branch and then
    recurses into that branch with two more spaces of indentation. Anything
    that is not a dict is treated as a leaf and printed after an arrow.
    """
    if isinstance(tree, dict):
        for attribute, branches in tree.items():
            for value, subtree in branches.items():
                print(f"{indent}[{attribute} = {value}]")
                display_tree(subtree, indent + "  ")
        return

    leaf_text = "➤ " + str(tree)
    print(indent + leaf_text)

# Main execution
if __name__ == "__main__":
    target_column = 'Class'  # Adjust if the target column has a different name
    csv_path = "/content/drive/MyDrive/datawarehousinglab/laptop_buy_data.csv"

    data = load_data(csv_path)

    # Every column other than the target is a candidate split attribute
    attribute_list = [name for name in data.columns if name != target_column]
    decision_tree = build_decision_tree(data, attribute_list, target_column)

    print("Generated Decision Tree:")
    display_tree(decision_tree)


Generated Decision Tree:
[Age = Youth]
  [Student = Yes]
    ➤ Buy
  [Student = No]
    [Credit_Rating = Excellent]
      ➤ No
    [Credit_Rating = Fair]
      ➤ Buy
[Age = Middle_Aged]
  [Income = Low]
    [Credit_Rating = Excellent]
      ➤ No
    [Credit_Rating = Fair]
      [Student = Yes]
        ➤ Buy
  [Income = Medium]
    ➤ No
  [Income = High]
    ➤ No
[Age = Senior]
  [Credit_Rating = Fair]
    ➤ No
  [Credit_Rating = Excellent]
    [Income = Low]
      ➤ Buy
    [Income = High]
      [Student = No]
        ➤ Buy
    [Income = Medium]
      ➤ Buy


Qno 2:

In [3]:
import pandas as pd
from collections import defaultdict

class NaiveBayesClassifier:
    """Naive Bayes classifier for categorical attributes with add-one smoothing.

    State populated by ``fit``:
      * ``attributes`` -- feature column names (every column but the target)
      * ``class_probabilities`` -- ``{label: P(label)}``
      * ``conditional_probabilities`` --
        ``{(attribute, value): {label: P(attribute=value | label)}}``
    """

    # Likelihood used for an (attribute, value) pair that was never seen in
    # training. The same factor is applied to every class.
    UNSEEN_VALUE_PROBABILITY = 1e-6

    def __init__(self):
        self.attributes = []                                 # Feature columns, set by fit()
        self.class_probabilities = {}                        # Prior probabilities: P(Class)
        self.conditional_probabilities = defaultdict(dict)   # Conditional: P(Attribute=val | Class)

    def fit(self, data, target_attribute):
        """Estimate priors and smoothed likelihoods from ``data``.

        data             -- DataFrame of categorical columns
        target_attribute -- name of the class column in ``data``

        Calling ``fit`` again replaces everything learned by earlier calls.
        """
        self.attributes = [col for col in data.columns if col != target_attribute]
        total_records = len(data)

        # Prior probability of each class
        class_counts = data[target_attribute].value_counts()
        self.class_probabilities = {
            label: count / total_records for label, count in class_counts.items()
        }

        # Start from an empty table so a refit never keeps likelihoods that
        # were learned from a previous dataset.
        self.conditional_probabilities = defaultdict(dict)

        # The rows belonging to each class do not depend on the attribute,
        # so slice them once instead of once per attribute.
        rows_by_class = {
            label: data[data[target_attribute] == label]
            for label in class_counts.index
        }

        for attribute in self.attributes:
            unique_vals = data[attribute].unique()
            for label, subset in rows_by_class.items():
                value_counts = subset[attribute].value_counts()
                total_in_class = len(subset)

                for val in unique_vals:
                    # Laplace smoothing: add one to every count so a value
                    # absent from a class never gets probability zero.
                    count = value_counts.get(val, 0)
                    smoothed_prob = (count + 1) / (total_in_class + len(unique_vals))
                    self.conditional_probabilities[(attribute, val)][label] = smoothed_prob

    def predict(self, instance):
        """Return the class label with the highest score for ``instance``.

        instance -- mapping-like object with a ``get`` method (a dict or a
                    DataFrame row) keyed by attribute name

        Raises ValueError when no classes have been learned yet.
        """
        if not self.class_probabilities:
            raise ValueError(
                "no classes learned; call fit() with non-empty training data first"
            )

        class_scores = {}
        for label, prior in self.class_probabilities.items():
            probability = prior

            for attribute in self.attributes:
                value = instance.get(attribute)
                # Values never seen in training get a small fallback likelihood
                likelihoods = self.conditional_probabilities.get((attribute, value), {})
                probability *= likelihoods.get(label, self.UNSEEN_VALUE_PROBABILITY)

            class_scores[label] = probability

        # Highest score wins
        return max(class_scores, key=class_scores.get)

    def predict_all(self, dataframe):
        """Return a list with one predicted label per row of ``dataframe``."""
        return [self.predict(row) for _, row in dataframe.iterrows()]


if __name__ == "__main__":
    # Read the training data
    filepath = "/content/drive/MyDrive/datawarehousinglab/laptop_buy_data.csv"
    df = pd.read_csv(filepath)

    # Fit the classifier, using the 'Class' column as the label
    classifier = NaiveBayesClassifier()
    classifier.fit(df, 'Class')

    # A single hand-written instance to classify
    test_sample = {
        'Age': 'Senior',
        'Income': 'Medium',
        'Student': 'No',
        'Credit_Rating': 'Excellent',
    }
    result = classifier.predict(test_sample)

    print("Prediction for test instance:", result)


Prediction for test instance: No


Qno 3:

In [5]:
import numpy as np

# Sigmoid activation and its derivative
def sigmoid(x):
    """Return the logistic function ``1 / (1 + e**-x)`` of ``x``, element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def sigmoid_derivative(x):
    """Return ``x * (1 - x)``, element-wise.

    This is the slope of the sigmoid expressed in terms of its *output*, so
    ``x`` must already be a sigmoid activation, not a pre-activation value.
    """
    complement = 1 - x
    return x * complement

# Trains a 2-2-1 sigmoid network with full-batch backpropagation on the
# four-row table from the lab statement, then prints thresholded predictions.

# Input data (4 samples, 2 features), one row per sample
X = np.array([
    [-1, -1],
    [-1,  1],
    [ 1, -1],
    [ 1,  1]
])

# Target values as a (4, 1) column vector, encoded as -1 / +1
y = np.array([[-1], [1], [1], [-1]])

# Scale y to range [0, 1] to match sigmoid output: -1 -> 0 and +1 -> 1
y_scaled = (y + 1) / 2

# Seed for reproducibility (fixes the random initial weights below)
np.random.seed(1)

# Layer sizes: 2 inputs -> 2 hidden units -> 1 output
input_size = 2
hidden_size = 2
output_size = 1

# Weights start as rand() values rescaled by 2 * r - 1; biases start at zero.
# W1 is drawn before W2, so the draw order determines the starting point.
W1 = 2 * np.random.rand(input_size, hidden_size) - 1
b1 = np.zeros((1, hidden_size))

W2 = 2 * np.random.rand(hidden_size, output_size) - 1
b2 = np.zeros((1, output_size))

# Training parameters
epochs = 10000
learning_rate = 0.1

# Training loop: every epoch processes all four samples at once
for epoch in range(epochs):
    # --- Forward Pass ---
    # Hidden layer: pre-activation z1 and activation a1
    z1 = np.dot(X, W1) + b1
    a1 = sigmoid(z1)

    # Output layer: pre-activation z2 and activation a2
    z2 = np.dot(a1, W2) + b2
    a2 = sigmoid(z2)

    # --- Backward Pass ---
    # error is target minus output, which is why the updates below use +=
    error = y_scaled - a2
    # sigmoid_derivative is given the activations (a2, a1), not z2 / z1
    d_output = error * sigmoid_derivative(a2)

    # Push the output delta back through W2. This must run before W2 is
    # updated so that it uses the same weights as the forward pass.
    d_hidden = d_output.dot(W2.T) * sigmoid_derivative(a1)

    # --- Weight Update ---
    # Each weight change is (layer input)^T . delta, scaled by the learning
    # rate; each bias change is the delta summed over the samples.
    W2 += a1.T.dot(d_output) * learning_rate
    b2 += np.sum(d_output, axis=0, keepdims=True) * learning_rate

    W1 += X.T.dot(d_hidden) * learning_rate
    b1 += np.sum(d_hidden, axis=0, keepdims=True) * learning_rate

# Final prediction: one more forward pass with the trained weights
output = sigmoid(np.dot(sigmoid(np.dot(X, W1) + b1), W2) + b2)
# Threshold at 0.5 to get 0/1 labels; the targets go through the same
# threshold so both printouts use the same 0/1 encoding.
predicted = (output > 0.5).astype(int)
true_label = (y_scaled > 0.5).astype(int)

print("Predicted outputs (0 or 1):\n", predicted)
print("Actual outputs:\n", true_label)


Predicted outputs (0 or 1):
 [[0]
 [1]
 [1]
 [0]]
Actual outputs:
 [[0]
 [1]
 [1]
 [0]]
