In [7]:
import pandas as pd
import numpy as np

# Location of the lung-cancer table (CSV, path relative to the notebook).
data_path = 'predicting lung cancer decision tree - there are five features.csv'
data = pd.read_csv(data_path)

# Target labels used by the entropy / information-gain calculations below.
# A bare `data.head()` used to precede this line; since it was not the last
# expression of the cell its result was never rendered, so it has been dropped.
y = data['Lung Cancer']

# Define the calculate_entropy function 
def calculate_entropy(y):
    """Return the Shannon entropy (in bits) of the labels in ``y``.

    Parameters
    ----------
    y : pandas.Series
        Class labels. Only ``value_counts()`` and ``len()`` are used.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the classes that actually occur, where
        ``p`` is the class count divided by ``len(y)``. An empty or
        single-class input yields ``0.0``.
    """
    value_counts = y.value_counts()
    probabilities = value_counts / len(y)
    # Keep only classes that occur. A categorical Series reports unobserved
    # categories with a count of 0, and 0 * log2(0) evaluates to NaN, which
    # would turn the whole sum into NaN (the limit of p*log2(p) as p -> 0 is 0).
    probabilities = probabilities[probabilities > 0]
    # Calculate entropy using the formula
    entropy = -sum(probabilities * np.log2(probabilities))
    # Adding 0.0 turns the "-0.0" produced for a pure set into a plain 0.0.
    return float(entropy) + 0.0

# Define the calculate_information_gain function
def calculate_information_gain(X, y, attribute):
    """Information gain obtained by splitting ``y`` on one column of ``X``.

    Parameters
    ----------
    X : table indexable by column name (e.g. a pandas DataFrame)
        Holds the attribute column.
    y : label collection accepted by ``calculate_entropy``
        Target labels, selectable with a boolean mask built from ``X``.
    attribute : column key
        The column of ``X`` to split on.

    Returns
    -------
    The entropy of ``y`` minus the size-weighted entropy of the subsets of
    ``y`` induced by each distinct value of ``X[attribute]``.
    """
    # Entropy before the split
    baseline = calculate_entropy(y)

    def weighted_part(value):
        # Entropy of the labels whose rows carry `value`, scaled by their share of y
        part = y[X[attribute] == value]
        part_entropy = calculate_entropy(part)
        return (len(part) / len(y)) * part_entropy

    # Entropy after the split: one weighted term per distinct attribute value
    after_split = sum(weighted_part(value) for value in X[attribute].unique())

    return baseline - after_split

# Target variable: the 'Lung Cancer' column
y = data['Lung Cancer']

# Information gain of each candidate attribute with respect to the target
information_gains = {
    name: calculate_information_gain(data, y, name)
    for name in ('tobacco smoking', 'chronic cough', 'radon exposure')
}

information_gains


{'tobacco smoking': 0.2780719051126377,
 'chronic cough': 0.034851554559677034,
 'radon exposure': 0.2364527976600279}

In [8]:
import math

# Area attributed to each class; every class is the sum of two products
area_A = 0.8 * 0.4 + 0.3 * 0.3
area_B = 0.7 * 0.6 + 0.2 * 0.2
area_C = 0.3 * 0.3 + 0.2 * 0.2

# The classes live in a unit square, so the total area is 1
total_area = 1

# Class probability = class area / total area
p_A, p_B, p_C = (area / total_area for area in (area_A, area_B, area_C))

# Shannon entropy (in bits) of the overall class distribution
entropy = -sum(p * math.log2(p) for p in (p_A, p_B, p_C))

entropy


1.4253642047367425

In [9]:
# Define the function to calculate entropy for a given probability distribution
def calculate_entropy(probabilities):
    """Return ``-sum(p * log2(p))`` over the positive entries of ``probabilities``.

    Entries that are zero or negative are skipped. The input is used as
    given: it is neither normalised nor checked to sum to 1.
    """
    accumulated = 0
    for p in probabilities:
        if p > 0:
            accumulated += p * math.log2(p)
    return -accumulated

# Entropy of the class mix inside the "≤" side of three candidate splits.
# Relies on `area_B` / `area_C` from the earlier area cell and on the
# list-based `calculate_entropy`.
# NOTE(review): in every split below the per-class "probabilities" add up to
# more than 1 (1.2, about 1.24 and 1.45 respectively), so the lists passed to
# `calculate_entropy` are not probability distributions. The class areas
# inside each region need to be confirmed against the figure — TODO.

# Split at x ≤ 0.2 (all figures have height 1)
# This split will cut through classes B and C, which have parts in the region (x <= 0.2)
area_B_x_le_02 = 0.2 * 1  # Rectangle of class B in the split region
area_C_x_le_02 = 0.2 * 0.2  # Square of class C in the split region
total_area_x_le_02 = 0.2  # Total area of the split region
# NOTE(review): the class-B area (0.2) already equals the total area of the
# region, yet class C is given a further 0.04 — TODO confirm.

# Calculate probabilities for each class in the split at x ≤ 0.2
p_B_x_le_02 = area_B_x_le_02 / total_area_x_le_02
p_C_x_le_02 = area_C_x_le_02 / total_area_x_le_02

# Entropy for the split at x ≤ 0.2
# (p_B is 1.0 and contributes 0, so the result is just -p_C * log2(p_C))
entropy_x_le_02 = calculate_entropy([p_B_x_le_02, p_C_x_le_02])

# Split at x ≤ 0.7
# This split will cut through class A
area_A_x_le_07 = 0.7 * 0.4  # Rectangle of class A in the split region
total_area_x_le_07 = 0.7  # Total area of the split region

# Calculate probabilities for each class in the split at x ≤ 0.7
# NOTE(review): the full class-B and class-C areas are used here; together
# with the class-A part they total 0.87, which exceeds the region area of 0.7.
p_A_x_le_07 = area_A_x_le_07 / total_area_x_le_07
p_B_x_le_07 = area_B / total_area_x_le_07
p_C_x_le_07 = area_C / total_area_x_le_07

# Entropy for the split at x ≤ 0.7
entropy_x_le_07 = calculate_entropy([p_A_x_le_07, p_B_x_le_07, p_C_x_le_07])

# Split at y ≤ 0.6
# This split will cut through classes A and B
area_B_y_le_06 = 0.7 * 0.6  # Rectangle of class B in the split region
# NOTE(review): the height used for class A is (1 - 0.6), which looks like the
# part above y = 0.6 rather than below it — TODO confirm.
area_A_y_le_06 = 0.8 * (1 - 0.6)  # Rectangle of class A in the split region
total_area_y_le_06 = 1 * 0.6  # Total area of the split region

# Calculate probabilities for each class in the split at y ≤ 0.6
# NOTE(review): the three areas total 0.87 against a region area of 0.6.
p_A_y_le_06 = area_A_y_le_06 / total_area_y_le_06
p_B_y_le_06 = area_B_y_le_06 / total_area_y_le_06
p_C_y_le_06 = area_C / total_area_y_le_06

# Entropy for the split at y ≤ 0.6
entropy_y_le_06 = calculate_entropy([p_A_y_le_06, p_B_y_le_06, p_C_y_le_06])

# Cell output: the three "≤"-side entropies
entropy_x_le_02, entropy_x_le_07, entropy_y_le_06


(0.46438561897747255, 1.3778874048877483, 1.3219405620899831)

In [17]:
# Information gain of the three candidate splits:
#   gain = entropy of the whole set - size-weighted entropy of the two sides.
# Relies on `entropy`, `entropy_x_le_02`, `entropy_x_le_07`, `entropy_y_le_06`
# and the list-based `calculate_entropy` from earlier cells.

# The entropy of the original set was calculated previously
entropy_original = entropy

# Proportion of samples in each subset for the split at x ≤ 0.2
prop_x_le_02 = 0.2
prop_x_gt_02 = 1 - prop_x_le_02

# Weighted average entropy for the split at x ≤ 0.2
# Note: The entropy for the region x > 0.2 can be considered as the remaining entropy after subtracting the entropy for x ≤ 0.2
# NOTE(review): entropy is not additive across regions, so
# `entropy_original - entropy_x_le_02` is not the entropy of the x > 0.2 side.
# The other two splits compute the ">" side from class areas; this one
# presumably should as well — TODO confirm against the figure.
weighted_entropy_x_le_02 = prop_x_le_02 * entropy_x_le_02 + prop_x_gt_02 * (entropy_original - entropy_x_le_02)

# Information Gain for the split at x ≤ 0.2
info_gain_x_le_02 = entropy_original - weighted_entropy_x_le_02

# Proportion of samples in each subset for the split at x ≤ 0.7
prop_x_le_07 = 0.7
prop_x_gt_07 = 1 - prop_x_le_07

# Since the split at x ≤ 0.7 involves the whole class B and C, the entropy for x > 0.7 is just the entropy for the remaining part of class A
# NOTE(review): the single value passed below is 0.12 / 0.3 = 0.4, not 1, so
# the x > 0.7 side is not described by a complete distribution; if only
# class A were present there the entropy would be 0 — TODO confirm.
area_A_x_gt_07 = 0.3 * 0.4
p_A_x_gt_07 = area_A_x_gt_07 / (1 - prop_x_le_07)
entropy_x_gt_07 = calculate_entropy([p_A_x_gt_07])

# Weighted average entropy for the split at x ≤ 0.7
weighted_entropy_x_le_07 = prop_x_le_07 * entropy_x_le_07 + prop_x_gt_07 * entropy_x_gt_07

# Information Gain for the split at x ≤ 0.7
info_gain_x_le_07 = entropy_original - weighted_entropy_x_le_07

# Proportion of samples in each subset for the split at y ≤ 0.6
prop_y_le_06 = 0.6
prop_y_gt_06 = 1 - prop_y_le_06

# Since the split at y ≤ 0.6 involves the whole class C, the entropy for y > 0.6 is just the entropy for the remaining parts of classes A and B
# NOTE(review): the two values passed below are 0.8 and 0.7, which sum to 1.5;
# the class areas (0.32 + 0.28) exceed the 0.4 area of the y > 0.6 side.
# `area_A_y_gt_06` is also the same 0.8 * 0.4 rectangle that the previous cell
# assigned to the y ≤ 0.6 side — TODO confirm.
area_A_y_gt_06 = 0.8 * 0.4
area_B_y_gt_06 = 0.7 * (1 - 0.6)
p_A_y_gt_06 = area_A_y_gt_06 / (1 - prop_y_le_06)
p_B_y_gt_06 = area_B_y_gt_06 / (1 - prop_y_le_06)
entropy_y_gt_06 = calculate_entropy([p_A_y_gt_06, p_B_y_gt_06])

# Weighted average entropy for the split at y ≤ 0.6
weighted_entropy_y_le_06 = prop_y_le_06 * entropy_y_le_06 + prop_y_gt_06 * entropy_y_gt_06

# Information Gain for the split at y ≤ 0.6
info_gain_y_le_06 = entropy_original - weighted_entropy_y_le_06

# Cell output: the three information gains
info_gain_x_le_02, info_gain_x_le_07, info_gain_y_le_06


(0.5637042123338319, 0.3022116499288352, 0.3851023887264644)

In [12]:
import pandas as pd

# Load the Gini-exercise table into a DataFrame.
# Note: this rebinds `data`, replacing the lung-cancer frame loaded earlier
# in the notebook.
csv_path = 'question3 gini table - Sheet1.csv'
data = pd.read_csv(csv_path)

# A bare `data.head()` used to follow here; since it was not the last
# expression of the cell its result was never rendered, so it has been dropped.

# Gini index of the 'class' column: 1 minus the sum of squared class proportions

# Number of rows carrying each class label
class_counts = data['class'].value_counts()

# Share of the counted rows taken by each class
proportions = class_counts.div(class_counts.sum())

# Gini index
gini_index = 1 - sum(proportions.pow(2))
gini_index




0.5

In [13]:

# Contingency table: one row per gender, one column per class (0 where a pair never occurs)
gender_class_counts = (
    data.groupby(['Gender', 'class'])
    .size()
    .unstack(fill_value=0)
)

# Class proportions inside each gender group (each row divided by its row total)
gender_proportions = gender_class_counts.div(gender_class_counts.sum(axis=1), axis=0)

# Gini index of every gender group
gender_gini = 1 - gender_proportions.pow(2).sum(axis=1)

# Weight of a group = its share of all rows in the table
weights = gender_class_counts.sum(axis=1) / len(data)

# Weighted average Gini index for the 'Gender' attribute
gini_index_gender = gender_gini.mul(weights).sum()
gini_index_gender, gender_gini, weights


(0.48,
 Gender
 f    0.48
 m    0.48
 dtype: float64,
 Gender
 f    0.5
 m    0.5
 dtype: float64)

In [14]:
# Gini index of the 'car type' attribute, using a multiway split

# Contingency table: one row per car type, one column per class
car_type_class_counts = (
    data.groupby(['car type', 'class'])
    .size()
    .unstack(fill_value=0)
)

# Class proportions inside each car-type group (each row divided by its row total)
car_type_proportions = car_type_class_counts.div(car_type_class_counts.sum(axis=1), axis=0)

# Gini index of every car-type group
car_type_gini = 1 - car_type_proportions.pow(2).sum(axis=1)

# Weight of a group = its share of all rows in the table
weights_car_type = car_type_class_counts.sum(axis=1) / len(data)

# Weighted average Gini index for the attribute
gini_index_car_type = car_type_gini.mul(weights_car_type).sum()
gini_index_car_type, car_type_gini, weights_car_type


(0.16250000000000003,
 car type
 Family    0.37500
 Luxury    0.21875
 Sports    0.00000
 dtype: float64,
 car type
 Family    0.2
 Luxury    0.4
 Sports    0.4
 dtype: float64)

In [15]:
# Gini index of the 'shirt size' attribute, using a multiway split

# Contingency table: one row per shirt size, one column per class
shirt_size_class_counts = (
    data.groupby(['shirt size', 'class'])
    .size()
    .unstack(fill_value=0)
)

# Class proportions inside each shirt-size group (each row divided by its row total)
shirt_size_proportions = shirt_size_class_counts.div(shirt_size_class_counts.sum(axis=1), axis=0)

# Gini index of every shirt-size group
shirt_size_gini = 1 - shirt_size_proportions.pow(2).sum(axis=1)

# Weight of a group = its share of all rows in the table
weights_shirt_size = shirt_size_class_counts.sum(axis=1) / len(data)

# Weighted average Gini index for the attribute
gini_index_shirt_size = shirt_size_gini.mul(weights_shirt_size).sum()
gini_index_shirt_size, shirt_size_gini, weights_shirt_size


(0.49142857142857144,
 shirt size
 extra large    0.500000
 large          0.500000
 medium         0.489796
 small          0.480000
 dtype: float64,
 shirt size
 extra large    0.20
 large          0.20
 medium         0.35
 small          0.25
 dtype: float64)

In [16]:
# Side lengths of the rectangle in which misclassifications occur
width_misclass, height_misclass = 0.5 - 0.2, 1.0 - 0.7

# Misclassified area = width * height
area_misclass = width_misclass * height_misclass

# The domain is a unit square (total area 1), so the expected error rate is
# the misclassified area divided by 1
unit_square_area = 1
expected_error_rate = area_misclass / unit_square_area
expected_error_rate


0.09000000000000001

In [1]:
import math

# Class balance of S: positive and negative each make up 10/20 of the set
p_positive, p_negative = 10 / 20, 10 / 20

# Entropy of S before any split
entropy_s = -sum(p * math.log2(p) for p in (p_positive, p_negative))

# Splitting on ID leaves every partition with entropy 0, so nothing is
# subtracted and the gain equals the entropy of S
information_gain_id = entropy_s - 0

entropy_s, information_gain_id


(1.0, 1.0)

In [2]:
# Class proportions inside the left-handed and right-handed subsets
p_left_positive, p_left_negative = 9 / 10, 1 / 10
p_right_positive, p_right_negative = 1 / 10, 9 / 10

# Entropy of each subset
entropy_left = -sum(p * math.log2(p) for p in (p_left_positive, p_left_negative))
entropy_right = -sum(p * math.log2(p) for p in (p_right_positive, p_right_negative))

# Each subset is weighted by 10/20 in the post-split entropy
weighted_entropy = (10 / 20) * entropy_left + (10 / 20) * entropy_right

# Information gain for Handedness = entropy of S minus the weighted entropy
information_gain_handedness = entropy_s - weighted_entropy

entropy_left, entropy_right, weighted_entropy, information_gain_handedness


(0.4689955935892812,
 0.4689955935892812,
 0.4689955935892812,
 0.5310044064107188)

In [3]:
# Splitting on ID gives k partitions (each instance forms its own partition)
k = 20

# Split information: every partition holds a 1/k share of the data
share = 1 / k
split_information_id = -sum(share * math.log2(share) for _ in range(k))

# Gain ratio for ID = information gain / split information
gain_ratio_id = information_gain_id / split_information_id

split_information_id, gain_ratio_id


(4.321928094887363, 0.23137821315975915)