### Import Libraries

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
plt.rcParams['figure.figsize'] = (10, 8)
import seaborn as sns
import numpy as np
import pandas as pd
import math
from sklearn.preprocessing import LabelEncoder
import collections
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from ipywidgets import Image
from io import StringIO
import pydotplus

In [None]:
def create_df(dic, feature_list):
    """Build a DataFrame from ``dic`` and one-hot encode the listed columns.

    Parameters
    ----------
    dic : data accepted by ``pd.DataFrame`` (here a dict of equal-length lists).
    feature_list : list of column names to replace by their dummy columns.

    Returns
    -------
    pd.DataFrame
        The columns not named in ``feature_list`` come first, followed by
        the dummy columns produced by ``pd.get_dummies``.
    """
    frame = pd.DataFrame(dic)
    dummies = pd.get_dummies(frame[feature_list])
    # Append the indicator columns, then remove the raw categorical ones.
    return pd.concat([frame, dummies], axis=1).drop(columns=feature_list)

# Some feature values are present in train and absent in test and vice-versa.
def intersect_features(train, test):
    """Restrict both frames to the columns they have in common.

    Parameters
    ----------
    train, test : pd.DataFrame
        Frames whose column sets may differ.

    Returns
    -------
    (train_common, test_common) : tuple of pd.DataFrame
        Both frames reduced to the shared columns, in the order those
        columns appear in ``train``.
    """
    test_keys = set(test.keys())
    # Walk the train columns instead of iterating a set intersection: set
    # order is not stable across runs for string labels, which made the
    # resulting column order (and any first-max tie-breaking downstream)
    # non-reproducible. dict.fromkeys drops repeated labels, as the set did.
    common_feat = [key for key in dict.fromkeys(train.keys()) if key in test_keys]
    return train[common_feat], test[common_feat]

In [None]:
# Names of the categorical input columns handed to create_df for encoding.
features = [
    'Looks',
    'Alcoholic_beverage',
    'Eloquence',
    'Money_spent',
]

In [None]:
# Toy training set: seven observations, four categorical features and the
# target. In the table displayed below, '+' is encoded as 0 and '-' as 1.
df_train = {
    'Looks': ['handsome', 'handsome', 'handsome', 'repulsive',
              'repulsive', 'repulsive', 'handsome'],
    'Alcoholic_beverage': ['yes', 'yes', 'no', 'no', 'yes', 'yes', 'yes'],
    'Eloquence': ['high', 'low', 'average', 'average', 'low',
                  'high', 'average'],
    'Money_spent': ['lots', 'little', 'lots', 'little', 'lots',
                    'lots', 'lots'],
    'Will_go': LabelEncoder().fit_transform(['+', '-', '+', '-', '-', '+', '+']),
}

# One-hot encode the categorical columns; 'Will_go' is passed through as-is.
df_train = create_df(df_train, features)
df_train

Unnamed: 0,Will_go,Looks_handsome,Looks_repulsive,Alcoholic_beverage_no,Alcoholic_beverage_yes,Eloquence_average,Eloquence_high,Eloquence_low,Money_spent_little,Money_spent_lots
0,0,True,False,False,True,False,True,False,False,True
1,1,True,False,False,True,False,False,True,True,False
2,0,True,False,True,False,True,False,False,False,True
3,1,False,True,True,False,True,False,False,True,False
4,1,False,True,False,True,False,False,True,False,True
5,0,False,True,False,True,False,True,False,False,True
6,0,True,False,False,True,True,False,False,False,True


In [None]:
# Toy test set: three observations with the same four categorical features
# and no target column.
df_test = {
    'Looks': ['handsome', 'handsome', 'repulsive'],
    'Alcoholic_beverage': ['no', 'yes', 'yes'],
    'Eloquence': ['average', 'high', 'average'],
    'Money_spent': ['lots', 'little', 'lots'],
}
df_test = create_df(df_test, features)
df_test

Unnamed: 0,Looks_handsome,Looks_repulsive,Alcoholic_beverage_no,Alcoholic_beverage_yes,Eloquence_average,Eloquence_high,Money_spent_little,Money_spent_lots
0,True,False,True,False,True,False,False,True
1,True,False,False,True,False,True,True,False
2,False,True,False,True,True,False,False,True


In [None]:
# Save the target first: 'Will_go' is not a column of df_test, so the
# intersection below removes it from df_train.
# NOTE(review): this cell rebinds df_train/df_test, so running it a second
# time without rebuilding the frames raises KeyError on 'Will_go'.
y = df_train['Will_go']
df_train, df_test = intersect_features(train=df_train, test=df_test)
df_train

Unnamed: 0,Alcoholic_beverage_no,Looks_handsome,Eloquence_high,Looks_repulsive,Alcoholic_beverage_yes,Money_spent_little,Money_spent_lots,Eloquence_average
0,False,True,True,False,True,False,True,False
1,False,True,False,False,True,True,False,False
2,True,True,False,False,False,False,True,True
3,True,False,False,True,False,True,False,True
4,False,False,False,True,True,False,True,False
5,False,False,True,True,True,False,True,False
6,False,True,False,False,True,False,True,True


In [None]:
# Test features after being restricted to the columns shared with df_train.
df_test

Unnamed: 0,Alcoholic_beverage_no,Looks_handsome,Eloquence_high,Looks_repulsive,Alcoholic_beverage_yes,Money_spent_little,Money_spent_lots,Eloquence_average
0,True,True,False,False,False,False,True,True
1,False,True,True,False,True,True,False,False
2,False,False,False,True,True,False,True,True


In [None]:
def entropy_S0():
    """Return the Shannon entropy, in bits, of the initial state S0.

    The two class proportions (3/7 and 4/7) are hard-coded.
    """
    weighted_logs = [p * math.log2(p) for p in (3 / 7, 4 / 7)]
    return -(weighted_logs[0] + weighted_logs[1])

# Evaluate the initial-state entropy once and report it.
S0 = entropy_S0()
print("Entropy of the initial system S0:", S0)

Entropy of the initial system S0: 0.9852281360342515


In [None]:
def entropy(p0, p1):
    """Shannon entropy, in bits, of a two-outcome distribution.

    Parameters
    ----------
    p0, p1 : float
        Probabilities of the two outcomes.

    Returns
    -------
    float
        ``-(p0 * log2(p0) + p1 * log2(p1))``, where an outcome with
        probability 0 contributes nothing.

    Raises
    ------
    ValueError
        If a probability is negative (raised by ``math.log2``).
    """
    total = 0.0
    for p in (p0, p1):
        # math.log2(0) raises ValueError; by the usual convention
        # 0 * log2(0) = 0, so a pure node has entropy 0 instead of crashing.
        if p != 0:
            total -= p * math.log2(p)
    return total

def information_gain():
    """Compute the child entropies and information gain of one fixed split.

    All proportions are hard-coded: the root holds classes in a 3/7 : 4/7
    ratio, the left child (weight 4/7) in 1/4 : 3/4 and the right child
    (weight 3/7) in 2/3 : 1/3.

    Returns
    -------
    (S1, S2, IG) : tuple of float
        Entropy of the left child, entropy of the right child, and the
        root entropy minus the weighted child entropies.
    """
    root_entropy = entropy(3 / 7, 4 / 7)
    S1 = entropy(1 / 4, 3 / 4)  # left child
    S2 = entropy(2 / 3, 1 / 3)  # right child

    IG = root_entropy - (4/7 * S1) - (3/7 * S2)
    return S1, S2, IG
# Unpack the two child entropies and the gain, then report each value.
S1, S2, IG = information_gain()
print(f"Entropy S1: {S1}")
print(f"Entropy S2: {S2}")
print(f"Information Gain (IG): {IG}")

Entropy S1: 0.8112781244591328
Entropy S2: 0.9182958340544896
Information Gain (IG): 0.12808527889139443


In [None]:
def entropy(a_list):
    """Shannon entropy, in bits, of the empirical distribution of ``a_list``.

    Parameters
    ----------
    a_list : sized iterable of hashable items.

    Returns
    -------
    0 for an empty input, otherwise the entropy as a float.
    """
    # Counter preserves first-seen order, so the terms below are accumulated
    # in the order the distinct values first appear.
    tally = collections.Counter(a_list)
    n_items = len(a_list)

    result = 0
    for occurrences in tally.values():
        share = occurrences / n_items
        result -= share * math.log2(share)
    return result

# A group of 20 balls and two smaller groups whose colour counts add up to it
# (8 + 1 = 9 blue, 5 + 6 = 11 yellow).
balls = ['blue'] * 9 + ['yellow'] * 11
balls_left = ['blue'] * 8 + ['yellow'] * 5
balls_right = ['blue'] * 1 + ['yellow'] * 6


# Entropy of each group, then of six values that each occur once.
print(entropy(balls))
print(entropy(balls_left))
print(entropy(balls_right))
print(entropy([1, 2, 3, 4, 5, 6]))

0.9927744539878083
0.9612366047228759
0.5916727785823275
2.584962500721156


In [None]:
def entropy(a_list):
    """Shannon entropy, in bits, of the empirical distribution of ``a_list``.

    Parameters
    ----------
    a_list : sized iterable of hashable items.

    Returns
    -------
    0 for an empty input, otherwise the entropy as a float.
    """
    # Counter preserves first-seen order, so the terms below are accumulated
    # in the order the distinct values first appear.
    tally = collections.Counter(a_list)
    n_items = len(a_list)

    result = 0
    for occurrences in tally.values():
        share = occurrences / n_items
        result -= share * math.log2(share)
    return result

def fair_die_entropy(sides=6):
    """Entropy, in bits, of one roll of a fair die.

    Parameters
    ----------
    sides : int, default 6
        Number of equally likely outcomes. The default reproduces the
        original six-sided computation.

    Returns
    -------
    float
        ``-sides * p * log2(p)`` with ``p = 1 / sides``.

    Raises
    ------
    ValueError
        If ``sides`` is smaller than 1.
    """
    if sides < 1:
        raise ValueError("sides must be a positive integer")
    p = 1 / sides
    return -sides * p * math.log2(p)

# Empirical entropy of a 13-ball group (8 blue, 5 yellow).
balls_left = ['blue'] * 8 + ['yellow'] * 5
print("Entropy of balls_left:", entropy(balls_left))

# Entropy of a fair die computed from the closed-form expression.
print("Entropy of a fair die:", fair_die_entropy())

Entropy of balls_left: 0.9612366047228759
Entropy of a fair die: 2.584962500721156


In [None]:
def entropy(a_list):
    """Shannon entropy, in bits, of the empirical distribution of ``a_list``.

    Parameters
    ----------
    a_list : sized iterable of hashable items.

    Returns
    -------
    0 for an empty input, otherwise the entropy as a float.
    """
    # Counter preserves first-seen order, so the terms below are accumulated
    # in the order the distinct values first appear.
    tally = collections.Counter(a_list)
    n_items = len(a_list)

    result = 0
    for occurrences in tally.values():
        share = occurrences / n_items
        result -= share * math.log2(share)
    return result

def information_gain(root, left, right):
    """Information gain of splitting ``root`` into ``left`` and ``right``.

    Parameters
    ----------
    root, left, right : sized iterables of hashable labels
        The parent group and its two child groups.

    Returns
    -------
    float
        Entropy of ``root`` minus the size-weighted entropies of the
        children (weights are ``len(child) / len(root)``).

    Raises
    ------
    ZeroDivisionError
        If ``root`` is empty.
    """
    parent_entropy = entropy(root)
    child_entropies = entropy(left), entropy(right)
    n_root = len(root)

    weighted_children = (len(left) / n_root * child_entropies[0]
                         + len(right) / n_root * child_entropies[1])
    return parent_entropy - weighted_children

# Two child groups and the 20-ball parent whose colour counts they add up to.
balls_left = ['blue'] * 8 + ['yellow'] * 5
balls_right = ['blue'] * 1 + ['yellow'] * 6
root = ['blue'] * 9 + ['yellow'] * 11

# Gain obtained by splitting the parent into the two child groups.
IG = information_gain(root, balls_left, balls_right)
print("Information Gain:", IG)

Information Gain: 0.16088518841412436


In [None]:
from math import log

# Entropy function as given in the example
def entropy(a_list):
    """Shannon entropy, in bits, of the empirical distribution of ``a_list``.

    Parameters
    ----------
    a_list : iterable of hashable labels (a list or a pandas Series, say).

    Returns
    -------
    0 when the input is empty or holds a single distinct value, otherwise
    the entropy as a float.
    """
    labels = list(a_list)
    size = len(labels)
    # One counting pass instead of a list.count() call per distinct value.
    counts = collections.Counter(labels)
    # Empty or single-class input carries no uncertainty.
    if len(counts) <= 1:
        return 0
    result = 0  # not named `entropy`, which would shadow this function
    for occ in counts.values():
        share = occ / size
        # math.log2 is more accurate than math.log(x, 2), which divides two
        # rounded natural logarithms.
        result -= share * math.log2(share)
    return result

# Information gain calculation as given in the example
def information_gain(root, left, right):
    """Entropy of ``root`` minus the size-weighted entropies of its children.

    Each child is weighted by ``len(child) / len(root)``; an empty ``root``
    therefore raises ZeroDivisionError.
    """
    gain = entropy(root)
    for child in (left, right):
        gain -= (len(child) / len(root)) * entropy(child)
    return gain

# Calculate information gains for all features
def information_gains(X, y):
    """Return the information gain of splitting ``y`` on each column of ``X``.

    For every column, rows where the column equals 0 form the left group and
    rows where it equals 1 form the right group.

    Returns
    -------
    list
        One gain per column, in the order of ``X.columns``.
    """
    return [
        information_gain(y, y[X[feature] == 0], y[X[feature] == 1])
        for feature in X.columns
    ]

# Modified function to find the best feature without recursion
def best_split_once(X, y, feature_names):
    """Find the column of ``X`` with the highest information gain and print it.

    Parameters
    ----------
    X : DataFrame of 0/1 (or boolean) feature columns.
    y : labels, indexable by a boolean mask built from ``X``.
    feature_names : sequence giving a display name for each column position.

    Prints the chosen feature, the number of rows on each side of the split
    and the label entropy of each side. Returns None.
    """
    gains = information_gains(X, y)
    # list.index returns the first maximum, so the earliest column wins ties.
    best_idx = gains.index(max(gains))
    best_feature = feature_names[best_idx]

    print(f'Best feature to split: {best_feature}')

    # Build the two row masks once and reuse them for features and labels.
    chosen = X.iloc[:, best_idx]
    goes_left, goes_right = chosen == 0, chosen == 1
    x_left, x_right = X[goes_left], X[goes_right]

    entropy_left = entropy(y[goes_left])
    entropy_right = entropy(y[goes_right])

    print(f'Samples: {len(x_left)} (left) and {len(x_right)} (right)')
    print(f'Entropy: {entropy_left} (left) and {entropy_right} (right)')
    print('_' * 30 + '\n')

# Example usage (assuming df_train and y are defined as features and target):
# NOTE(review): the output saved below reports 5 + 4 = 9 samples, while the
# df_train displayed earlier has 7 rows — the saved output looks stale;
# re-run the notebook from a fresh kernel to confirm.
best_split_once(df_train, y, df_train.columns)


Best feature to split: Eloquence_high
Samples: 5 (left) and 4 (right)
Entropy: 0.7219280948873623 (left) and 0 (right)
______________________________

