In [1]:
import numpy as np
import pandas as pd

# 1. Decision tree from scratch using entropy-based information gain

In [2]:
# Weather observations for 14 days; 'Decision' is the label the trees below
# learn to predict, 'Day' is only a row identifier.
df = pd.DataFrame(
    {
        'Day': list(range(1, 15)),
        'Outlook': [
            'Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast',
            'Sunny', 'Sunny', 'Rain', 'Sunny', 'Overcast', 'Overcast', 'Rain',
        ],
        'Temp': [85, 80, 83, 70, 68, 65, 64, 72, 69, 75, 75, 72, 81, 71],
        'Humidity': [85, 90, 78, 96, 80, 70, 65, 95, 70, 80, 70, 90, 75, 80],
        'Wind': [
            'Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong',
            'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong',
        ],
        'Decision': [
            'No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes',
            'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No',
        ],
    }
)
print(df)

    Day   Outlook  Temp  Humidity    Wind Decision
0     1     Sunny    85        85    Weak       No
1     2     Sunny    80        90  Strong       No
2     3  Overcast    83        78    Weak      Yes
3     4      Rain    70        96    Weak      Yes
4     5      Rain    68        80    Weak      Yes
5     6      Rain    65        70  Strong       No
6     7  Overcast    64        65  Strong      Yes
7     8     Sunny    72        95    Weak       No
8     9     Sunny    69        70    Weak      Yes
9    10      Rain    75        80    Weak      Yes
10   11     Sunny    75        70  Strong      Yes
11   12  Overcast    72        90  Strong      Yes
12   13  Overcast    81        75    Weak      Yes
13   14      Rain    71        80  Strong       No


In [3]:
def entropy(y):
    """Return the Shannon entropy, in bits, of the labels in ``y``.

    Parameters
    ----------
    y : pandas.Series
        Class labels of the samples at a node.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the observed label frequencies ``p``.
        A single-class (or empty) series yields 0.
    """
    label_counts = y.value_counts()
    # Keep only labels that actually occur so log2(0) is never evaluated.
    # This replaces the former `+ 1e-9` fudge inside the logarithm, which
    # biased every result and made a pure node come out slightly negative.
    probabilities = label_counts[label_counts > 0] / len(y)
    return -np.sum(probabilities * np.log2(probabilities))

In [4]:
def information_gain(X, y, feature):
    """Entropy reduction obtained by partitioning ``y`` on ``X[feature]``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; rows are matched to ``y`` through a boolean mask, so
        both are expected to share the same index.
    y : pandas.Series
        Class labels.
    feature : hashable
        Name of the column in ``X`` to evaluate.

    Returns
    -------
    float
        ``entropy(y)`` minus the size-weighted entropy of each group of
        labels sharing one distinct value of the feature.
    """
    parent_entropy = entropy(y)
    column = X[feature]

    def weighted_branch_entropy(value):
        # Labels of the rows that take this particular feature value.
        branch_labels = y[column == value]
        return (len(branch_labels) / len(y)) * entropy(branch_labels)

    children_entropy = sum(
        weighted_branch_entropy(value) for value in column.unique()
    )
    return parent_entropy - children_entropy

In [5]:
def best_feature_to_split(X, y):
    """Return the column of ``X`` with the largest information gain on ``y``.

    Every column of ``X`` is scored with ``information_gain``; when several
    columns tie for the maximum, the first one in column order is returned.
    """
    def gain_of(feature):
        return information_gain(X, y, feature)

    return max(X.columns, key=gain_of)

In [6]:
def build_tree(X, y, features):
    """Recursively build a decision tree as nested dictionaries.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table for the samples at this node.
    y : pandas.Series
        Class labels for the same samples (same index as ``X``).
    features : iterable of column names
        Features that may still be used for splitting below this node.

    Returns
    -------
    dict or label
        A leaf is a bare label. An internal node has the shape
        ``{feature: {value: subtree, ...}}`` with one entry per distinct
        value of the chosen feature among the samples at the node.
    """
    # All samples agree: this node is a leaf.
    if len(y.unique()) == 1:
        return y.iloc[0]

    # Only features that have not been consumed higher up may be split on.
    # Scoring every column of X (as was done before) lets an already-used
    # column win a tie when all gains are zero, in which case `features`
    # never shrinks and the recursion does not terminate.
    features = set(features)
    candidate_columns = [column for column in X.columns if column in features]

    # Nothing left to split on: fall back to the majority label.
    if not candidate_columns:
        return y.mode()[0]

    best_feature = best_feature_to_split(X[candidate_columns], y)
    remaining_features = features - {best_feature}

    tree = {best_feature: {}}
    for value in X[best_feature].unique():
        mask = X[best_feature] == value
        tree[best_feature][value] = build_tree(X[mask], y[mask], remaining_features)

    return tree

In [7]:
def classify(tree, sample):
    """Predict the label of ``sample`` by walking ``tree`` from the root.

    Parameters
    ----------
    tree : dict or label
        Nested ``{feature: {value: subtree}}`` mapping; anything that is not
        a dict is treated as a leaf label.
    sample : mapping
        Feature name -> value for the sample being classified.

    Returns
    -------
    The label at the reached leaf, or ``None`` when the sample carries a
    feature value for which the current node has no branch.
    """
    node = tree
    while isinstance(node, dict):
        # Each internal node holds exactly one key: the feature it splits on.
        feature = next(iter(node))
        observed = sample[feature]
        branches = node[feature]
        if observed not in branches:
            return None
        node = branches[observed]
    return node

In [8]:
# Candidate split features: every column except the row id and the target.
features = df.columns.difference(['Day', 'Decision']).tolist()

# Fit the tree on the full table; the feature names are handed over as a set.
decision_tree = build_tree(df[features], df['Decision'], set(features))

# A single unseen observation to run through the fitted tree.
new_sample = {
    'Outlook': 'Sunny',
    'Temp': 75,
    'Humidity': 70,
    'Wind': 'Weak',
}
classification = classify(decision_tree, new_sample)

print("Decision Tree:", decision_tree)
print("Classification for new sample:", classification)


Decision Tree: {'Temp': {85: 'No', 80: 'No', 83: 'Yes', 70: 'Yes', 68: 'Yes', 65: 'No', 64: 'Yes', 72: {'Humidity': {95: 'No', 90: 'Yes'}}, 69: 'Yes', 75: 'Yes', 81: 'Yes', 71: 'No'}}
Classification for new sample: Yes


# 2. Decision tree from scratch using Gini impurity

In [9]:
def gini_impurity(y):
    """Gini impurity of the labels in ``y``: one minus the sum of squared class shares.

    Parameters
    ----------
    y : pandas.Series
        Class labels.

    Returns
    -------
    float
        0 when every label is identical; grows as the labels become more
        evenly mixed.
    """
    class_shares = y.value_counts() / len(y)
    squared_shares = class_shares ** 2
    return 1 - np.sum(squared_shares)

In [10]:
def information_gain_gini(X, y, feature):
    """Reduction in Gini impurity obtained by partitioning ``y`` on ``X[feature]``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; rows are matched to ``y`` through a boolean mask, so
        both are expected to share the same index.
    y : pandas.Series
        Class labels.
    feature : hashable
        Name of the column in ``X`` to evaluate.

    Returns
    -------
    float
        ``gini_impurity(y)`` minus the size-weighted impurity of each group
        of labels sharing one distinct value of the feature.
    """
    parent_impurity = gini_impurity(y)
    column = X[feature]

    branch_terms = []
    for level in column.unique():
        branch_labels = y[column == level]
        branch_weight = len(branch_labels) / len(y)
        branch_terms.append(branch_weight * gini_impurity(branch_labels))

    return parent_impurity - sum(branch_terms)

In [11]:
def best_feature_to_split(X, y):
    """Return the column of ``X`` whose split yields the largest Gini gain on ``y``.

    Every column of ``X`` is scored with ``information_gain_gini``; when
    several columns tie for the maximum, the first one in column order wins.
    """
    gini_gain_by_feature = {}
    for column in X.columns:
        gini_gain_by_feature[column] = information_gain_gini(X, y, column)
    return max(gini_gain_by_feature, key=gini_gain_by_feature.__getitem__)

In [12]:
def build_tree(X, y, features):
    """Recursively build a decision tree as nested dictionaries.

    The split at each node is chosen by whichever ``best_feature_to_split``
    is defined when this function runs.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table for the samples at this node.
    y : pandas.Series
        Class labels for the same samples (same index as ``X``).
    features : iterable of column names
        Features that may still be used for splitting below this node.

    Returns
    -------
    dict or label
        A leaf is a bare label. An internal node has the shape
        ``{feature: {value: subtree, ...}}`` with one entry per distinct
        value of the chosen feature among the samples at the node.
    """
    # All samples agree: this node is a leaf.
    if len(y.unique()) == 1:
        return y.iloc[0]

    # Restrict the search to features not yet consumed on this path.
    # Scoring every column of X (as was done before) lets an already-used
    # column win a tie when all gains are zero; `features` then never
    # shrinks and the recursion does not terminate.
    features = set(features)
    candidate_columns = [column for column in X.columns if column in features]

    # Nothing left to split on: fall back to the majority label.
    if not candidate_columns:
        return y.mode()[0]

    best_feature = best_feature_to_split(X[candidate_columns], y)
    remaining_features = features.difference([best_feature])

    tree = {best_feature: {}}
    for value in X[best_feature].unique():
        mask = X[best_feature] == value
        tree[best_feature][value] = build_tree(X[mask], y[mask], remaining_features)

    return tree

In [13]:
def classify(tree, sample):
    """Return the label ``tree`` assigns to ``sample``.

    Parameters
    ----------
    tree : dict or label
        Nested ``{feature: {value: subtree}}`` mapping; a non-dict value is
        a leaf label.
    sample : mapping
        Feature name -> value for the sample being classified.

    Returns
    -------
    The leaf label reached by following the sample's feature values, or
    ``None`` if a node has no branch for the sample's value.
    """
    # Anything that is not a dict is a leaf: its value is the prediction.
    if not isinstance(tree, dict):
        return tree

    # An internal node stores a single entry: split feature -> branches.
    split_feature, branches = next(iter(tree.items()))
    observed = sample[split_feature]

    if observed not in branches:
        return None
    return classify(branches[observed], sample)

In [14]:
# Candidate split features: every column except the row id and the target.
features = [*df.columns.difference(['Day', 'Decision'])]

# Rebuild the tree; splits are now chosen by the Gini-based
# best_feature_to_split defined above.
decision_tree = build_tree(df[features], df['Decision'], set(features))

# Same unseen observation as in the entropy-based run.
new_sample = {
    'Outlook': 'Sunny',
    'Temp': 75,
    'Humidity': 70,
    'Wind': 'Weak',
}
classification = classify(decision_tree, new_sample)

print("Decision Tree:", decision_tree)
print("Classification for new sample:", classification)

Decision Tree: {'Temp': {85: 'No', 80: 'No', 83: 'Yes', 70: 'Yes', 68: 'Yes', 65: 'No', 64: 'Yes', 72: {'Humidity': {95: 'No', 90: 'Yes'}}, 69: 'Yes', 75: 'Yes', 81: 'Yes', 71: 'No'}}
Classification for new sample: Yes


# 3. CART classifier with scikit-learn on a loan-approval dataset

In [15]:
# Six loan applications: every combination of three income levels and two
# credit ratings, together with the recorded approval outcome.
data = {
    'Income': [level for level in ('Low', 'Medium', 'High') for _ in range(2)],
    'Credit': ['Good', 'Bad'] * 3,
    'Loan Approved': ['Yes', 'No', 'Yes', 'Yes', 'Yes', 'No'],
}
df = pd.DataFrame(data)

In [16]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [17]:
# Integer codes for the categorical columns, as required by scikit-learn.
encodings = {
    'Income': {'Low': 0, 'Medium': 1, 'High': 2},
    'Credit': {'Good': 0, 'Bad': 1},
    'Loan Approved': {'Yes': 1, 'No': 0},
}

# Encode into a new frame instead of overwriting `df`: mapping in place is not
# idempotent (re-running the cell would map the already-encoded integers to
# NaN), whereas reading from the untouched `df` gives the same result each run.
df_encoded = df.assign(
    **{column: df[column].map(mapping) for column, mapping in encodings.items()}
)

X = df_encoded[['Income', 'Credit']]
y = df_encoded['Loan Approved']

# NOTE: with only six rows, test_size=0.2 leaves two rows for evaluation,
# so the accuracy printed below is a very coarse estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fix random_state so the fitted tree (and therefore the printed predictions)
# is reproducible; scikit-learn permutes the features at each split even with
# the default "best" splitter.
cart_classifier = DecisionTreeClassifier(criterion='gini', random_state=42)

cart_classifier.fit(X_train, y_train)

predictions = cart_classifier.predict(X_test)

print("Predictions:", predictions)

accuracy = cart_classifier.score(X_test, y_test)
print("Accuracy:", accuracy)


Predictions: [1 1]
Accuracy: 0.5
