In [None]:

%pip install pandas numpy matplotlib

import numpy as np


# Function to calculate entropy
def entropy(labels):
    """Return the Shannon entropy, in bits, of a collection of class labels.

    Args:
        labels: Array-like of class labels (anything ``np.unique`` accepts).

    Returns:
        ``-sum(p * log2(p))`` over the observed label frequencies. An empty
        collection or a collection with a single class yields ``0.0``.
    """
    _, counts = np.unique(labels, return_counts=True)
    if counts.size == 0:
        # No samples means no uncertainty to measure.
        return 0.0
    probabilities = counts / counts.sum()
    # Adding 0.0 turns the negative zero produced for a pure node
    # (-(1 * log2(1)) == -0.0) into a plain 0.0, so it no longer formats
    # as "-0.0000".
    return -np.sum(probabilities * np.log2(probabilities)) + 0.0

# Toy training set for Question 1: (CreditScore, RiskLevel) pairs.
data = [
    (720, "Low"),
    (650, "High"),
    (750, "Low"),
    (600, "High"),
    (780, "Low"),
    (630, "High"),
    (710, "Low"),
    (640, "High"),
]

# Send each label to the side of the "CreditScore <= 650" test it falls on.
left_split, right_split = [], []
for credit, risk in data:
    if credit <= 650:
        left_split.append(risk)
    else:
        right_split.append(risk)

# Entropy of the parent node (all labels together).
labels = [pair[1] for pair in data]
H_before = entropy(labels)

# Entropy of each child node.
H_left, H_right = entropy(left_split), entropy(right_split)

# Average the child entropies, weighting each by its share of the samples.
total = len(data)
left_weight = len(left_split) / total
right_weight = len(right_split) / total
H_after = left_weight * H_left + right_weight * H_right

# Information gain is the entropy removed by the split.
IG = H_before - H_after

# Report the three quantities to four decimal places.
print(f"Entropy before split: {H_before:.4f}")
print(f"Entropy after split: {H_after:.4f}")
print(f"Information Gain: {IG:.4f}")

# Verdict: the split counts as good when it gains more than 0.5 bits.
verdict = (
    " CreditScore=650 is a good split!"
    if IG > 0.5
    else " Consider a different feature for splitting."
)
print(verdict)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ----- Question 2: Variance Reduction in Regression Trees -----
def variance_reduction(data, split_feature, target, threshold=35):
    """Return the variance reduction from splitting ``data`` at a threshold.

    Rows with ``data[split_feature] <= threshold`` form the left child and
    rows with ``data[split_feature] > threshold`` form the right child.

    Args:
        data: pandas DataFrame holding both columns.
        split_feature: Name of the column the split is made on.
        target: Name of the numeric column whose variance is measured.
        threshold: Split point. Defaults to 35, the value that used to be
            hard-coded here, so existing three-argument calls are unchanged.

    Returns:
        Parent variance minus the size-weighted child variances, computed
        with ``np.var``. An empty child contributes zero; an empty ``data``
        yields ``0.0`` instead of dividing by zero.
    """
    n_rows = len(data)
    if n_rows == 0:
        # Nothing to split, so nothing to reduce.
        return 0.0

    parent_variance = np.var(data[target])
    left_split = data[data[split_feature] <= threshold]
    right_split = data[data[split_feature] > threshold]

    # An empty child has no spread; skipping np.var avoids a NaN there.
    left_variance = np.var(left_split[target]) if len(left_split) > 0 else 0
    right_variance = np.var(right_split[target]) if len(right_split) > 0 else 0

    left_weight = len(left_split) / n_rows
    right_weight = len(right_split) / n_rows
    weighted_child_variance = left_weight * left_variance + right_weight * right_variance
    return parent_variance - weighted_child_variance

# Toy regression data for Question 2: an Age column and a CreditScore column.
data = pd.DataFrame(
    {
        'Age': [25, 30, 35, 40, 45, 50],
        'CreditScore': [600, 650, 700, 750, 800, 850],
    }
)
age_split_gain = variance_reduction(data, 'Age', 'CreditScore')
print("Question 2: Variance Reduction for Age=35:", age_split_gain)

# ----- Question 3: Probability Estimation for Missing Values -----
def estimate_probability(train_data, feature, target, positive_label='High Risk'):
    """Return the mean of ``feature`` over rows labelled ``positive_label``.

    NOTE(review): despite the name, the value returned is a feature mean,
    not a probability in [0, 1] -- confirm this is the intended estimate.

    Args:
        train_data: pandas DataFrame holding both columns.
        feature: Name of the numeric column to average.
        target: Name of the label column.
        positive_label: Label value selecting the rows to average. Defaults
            to 'High Risk', the value that used to be hard-coded here.

    Returns:
        The mean of ``feature`` over the matching rows, or NaN when no row
        carries ``positive_label``.
    """
    matching_rows = train_data[train_data[target] == positive_label]
    return matching_rows[feature].mean()

data = pd.DataFrame({'CreditScore': [600, 650, 700, 750, 800], 'Age': [25, 30, 35, 40, 45], 'RiskLevel': ['Low', 'Low', 'High', 'Low', 'High']})
# The labels in this frame are 'High'/'Low', so the label must be passed
# explicitly; with the 'High Risk' default no row matches and the result is NaN.
print("Question 3: Estimated Probability for T2 being High Risk:", estimate_probability(data, 'CreditScore', 'RiskLevel', positive_label='High'))

# ----- Question 4: Batch Gradient Descent -----
def gradient_descent(X, y, theta, alpha, iterations):
    """Run batch gradient descent for linear regression.

    Args:
        X: Design matrix as a NumPy array, one row per sample.
        y: Target values, one per row of ``X``.
        theta: Initial parameter vector as a NumPy array.
        alpha: Learning rate.
        iterations: Number of full-batch update steps.

    Returns:
        The parameter vector after ``iterations`` updates, as floats.
    """
    n_samples = len(y)
    # Work on a float copy of the starting point.
    params = theta.astype(float)
    for _ in range(iterations):
        residuals = X.dot(params) - y
        # Average gradient of the squared-error loss over the whole batch.
        step_direction = (1 / n_samples) * X.T.dot(residuals)
        params = params - alpha * step_direction
    return params

# Design matrix for Question 4: a column of ones plus one feature column.
X = np.array([
    [1, 25],
    [1, 30],
    [1, 35],
    [1, 40],
])
y = np.array([600, 650, 700, 750])
theta = np.array([500.0, 5.0])  # starting parameters, stored as floats
# A single batch update with learning rate 0.01.
theta_updated = gradient_descent(X, y, theta, alpha=0.01, iterations=1)
print("Question 4: Updated Parameters:", theta_updated)

# ----- Question 5: Normal Equation for Multiple Linear Regression -----
def normal_equation(X, y):
    """Solve least squares in closed form via ``pinv(X^T X) X^T y``.

    Args:
        X: Design matrix as a NumPy array, one row per sample.
        y: Target values, one per row of ``X``.

    Returns:
        The parameter vector. The pseudoinverse is used in place of a plain
        inverse, so a singular ``X^T X`` does not raise.
    """
    X_transposed = X.T
    gram_pinv = np.linalg.pinv(np.dot(X_transposed, X))
    projector = np.dot(gram_pinv, X_transposed)
    return np.dot(projector, y)

# Design matrix for Question 5: a column of ones plus two feature columns.
# NOTE: the third column equals (second column - 15) / 5, so the columns are
# linearly dependent and X^T X is singular.
X = np.array([[1, 25, 2], [1, 30, 3], [1, 35, 4], [1, 40, 5]])
y = np.array([600, 650, 700, 750])
theta = normal_equation(X, y)
# The label previously read "Question 4", but this is the Question 5 result.
print("Question 5: Theta from Normal Equation:", theta)

# ----- Question 6: MSE and R² Calculation -----
def evaluate_model(X, y, theta):
    """Return ``(mse, r2)`` for the linear predictions ``X.dot(theta)``.

    Args:
        X: Design matrix as a NumPy array, one row per sample.
        y: True target values, one per row of ``X``.
        theta: Parameter vector.

    Returns:
        A ``(mse, r2)`` tuple: the mean squared error and the coefficient of
        determination. When ``y`` is constant the usual R² ratio is
        undefined (zero total sum of squares); in that case R² is reported
        as 1.0 for a perfect fit and 0.0 otherwise, rather than NaN or -inf.
    """
    residuals = y - X.dot(theta)
    squared_residuals = residuals ** 2
    mse = np.mean(squared_residuals)

    ss_res = np.sum(squared_residuals)
    ss_tot = np.sum((y - np.mean(y)) ** 2)
    if ss_tot == 0:
        # Constant target: avoid 0/0 and x/0.
        r2 = 1.0 if ss_res == 0 else 0.0
    else:
        r2 = 1 - (ss_res / ss_tot)
    return mse, r2

# Score the fit using the X, y and theta most recently assigned above.
metrics = evaluate_model(X, y, theta)
mse, r2 = metrics
print("Question 6: Mean Squared Error:", mse, "R² Score:", r2)

# ----- Question 7: Logistic Regression Prediction and Cost Function -----
def sigmoid(z):
    """Return the logistic function ``1 / (1 + exp(-z))`` element-wise.

    Args:
        z: Scalar or array-like of real numbers.

    Returns:
        A NumPy scalar for scalar input, otherwise an array shaped like ``z``.
    """
    z = np.asarray(z, dtype=float)
    # exp(-|z|) never exceeds 1, so it cannot overflow the way exp(-z) does
    # for large negative z.
    decay = np.exp(-np.abs(z))
    # For z >= 0 this is the textbook form; for z < 0 it is the algebraically
    # equal exp(z) / (1 + exp(z)).
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () unwraps a 0-d array to a NumPy scalar and leaves
    # higher-dimensional arrays as arrays.
    return result[()]

def logistic_regression(X, weights):
    """Return ``sigmoid(np.dot(X, weights))``, the predicted probabilities.

    Args:
        X: A single feature vector or a matrix with one row per sample.
        weights: Weight vector.
    """
    return sigmoid(np.dot(X, weights))

def logistic_cost(X, y, weights, eps=1e-15):
    """Return the mean binary cross-entropy of the logistic predictions.

    Args:
        X: Feature matrix, one row per sample.
        y: Array-like of 0/1 labels, one per row of ``X``.
        weights: Weight vector.
        eps: Predictions are clipped to ``[eps, 1 - eps]`` before taking
            logs, so a saturated prediction of exactly 0 or 1 gives a finite
            cost instead of NaN (from ``0 * log(0)``) or infinity.

    Returns:
        ``-(1/m) * sum(y * log(p) + (1 - y) * log(1 - p))``.
    """
    y = np.asarray(y, dtype=float)
    m = len(y)
    predictions = np.clip(logistic_regression(X, weights), eps, 1 - eps)
    return (-1 / m) * np.sum(y * np.log(predictions) + (1 - y) * np.log(1 - predictions))

# Single sample for Question 7, laid out in the same order as the weights.
X = np.array([1, 30, 600])
weights = np.array([0.5, -0.02, 0.01])
prediction = logistic_regression(X, weights)
# The cost is evaluated on the same sample as a one-row batch with label 1.
cost = logistic_cost(np.array([[1, 30, 600]]), np.array([1]), weights)
print("Question 7: Logistic Regression Prediction:", prediction)
# Label fixed: it previously read "Question 7;" with a semicolon.
print("Question 7: Logistic Regression Cost Function Value:", cost)


Note: you may need to restart the kernel to use updated packages.
Entropy before split: 1.0000
Entropy after split: -0.0000
Information Gain: 1.0000
 CreditScore=650 is a good split!
Variance Reduction for Age=35: 5625.0
Estimated Probability for T2 being High Risk: nan
Updated Parameters: [500.125  10.625]
Theta from Normal Equation: [  36.85258964   30.87649402 -104.38247012]
Mean Squared Error: 1.8230285218844461e-23 R² Score: 1.0
Logistic Regression Prediction: 0.997268039236989
Logistic Regression Cost Function Value: 0.0027356993785360236
