In [2]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Fitted LabelEncoder per categorical column; filled in by encode_categorical.
label_encoders = dict()
# Per-column {'mean', 'std'} statistics; filled in by normalize_numerical.
scaler_params = dict()
# Module-level list reserved for the chosen feature names; starts out empty.
selected_features = list()

In [4]:
def calculate_entropy(y):
    """Return the Shannon entropy (base 2) of the labels in ``y``.

    Args:
        y: A sized iterable of hashable labels (e.g. a list or a pandas
            Series).

    Returns:
        The entropy in bits; 0 when ``y`` is empty or holds a single
        distinct label.
    """
    n_samples = len(y)
    terms = []
    for occurrences in Counter(y).values():
        share = occurrences / n_samples
        # log2(0) is undefined, so zero-probability terms are skipped.
        if share > 0:
            terms.append(share * math.log2(share))
    return -sum(terms)

In [5]:
def calculate_information_gain(X, y, feature):
    """Return the information gain obtained by splitting ``y`` on ``feature``.

    The gain is the entropy of ``y`` minus the size-weighted entropy of the
    label subsets produced by each distinct value of ``X[feature]``.

    Args:
        X: Table supporting ``X[feature]`` and element-wise ``==`` (the
            notebook passes a pandas DataFrame).
        y: Labels that can be indexed with the boolean mask built from
            ``X[feature]``.
        feature: Name of the column to split on.

    Returns:
        The information gain in bits.
    """
    column = X[feature]
    n_samples = len(y)
    levels, level_sizes = np.unique(column, return_counts=True)

    conditional_entropy = 0
    for level, size in zip(levels, level_sizes):
        labels_at_level = y[column == level]
        conditional_entropy += (size / n_samples) * calculate_entropy(labels_at_level)

    return calculate_entropy(y) - conditional_entropy

In [6]:
def calculate_split_info(X, feature):
    """Return the split information (intrinsic value) of ``feature``.

    This is the base-2 entropy of the distribution of distinct values found
    in ``X[feature]``, with proportions taken relative to ``len(X)``.

    Args:
        X: Table supporting ``X[feature]`` and ``len`` (the notebook passes
            a pandas DataFrame).
        feature: Name of the column to measure.

    Returns:
        The split information in bits; 0 when the column holds a single
        distinct value.
    """
    _, level_sizes = np.unique(X[feature], return_counts=True)
    fractions = level_sizes / len(X)
    return -sum(
        fraction * math.log2(fraction)
        for fraction in fractions
        if fraction > 0
    )

In [7]:
def calculate_gain_ratio(X, y, feature):
    """Return the gain ratio of ``feature``: information gain / split info.

    Args:
        X: Feature table passed through to the helper functions.
        y: Target labels.
        feature: Name of the column to evaluate.

    Returns:
        The gain ratio, or 0 when the split information is zero (which
        avoids a division by zero).
    """
    gain = calculate_information_gain(X, y, feature)
    split_info = calculate_split_info(X, feature)
    if split_info == 0:
        return 0
    return gain / split_info

In [8]:
def select_features(X, y, k=10):
    """Rank every column of ``X`` by gain ratio and return the top ``k`` names.

    The full ranking is printed, one line per feature, best first.

    Args:
        X: Feature table exposing ``.columns``.
        y: Target labels.
        k: Number of top-ranked features to keep (default 10).

    Returns:
        A list with the names of the ``k`` highest-ranked features.
    """
    gain_ratios = {
        feature: calculate_gain_ratio(X, y, feature) for feature in X.columns
    }

    # sorted() is stable, so tied features keep their column order.
    ranking = sorted(gain_ratios.items(), key=lambda pair: pair[1], reverse=True)
    for name, ratio in ranking:
        print(f"Feature: {name}, Gain Ratio: {ratio}")

    return [name for name, _ in ranking[:k]]

In [9]:
def encode_categorical(df, categorical_cols):
    """Replace each categorical column with LabelEncoder output, in place.

    Each fitted encoder is stored in the module-level ``label_encoders``
    dict under its column name.

    Args:
        df: DataFrame to modify; the caller's object is mutated.
        categorical_cols: Iterable of column names to encode.

    Returns:
        The same ``df`` object, with the listed columns encoded.
    """
    for name in categorical_cols:
        encoder = LabelEncoder()
        codes = encoder.fit_transform(df[name])
        df[name] = codes
        label_encoders[name] = encoder
    return df

In [10]:
def normalize_numerical(df, numerical_cols):
    """Standardize numerical columns to zero mean and unit scale, in place.

    Each listed column is replaced by ``(value - mean) / std``, where ``std``
    is the value returned by pandas ``Series.std()``. The statistics actually
    used are stored in the module-level ``scaler_params`` dict under the
    column name.

    Args:
        df: DataFrame to modify; the caller's object is mutated.
        numerical_cols: Iterable of column names to standardize.

    Returns:
        The same ``df`` object, with the listed columns standardized.
    """
    for col in numerical_cols:
        mean = df[col].mean()
        std = df[col].std()
        # A constant column (std == 0) or a single-row frame (std is NaN)
        # would turn the whole column into NaN. Fall back to a unit scale so
        # the column is only centred and the stored std stays usable as a
        # divisor later on.
        if pd.isna(std) or std == 0:
            std = 1.0
        df[col] = (df[col] - mean) / std
        scaler_params[col] = {'mean': mean, 'std': std}
    return df

In [11]:
def preprocess(df, categorical_cols, numerical_cols, target_col, k=10):
    """Encode, standardize and feature-select a raw DataFrame.

    Steps, in order: label-encode the categorical columns, standardize the
    numerical columns, split off the target, then keep the ``k`` features
    with the highest gain ratio.

    Side effects: ``df`` is modified in place by the encoding and
    normalization helpers, and the module-level ``selected_features`` is
    updated with the names of the chosen features.

    Args:
        df: Raw DataFrame containing features and target.
        categorical_cols: Column names to label-encode.
        numerical_cols: Column names to standardize.
        target_col: Name of the target column.
        k: Number of features to keep (default 10, matching the default of
            ``select_features``).

    Returns:
        A tuple ``(X_selected, y)``: the feature frame restricted to the
        chosen columns, and the target column.
    """
    # Without this declaration the assignment below would create a local
    # variable and leave the module-level list empty.
    global selected_features

    df = encode_categorical(df, categorical_cols)
    df = normalize_numerical(df, numerical_cols)

    X = df.drop(target_col, axis=1)
    y = df[target_col]
    selected_features = select_features(X, y, k)

    return X[selected_features], y

In [12]:
# Names assigned to the 21 fields of the headerless data file, in file order;
# the last one ('class') is the prediction target.
columns = [
    'checking_account',
    'duration',
    'credit_history',
    'purpose',
    'credit_amount',
    'savings_account',
    'employment',
    'installment_rate',
    'personal_status_sex',
    'other_debtors',
    'residence_since',
    'property',
    'age',
    'other_installment_plans',
    'housing',
    'existing_credits',
    'job',
    'dependents',
    'telephone',
    'foreign_worker',
    'class',
]

# The file is space-separated and has no header row.
df = pd.read_csv('../dataset/german.txt', sep=' ', header=None, names=columns)

# Columns to be label-encoded.
categorical_columns = [
    'checking_account',
    'credit_history',
    'purpose',
    'savings_account',
    'employment',
    'personal_status_sex',
    'other_debtors',
    'property',
    'other_installment_plans',
    'housing',
    'job',
    'telephone',
    'foreign_worker',
]

# Columns to be standardized.
numerical_columns = [
    'duration',
    'credit_amount',
    'installment_rate',
    'residence_since',
    'age',
    'existing_credits',
    'dependents',
]

In [13]:
# Run the full pipeline with 'class' as the target; the gain-ratio ranking
# of every feature is printed as a side effect.
X_processed, y_processed = preprocess(
    df,
    categorical_columns,
    numerical_columns,
    target_col='class',
)

Feature: credit_amount, Gain Ratio: 0.0840249180763252
Feature: checking_account, Gain Ratio: 0.05257301743857327
Feature: foreign_worker, Gain Ratio: 0.025498722895838353
Feature: credit_history, Gain Ratio: 0.025479578036756906
Feature: duration, Gain Ratio: 0.01682862262191917
Feature: savings_account, Gain Ratio: 0.016658196141477045
Feature: housing, Gain Ratio: 0.011196711794459116
Feature: other_installment_plans, Gain Ratio: 0.010506605048488504
Feature: purpose, Gain Ratio: 0.009335038667521351
Feature: other_debtors, Gain Ratio: 0.008908708970858076
Feature: property, Gain Ratio: 0.008720275014799172
Feature: age, Gain Ratio: 0.008650556005611994
Feature: employment, Gain Ratio: 0.006079416548390452
Feature: personal_status_sex, Gain Ratio: 0.004445227862814241
Feature: installment_rate, Gain Ratio: 0.002195825934842193
Feature: existing_credits, Gain Ratio: 0.0017436399350379077
Feature: telephone, Gain Ratio: 0.0009901541564031008
Feature: job, Gain Ratio: 0.000946194812203