In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.base import BaseEstimator
from sklearn.impute import SimpleImputer
# knn
from sklearn.neighbors import KNeighborsClassifier
# linear models
from sklearn.linear_model import LogisticRegression
# visualization
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Rectangle, FancyBboxPatch, FancyArrowPatch
import matplotlib.patches as mpatches

## adding data

In [3]:
df = pd.read_csv('./bank.csv')
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11157,33,blue-collar,single,primary,no,1,yes,no,cellular,20,apr,257,1,-1,0,unknown,no
11158,39,services,married,secondary,no,733,no,no,unknown,16,jun,83,4,-1,0,unknown,no
11159,32,technician,single,secondary,no,29,no,no,cellular,19,aug,156,2,-1,0,unknown,no
11160,43,technician,married,secondary,no,0,no,yes,cellular,8,may,9,2,172,5,failure,no


## getting html report

In [4]:
from ydata_profiling import ProfileReport

profile = ProfileReport(df, title="Profiling Report")
profile.to_file("report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 17/17 [00:00<00:00, 46.03it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

## splitting

In [5]:
y = df['deposit']
X = df.drop('deposit', axis=1)

random_state = 999
test_size = 0.2

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

In [6]:
NUMERICAL = {'age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'}
CATEGORICAL = set(X.columns) - NUMERICAL
CATEGORICAL, NUMERICAL = list(CATEGORICAL), list(NUMERICAL)

## Preprocessing


In [7]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  #we dont have missing value but for sure :)
    ('scaler', MinMaxScaler())                 
])

This pipeline fills missing categorical values with the most frequent category, then applies One-Hot encoding to produce binary indicator vectors for each category.


normalization Formula:  $x_{scaled} = \frac{x - x_{min}}{x_{max} - x_{min}}$


In [8]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent', fill_value='missing')),  #we dont have missing value but for sure :)
    ('onehot', OneHotEncoder(handle_unknown='ignore'))                    
])

This pipeline fills missing categorical values with the most frequent category, then applies One-Hot encoding to produce binary indicator vectors for each category.

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, NUMERICAL),
        ('cat', categorical_transformer, CATEGORICAL)
    ]
)

The `ColumnTransformer` splits the dataset’s features into two parallel paths—numerical and categorical—and applies different preprocessing pipelines to each group.

*   **Numerical columns** are processed by `numerical_transformer` (mean imputation + scaling).
*   **Categorical columns** are processed by `categorical_transformer` (most‑frequent imputation + one‑hot encoding).

After these transformations, the outputs from both paths are concatenated into a single unified feature matrix for model training.


## requiered functions for handmade Dtree (entropy & information gain)

Entropy = $H(S) = - \sum_{i=1}^{k} p_i \log_2(p_i)$


Information Gain = $IG(S, A) = H(S) - \frac{|S_L|}{|S|}H(S_L) - \frac{|S_R|}{|S|}H(S_R)$

In [41]:
import numpy as np

def entropy(y):
    """Shannon entropy H(S) = -Σ p_i log2(p_i)."""
    if len(y) == 0:
        return 0
    _, counts = np.unique(y, return_counts=True)
    p = counts / len(y)
    p = p[p > 0]  # avoid log2(0)
    return -np.sum(p * np.log2(p))


def information_gain(y, y_left, y_right):
    """Information Gain = H(parent) - weighted avg entropy of children."""
    n = len(y)
    if n == 0:
        return 0
    H_parent = entropy(y)
    H_left = entropy(y_left)
    H_right = entropy(y_right)
    w_left = len(y_left) / n
    w_right = len(y_right) / n
    return H_parent - (w_left * H_left + w_right * H_right)


## handmade decision tree 

In [43]:
from sklearn.base import BaseEstimator, ClassifierMixin

class HandmadeDecisionTree(BaseEstimator, ClassifierMixin):
    def __init__(self, max_depth=None, min_samples_split=2, min_samples_leaf=1, random_state=None):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.random_state = random_state
        self.tree = None
        self.classes_ = None

    def _find_best_split(self, X, y):
        best_gain = -1
        best_feature = None
        best_threshold = None
        n_features = X.shape[1]
        if self.random_state is not None:
            np.random.seed(self.random_state)
            feature_indices = np.random.permutation(n_features)
        else:
            feature_indices = range(n_features)

        for feature_idx in feature_indices:
            feature_values = X[:, feature_idx]
            unique_values = np.unique(feature_values)
            for threshold in unique_values:
                left_mask = feature_values <= threshold
                right_mask = ~left_mask
                if np.sum(left_mask) < self.min_samples_leaf or np.sum(right_mask) < self.min_samples_leaf:
                    continue
                y_left, y_right = y[left_mask], y[right_mask]
                gain = information_gain(y, y_left, y_right)  # ✅ استفاده از تابع بالا
                if gain > best_gain:
                    best_gain, best_feature, best_threshold = gain, feature_idx, threshold
        return best_feature, best_threshold, best_gain

    def _build_tree(self, X, y, depth=0):
        n_samples, n_classes = len(y), len(np.unique(y))
        if (self.max_depth is not None and depth >= self.max_depth) or n_samples < self.min_samples_split or n_classes == 1:
            class_counts = np.bincount(y, minlength=len(self.classes_))
            return {'is_leaf': True, 'class': np.argmax(class_counts), 'samples': n_samples}

        best_feature, best_threshold, best_gain = self._find_best_split(X, y)
        if best_feature is None or best_gain <= 0:
            class_counts = np.bincount(y, minlength=len(self.classes_))
            return {'is_leaf': True, 'class': np.argmax(class_counts), 'samples': n_samples}

        left_mask = X[:, best_feature] <= best_threshold
        right_mask = ~left_mask
        left_tree = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        right_tree = self._build_tree(X[right_mask], y[right_mask], depth + 1)

        return {
            'is_leaf': False,
            'feature': best_feature,
            'threshold': best_threshold,
            'gain': best_gain,
            'left': left_tree,
            'right': right_tree,
            'samples': n_samples
        }

    def _predict_sample(self, sample, node):
        if node['is_leaf']:
            return node['class']
        if sample[node['feature']] <= node['threshold']:
            return self._predict_sample(sample, node['left'])
        else:
            return self._predict_sample(sample, node['right'])

    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y)
        self.classes_ = np.unique(y)
        y_encoded = np.searchsorted(self.classes_, y)
        self.tree = self._build_tree(X, y_encoded)
        return self

    def predict(self, X):
        if self.tree is None:
            raise ValueError("Tree not trained yet. Call fit() first.")
        X = np.array(X)
        preds = [self.classes_[self._predict_sample(sample, self.tree)] for sample in X]
        return np.array(preds)

    def get_params(self, deep=True):
        return {
            "max_depth": self.max_depth,
            "min_samples_split": self.min_samples_split,
            "min_samples_leaf": self.min_samples_leaf,
            "random_state": self.random_state,
        }

    def set_params(self, **params):
        for k, v in params.items():
            setattr(self, k, v)
        return self


In [44]:
handmade_dt_model = HandmadeDecisionTree(max_depth=10, min_samples_split=5, random_state=random_state)

handmade_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', handmade_dt_model)
])

handmade_pipeline.fit(X_train, y_train)
y_pred_handmade = handmade_pipeline.predict(X_test)

print("Handmade Decision Tree Results:")
print(classification_report(y_test, y_pred_handmade, digits=4))


Handmade Decision Tree Results:
              precision    recall  f1-score   support

          no     0.8736    0.7946    0.8322      1183
         yes     0.7900    0.8705    0.8283      1050

    accuracy                         0.8303      2233
   macro avg     0.8318    0.8325    0.8303      2233
weighted avg     0.8343    0.8303    0.8304      2233



## Decision Tree Structure Visualization


In [49]:
def print_tree(node, feature_names, depth=0):
    prefix = "│   " * depth
    if node["is_leaf"]:
        print(f"{prefix}└── Leaf → class = {node['class']}  (samples = {node['samples']})")
    else:
        feature = feature_names[node["feature"]]
        threshold = node["threshold"]
        gain = node["gain"]

        if isinstance(threshold, (float, int)):
            threshold_str = f"{threshold:.4f}"
        else:
            threshold_str = str(threshold)

        if isinstance(gain, (float, int)):
            gain_str = f"{gain:.4f}"
        else:
            gain_str = str(gain)

        print(f"{prefix}├── if {feature} <= {threshold_str}  (Gain = {gain_str})")
        print_tree(node["left"], feature_names, depth + 1)
        print(f"{prefix}└── else  # {feature} > {threshold_str}")
        print_tree(node["right"], feature_names, depth + 1)


In [50]:
handmade_dt_model.fit(X_train, y_train)
print("Decision Tree Structure:")
print_tree(handmade_dt_model.tree, feature_names=X_train.columns.tolist())


Decision Tree Structure:
├── if duration <= 383.0000  (Gain = 0.1460)
│   ├── if contact <= telephone  (Gain = 0.0930)
│   │   ├── if duration <= 129.0000  (Gain = 0.1086)
│   │   │   ├── if duration <= 77.0000  (Gain = 0.0601)
│   │   │   │   ├── if duration <= 62.0000  (Gain = 0.0231)
│   │   │   │   │   ├── if age <= 23.0000  (Gain = 0.0109)
│   │   │   │   │   │   └── Leaf → class = 0  (samples = 3)
│   │   │   │   │   └── else  # age > 23.0000
│   │   │   │   │   │   ├── if day <= 2.0000  (Gain = 0.0107)
│   │   │   │   │   │   │   ├── if marital <= married  (Gain = 0.7219)
│   │   │   │   │   │   │   │   └── Leaf → class = 0  (samples = 4)
│   │   │   │   │   │   │   └── else  # marital > married
│   │   │   │   │   │   │   │   └── Leaf → class = 1  (samples = 1)
│   │   │   │   │   │   └── else  # day > 2.0000
│   │   │   │   │   │   │   ├── if contact <= cellular  (Gain = 0.0056)
│   │   │   │   │   │   │   │   └── Leaf → class = 0  (samples = 340)
│   │   │   │   │   │   │   └

## Using pre_defined library's functions

In [11]:
from sklearn.tree import DecisionTreeClassifier
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, NUMERICAL),
        ('cat', categorical_transformer, CATEGORICAL)
    ]
)
DT_model = DecisionTreeClassifier(random_state=random_state)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DT_model)
])

pipeline.fit(X_train, y_train)

In [12]:
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

          no     0.8120    0.8030    0.8075      1183
         yes     0.7808    0.7905    0.7856      1050

    accuracy                         0.7971      2233
   macro avg     0.7964    0.7968    0.7965      2233
weighted avg     0.7973    0.7971    0.7972      2233



## Random forest 

In [None]:
from sklearn.ensemble import RandomForestClassifier
RF_model = RandomForestClassifier(
    n_estimators=200,       # number of trees
    criterion='entropy',   
    max_depth=None,         
    random_state=random_state
)


In [60]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, NUMERICAL),
        ('cat', categorical_transformer, CATEGORICAL)
    ]
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RF_model)
])


In [61]:
pipeline.fit(X_train, y_train)

In [62]:
y_pred = pipeline.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

          no       0.89      0.83      0.86      1183
         yes       0.83      0.89      0.86      1050

    accuracy                           0.86      2233
   macro avg       0.86      0.86      0.86      2233
weighted avg       0.86      0.86      0.86      2233

[[987 196]
 [117 933]]


In [63]:
import pandas as pd

rf_feature_importance = pipeline.named_steps['classifier'].feature_importances_

# ویژگی‌های عددی بعد از OneHotEncoder ممکن است زیاد شوند:
feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()

importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': rf_feature_importance
}).sort_values(by='Importance', ascending=False)

print(importance_df.head(10))

                  Feature  Importance
5           num__duration    0.304926
3            num__balance    0.083497
2                num__age    0.077073
6                num__day    0.071661
0           num__campaign    0.038732
1              num__pdays    0.032759
35  cat__poutcome_success    0.029970
14   cat__contact_unknown    0.020877
4           num__previous    0.019495
50       cat__housing_yes    0.016393
