# Decision Tree and Random Forest Project

This notebook trains and evaluates Decision Tree and Random Forest classifiers on the `bank.csv` dataset, predicting the `deposit` column.

## 1. Import Libraries

In [None]:
import os
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
# Select seaborn's "whitegrid" style for the figures drawn in this notebook.
sns.set(style="whitegrid")

## 2. Load Dataset

In [None]:
# Directory that holds bank.csv. The default is the original author's local
# folder; set the BANK_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
DEFAULT_DATA_DIR = r"c:\Users\ThinkBook\Desktop\AI\AI_2026\Day05 Decision tree and Essemble method\groupB\hak_BankMarketing"
data_path = Path(os.environ.get("BANK_DATA_DIR", DEFAULT_DATA_DIR)) / "bank.csv"

# Read the raw dataset and preview the first rows.
df = pd.read_csv(data_path)
df.head()

In [None]:
# Column dtypes and non-null counts, followed by the number of missing values
# per column.
df.info()
# The newline must be written as an escape sequence: a literal line break
# inside a single-quoted/double-quoted string is a SyntaxError.
print("Missing values:\n", df.isnull().sum())

## 3. Explore Data

In [None]:
# Class balance of the target column: print the counts, then chart them.
deposit_counts = df['deposit'].value_counts()
print(deposit_counts)
fig, ax = plt.subplots(figsize=(5, 3))
deposit_counts.plot(kind='bar', ax=ax)
ax.set_title('Deposit distribution')
plt.show()

# Histogram of the age column (30 bins, no KDE overlay).
fig, ax = plt.subplots(figsize=(10, 4))
sns.histplot(df['age'], bins=30, kde=False, ax=ax)
ax.set_title('Age distribution')
plt.show()

## 4. Preprocess Data

In [None]:
# Encode the target labels as integers. The fitted encoder is kept in `le`
# so it can be saved and used to map predictions back to the original labels.
le = LabelEncoder()
df['deposit_encoded'] = le.fit_transform(df['deposit'])

# Categorical feature columns: every object-dtype column except the target.
cat_cols = df.select_dtypes(include='object').columns.tolist()
if 'deposit' in cat_cols:
    cat_cols.remove('deposit')

# One-hot encode the categoricals; drop_first removes one level per column.
df_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# Features and target.
y = df_encoded['deposit_encoded']
X = df_encoded.drop(['deposit','deposit_encoded'], axis=1)

# Numeric columns to scale (the one-hot columns are not int64/float64 on
# current pandas, so they are left untouched).
num_cols = X.select_dtypes(include=['int64','float64']).columns

# Fit the scaler on the training rows only, so that test-set statistics do not
# leak into the preprocessing. The split arguments below must stay identical
# to the ones used in the Train/Test Split cell: with the same number of rows,
# the same `stratify` labels and the same `random_state`, train_test_split
# selects the same rows, so `train_rows` is exactly the index of X_train.
train_rows, _ = train_test_split(X.index, test_size=0.2, random_state=42, stratify=y)
scaler = StandardScaler()
scaler.fit(X.loc[train_rows, num_cols])
X[num_cols] = scaler.transform(X[num_cols])

print('Feature matrix shape:', X.shape)
X.head()

## 5. Train/Test Split

In [None]:
# Hold out 20% of the rows as a test set. The split is stratified on y and
# uses a fixed seed so it is repeatable across runs.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
print('Train:', X_train.shape, 'Test:', X_test.shape)

## 6. Decision Tree: Train & Evaluate

In [None]:
# Fit a decision tree with default hyperparameters (fixed seed) on the
# training split and predict the held-out test split.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

# Report accuracy, per-class precision/recall/F1 and the confusion matrix.
print('Decision Tree Accuracy:', accuracy_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))
# The newline is written as an escape; a raw line break inside the literal
# is a SyntaxError.
print('Confusion matrix:\n', confusion_matrix(y_test, y_pred_dt))

## 7. Random Forest: Train & Evaluate (Default)

In [None]:
# Fit a random forest with default hyperparameters (fixed seed, all CPU cores)
# on the training split and predict the held-out test split.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Report accuracy, per-class precision/recall/F1 and the confusion matrix.
print('Random Forest Accuracy:', accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))
# The newline is written as an escape; a raw line break inside the literal
# is a SyntaxError.
print('Confusion matrix:\n', confusion_matrix(y_test, y_pred_rf))

## 8. Hyperparameter Tuning (Random Forest)

In [None]:
# Search space for the forest: 3 x 3 x 2 = 18 parameter combinations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
}

# Exhaustive search scored by accuracy with 3-fold cross-validation on the
# training split only.
rf_base = RandomForestClassifier(random_state=42, n_jobs=-1)
grid = GridSearchCV(
    estimator=rf_base,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Keep the best estimator found by the search and report the winning setup.
best_rf = grid.best_estimator_
print('Best params:', grid.best_params_)
print('Best CV score:', grid.best_score_)

# Score the tuned forest on the held-out test split.
y_pred_best = best_rf.predict(X_test)
print('Tuned RF Accuracy:', accuracy_score(y_test, y_pred_best))
print(classification_report(y_test, y_pred_best))

## 9. Visualizations: Feature Importance & ROC

In [None]:
# Feature importances
importances = best_rf.feature_importances_
feat_imp = pd.Series(importances, index=X.columns).sort_values(ascending=False).head(20)
plt.figure(figsize=(8,6))
feat_imp.plot(kind='barh')
plt.title('Top 20 feature importances (Random Forest)')
plt.gca().invert_yaxis()
plt.show()

# ROC curve (for positive class)
if hasattr(best_rf, 'predict_proba'):
    y_score = best_rf.predict_proba(X_test)[:,1]
    fpr, tpr, _ = roc_curve(y_test, y_score)
    auc = roc_auc_score(y_test, y_score)
    plt.figure(figsize=(6,5))
    plt.plot(fpr, tpr, label=f'AUC = {auc:.3f}')
    plt.plot([0,1],[0,1],'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend()
    plt.show()

## 10. Save Models and Preprocessing Objects

In [None]:
# Persist the fitted models and preprocessing objects next to the dataset.
# The output folder is derived from `data_path` instead of repeating the
# absolute directory in every call, so the artifacts follow the data location.
output_dir = Path(data_path).parent

# File name -> fitted object to serialize.
artifacts = {
    'bank_dt_model.joblib': dt,
    'bank_rf_model.joblib': best_rf,
    'label_encoder.joblib': le,
    'scaler.joblib': scaler,
}
for filename, fitted_obj in artifacts.items():
    joblib.dump(fitted_obj, output_dir / filename)
print('Models and preprocessing objects saved.')

---

Run this notebook by executing the cells sequentially. Adjust parameter grids or add visualizations as needed.