In [None]:
import ml_utils as mt 
import pandas as pd 
import numpy as np
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor,plot_tree
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor,ExtraTreesClassifier,ExtraTreesRegressor
from sklearn.inspection import PartialDependenceDisplay,permutation_importance
import matplotlib.pyplot as plt 
from sklearn.model_selection import RandomizedSearchCV

# Regression Trees

In [None]:
# Load the loan training data (path is relative to the notebook's working directory).
ld_train = pd.read_csv('./loan_data_train.csv')
def custom_dir(dir_col):
    """Turn a string column of percentages into numbers.

    Every '%' character is removed and the remainder is parsed with
    ``pd.to_numeric``; entries that cannot be parsed become NaN.

    Parameters
    ----------
    dir_col : pandas.Series of strings (accessed through ``.str``)

    Returns
    -------
    pandas.Series of numeric values
    """
    return pd.to_numeric(dir_col.str.replace('%', ''), errors='coerce')

def custom_fico(fico_col):
    """Convert 'low-high' range strings into the numeric midpoint of the range.

    Each value is split on '-'; the first two pieces are parsed with
    ``pd.to_numeric(errors='coerce')`` and averaged. A value that does not
    provide two parseable pieces yields NaN.

    Parameters
    ----------
    fico_col : pandas.Series of strings (accessed through ``.str``)

    Returns
    -------
    pandas.Series of floats, aligned with the input index
    """
    # With expand=True the number of columns depends on the data: if no value
    # contains '-', only column 0 exists and indexing column 1 would raise
    # KeyError. Reindexing guarantees both columns (missing one is all-NaN).
    parts = fico_col.str.split('-', expand=True).reindex(columns=[0, 1])
    low = pd.to_numeric(parts[0], errors='coerce')
    high = pd.to_numeric(parts[1], errors='coerce')

    return 0.5 * (low + high)

def custom_el(el_col):
    """Map employment-length labels to a number of years.

    Known labels are replaced by their integer value ('< 1 year' -> 0,
    '10+ years' -> 10); the result is then passed through
    ``pd.to_numeric(errors='coerce')`` so anything still non-numeric becomes NaN.
    """
    years_by_label = {
        '< 1 year': 0,
        '1 year': 1,
        '2 years': 2,
        '3 years': 3,
        '4 years': 4,
        '5 years': 5,
        '6 years': 6,
        '7 years': 7,
        '8 years': 8,
        '9 years': 9,
        '10+ years': 10,
    }
    return pd.to_numeric(el_col.replace(years_by_label), errors='coerce')

# Column groups handed to mt.DataPipe; each group is passed under its own keyword.
# NOTE(review): the exact treatment of each group is defined in ml_utils — confirm there.
cat_to_num_cols = [
    'Amount.Requested',
    'Open.CREDIT.Lines',
    'Revolving.CREDIT.Balance',
]
simple_num_cols = [
    'Monthly.Income',
    'Inquiries.in.the.Last.6.Months',
]
cat_to_dummies_cols = [
    'Loan.Length',
    'Loan.Purpose',
    'State',
    'Home.Ownership',
]
# Columns that need a bespoke string -> number conversion (functions defined above).
custom_func_cols = {
    'Debt.To.Income.Ratio': custom_dir,
    'FICO.Range': custom_fico,
    'Employment.Length': custom_el,
}

ld_pipe = mt.DataPipe(
    simple_num=simple_num_cols,
    cat_to_num=cat_to_num_cols,
    cat_to_dummies=cat_to_dummies_cols,
    custom_func_dict=custom_func_cols,
)
ld_pipe.fit(ld_train)

# Features come from the fitted pipe; the target is the interest rate with '%' stripped.
x_train = ld_pipe.transform(ld_train)
y_train = ld_train['Interest.Rate'].str.replace('%', '').astype(float)

In [None]:
# Search space for the regression tree.
# max_depth=None puts no explicit limit on depth; tree size is then
# governed by the other parameters.
params = dict(
    max_depth=[None, 5, 6, 7, 8, 10, 15, 20, 30, 50],
    min_samples_leaf=[1, 2, 5, 10, 15, 20],
    min_samples_split=[2, 5, 10, 15, 20],
)

In [None]:
# Base estimator for the hyper-parameter search.
# random_state is pinned (same seed as the permutation-importance cell) so
# cross-validation scores are repeatable across kernel restarts.
reg = DecisionTreeRegressor(random_state=42)

In [None]:
# Randomized search: 50 sampled parameter combinations, 10-fold CV,
# ranked by negative mean absolute error, using all available cores.
rs = RandomizedSearchCV(
    reg,
    cv=10,
    param_distributions=params,
    scoring='neg_mean_absolute_error',
    n_iter=50,
    n_jobs=-1,
    random_state=42,  # fixes which candidates get sampled, so reruns are comparable
)

In [None]:
# Run the randomized search for the regression tree on the loan data.
rs.fit(X=x_train, y=y_train)

In [None]:
# Summarise the search results.
# NOTE(review): mt.report lives in ml_utils; the second argument is presumably
# the number of top-ranked candidates to show — confirm.
mt.report(rs.cv_results_, 5)

In [None]:
# These values are the best candidates from one particular run of the search.
# Your own search may rank other candidates first (different data, different
# seed, ...); replace them with the values you obtained.
# random_state is pinned so the fitted tree, its plot and its rules are repeatable.
dt = DecisionTreeRegressor(
    max_depth=8,
    min_samples_leaf=15,
    min_samples_split=5,
    random_state=42,
)

In [None]:
# Fit the chosen regression tree on the full training data.
dt.fit(X=x_train, y=y_train)

In [None]:
# For this to work you also need Graphviz itself installed on your machine
# (not only the Python package). Visit: https://graphviz.org
from sklearn.tree import export_graphviz
import graphviz

# Generate DOT data.
# - feature_names is passed as a plain list: scikit-learn releases before 1.3
#   do not accept a pandas Index here.
# - class_names is intentionally not passed: dt is a regressor, so there are
#   no classes to label.
dot_data = export_graphviz(
    dt,
    out_file=None,  # no file output, just return the DOT source as a string
    feature_names=list(x_train.columns),
    filled=True,
    rounded=True,
    special_characters=True,
)

# Render with Graphviz.
graph = graphviz.Source(dot_data)
graph.render("decision_tree", format="png", cleanup=True)  # writes decision_tree.png
graph  # display in the notebook

In [None]:
# Export the decision tree as a sequence of text rules.
from sklearn.tree import export_text

# feature_names is passed as a plain list: scikit-learn releases before 1.3
# do not handle a pandas Index here.
tree_rules = export_text(dt, feature_names=list(x_train.columns))
print(tree_rules)

# Partial Dependence Plots

In [None]:
# One-way partial dependence of the tree's prediction on FICO.Range.
fig, ax = plt.subplots(figsize=(10, 6))
feature_to_plot = ['FICO.Range']
PartialDependenceDisplay.from_estimator(
    dt,
    x_train,
    feature_to_plot,
    ax=ax,
    kind='average',
)
plt.show()

In [None]:
# Two-way partial dependence: a tuple of two features requests a joint plot.
fig, ax = plt.subplots(figsize=(12, 8))
features = [('Amount.Requested', 'FICO.Range')]
PartialDependenceDisplay.from_estimator(
    dt,
    x_train,
    features,
    ax=ax,
    kind="average",
)
plt.show()

# Regression Forest

In [None]:
# Search space for the regression forest: the tree-level parameters used for
# the single tree plus the forest-level ones (n_estimators, max_features, bootstrap).
params_rf = dict(
    n_estimators=[50, 100, 150, 200, 250, 300],
    max_features=[2, 5, 10, 15, 20, 30, 40, 48],
    bootstrap=[True, False],
    max_depth=[None, 5, 6, 7, 8, 10, 15, 20, 30, 50],
    min_samples_leaf=[1, 2, 5, 10, 15, 20],
    min_samples_split=[2, 5, 10, 15, 20],
)

In [None]:
# Base estimator for the forest search.
# random_state is pinned (same seed as the permutation-importance cell) so
# cross-validation scores are repeatable across kernel restarts.
reg_rf = RandomForestRegressor(random_state=42)

In [None]:
# Randomized search for the regression forest: 50 sampled combinations,
# 10-fold CV, ranked by negative mean absolute error.
rs_rf = RandomizedSearchCV(
    reg_rf,
    cv=10,
    param_distributions=params_rf,
    scoring='neg_mean_absolute_error',
    n_iter=50,
    n_jobs=-1,
    random_state=42,  # fixes which candidates get sampled, so reruns are comparable
)

In [None]:
# Run the randomized search for the regression forest.
rs_rf.fit(X=x_train, y=y_train)

In [None]:
# Summarise the forest search results.
# NOTE(review): mt.report lives in ml_utils; the second argument is presumably
# the number of top-ranked candidates to show — confirm.
mt.report(rs_rf.cv_results_, 5)

In [None]:
# These values are the best candidates from one particular run of the search.
# Your own search may rank other candidates first (different data, different
# seed, ...); replace them with the values you obtained.
# random_state is pinned so the importances and dependence plots below are repeatable.
reg_rf = RandomForestRegressor(
    n_estimators=200,
    min_samples_split=15,
    min_samples_leaf=5,
    max_features=40,
    max_depth=8,
    bootstrap=True,
    random_state=42,
)

In [None]:
# Fit the chosen regression forest on the full training data.
reg_rf.fit(X=x_train, y=y_train)

# Feature Importance

In [None]:
# Pair every input column with the forest's built-in feature importance.
feat_imp_df = pd.DataFrame(
    {
        'Features': x_train.columns,
        'Importance': reg_rf.feature_importances_,
    }
)

In [None]:
# Cumulative importance can be used to discard features: keep the top ones
# until a chosen level of cumulative importance has been reached.
feat_imp_df = feat_imp_df.sort_values(by='Importance', ascending=False)
feat_imp_df['Cum_Importance'] = feat_imp_df['Importance'].cumsum()
feat_imp_df

# Permutation Importance

In [None]:
# Permutation importance does not have to be computed on x_train / y_train;
# a separate dataset can be used as well.
perm_importance = permutation_importance(
    reg_rf,
    x_train,
    y_train,
    scoring='neg_mean_absolute_error',
    n_repeats=10,
    random_state=42,
)

In [None]:
# importances_mean is the mean change in the scoring metric per feature; it is
# not normalised across features, so divide by the total before reading the
# values as shares or accumulating them.
perm_df = pd.DataFrame(
    {
        'Feature': x_train.columns,
        'Importance': perm_importance.importances_mean,
    }
)
perm_df = (
    perm_df
    .assign(Importance=perm_df['Importance'] / perm_df['Importance'].sum())
    .sort_values(by="Importance", ascending=False)
)
perm_df['Cum_Importance'] = perm_df['Importance'].cumsum()
perm_df

# Partial Dependence Plots

Note how the same dependence plots are much smoother once the dependence is extracted by averaging over the many trees in a random forest.

In [None]:
# One-way partial dependence of the forest's prediction on FICO.Range.
fig, ax = plt.subplots(figsize=(10, 6))
feature_to_plot = ['FICO.Range']
PartialDependenceDisplay.from_estimator(
    reg_rf,
    x_train,
    feature_to_plot,
    ax=ax,
    kind='average',
)
plt.show()

In [None]:
# Two-way partial dependence for the fitted forest.
# Uses reg_rf (the forest fitted above with the chosen parameters), matching the
# one-way plot in the previous cell; the search object rs_rf is a different
# model and is re-bound to a classifier search further down the notebook.
features = [('Amount.Requested', 'FICO.Range')]

fig, ax = plt.subplots(figsize=(12, 8))
PartialDependenceDisplay.from_estimator(reg_rf, x_train, features, ax=ax, kind="average")
plt.show()


# Classification Data Prep

In [None]:
# Load the classification training data (path is relative to the working directory).
bd_train = pd.read_csv(r'./bd_train.csv')

def children_to_num(col):
    """Convert the children labels to numbers.

    'Zero' is rewritten to '0' and '4+' to '4'; the result is parsed with
    ``pd.to_numeric(errors='coerce')`` so anything unparseable becomes NaN.
    """
    cleaned = col.str.replace('Zero', '0').str.replace('4+', '4', regex=False)
    return pd.to_numeric(cleaned, errors='coerce')

def ab_to_num(col):
    """Convert age-band strings ('45-50', '71+') to the midpoint of the band.

    '71+' is first rewritten to '71-71' so it parses like any other band.
    Each value is split on '-', the first two pieces are parsed with
    ``pd.to_numeric(errors='coerce')`` and averaged. A value that does not
    provide two parseable pieces yields NaN.

    Parameters
    ----------
    col : pandas.Series of strings (accessed through ``.str``)

    Returns
    -------
    pandas.Series of floats, aligned with the input index
    """
    bands = col.str.replace('71+', '71-71', regex=False)

    # With expand=True the number of columns depends on the data: if no value
    # contains '-', only column 0 exists and indexing column 1 would raise
    # KeyError. Reindexing guarantees both columns (missing one is all-NaN).
    parts = bands.str.split('-', expand=True).reindex(columns=[0, 1])
    low = pd.to_numeric(parts[0], errors='coerce')
    high = pd.to_numeric(parts[1], errors='coerce')

    return 0.5 * (low + high)

def fi_to_num(col):
    """Map family-income band labels to a representative number.

    Known labels are replaced via the lookup below; the result then goes
    through ``pd.to_numeric(errors='coerce')`` so any label not in the lookup
    (and not already numeric) becomes NaN.
    """
    income_by_band = {
        '< 4,000': 4000,
        '< 8,000, >= 4,000': 6000,
        '<10,000, >= 8,000': 9000,
        '<12,500, >=10,000': 11250,
        '<15,000, >=12,500': 13750,
        '<17,500, >=15,000': 16250,
        '<20,000, >=17,500': 18750,
        '<22,500, >=20,000': 21250,
        '<25,000, >=22,500': 23750,
        '<27,500, >=25,000': 26250,
        '<30,000, >=27,500': 28750,
        '>=35,000': 35000,
    }
    return pd.to_numeric(col.replace(income_by_band), errors='coerce')

# Column groups handed to mt.DataPipe; each group is passed under its own keyword.
# NOTE(review): the exact treatment of each group is defined in ml_utils — confirm there.
simple_numeric_cols = [
    'year_last_moved',
    'Average.Credit.Card.Transaction',
    'Balance.Transfer',
    'Term.Deposit',
    'Life.Insurance',
    'Medical.Insurance',
    'Average.A.C.Balance',
    'Personal.Loan',
    'Investment.in.Mutual.Fund',
    'Investment.Tax.Saving.Bond',
    'Home.Loan',
    'Online.Purchase.Amount',
    'Investment.in.Commudity',
    'Investment.in.Equity',
    'Investment.in.Derivative',
    'Portfolio.Balance',
]

cat_to_dummies_cols = [
    'status',
    'occupation',
    'occupation_partner',
    'home_status',
    'self_employed',
    'self_employed_partner',
    'TVarea',
    'gender',
    'region',
]

# Columns that need a bespoke string -> number conversion (functions defined above).
custom_function_cols = {
    'children': children_to_num,
    'age_band': ab_to_num,
    'family_income': fi_to_num,
}

bd_pipe = mt.DataPipe(
    simple_num=simple_numeric_cols,
    cat_to_dummies=cat_to_dummies_cols,
    custom_func_dict=custom_function_cols,
)
bd_pipe.fit(bd_train)

# From here on x_train / y_train refer to the classification data.
x_train = bd_pipe.transform(bd_train)
# Binary target: 1 where Revenue.Grid equals 1, otherwise 0.
y_train = (bd_train['Revenue.Grid'] == 1).astype(int)

# Decision Trees

In [None]:
# Search space for the classification tree.
params_dt = dict(
    max_depth=[None, 5, 10, 15, 20, 30, 50, 70],
    min_samples_leaf=[1, 2, 5, 10, 15, 20],
    min_samples_split=[2, 5, 10, 15, 20],
    # the next two hyper-parameters are not used in the regression search spaces above
    class_weight=[None, 'balanced'],
    criterion=['entropy', 'gini'],
)

In [None]:
# Base estimator for the classification-tree search.
# random_state is pinned (same seed as the permutation-importance cell) so
# cross-validation scores are repeatable across kernel restarts.
clf_dt = DecisionTreeClassifier(random_state=42)

In [None]:
# Randomized search for the classification tree: 10 sampled combinations,
# 10-fold CV, ranked by ROC AUC.
rs_dt = RandomizedSearchCV(
    clf_dt,
    cv=10,
    param_distributions=params_dt,
    scoring='roc_auc',
    n_iter=10,
    n_jobs=-1,
    random_state=42,  # fixes which candidates get sampled, so reruns are comparable
)

In [None]:
# Run the randomized search for the classification tree.
rs_dt.fit(X=x_train, y=y_train)

In [None]:
# Summarise the classification-tree search results.
# NOTE(review): mt.report lives in ml_utils; the second argument is presumably
# the number of top-ranked candidates to show — confirm.
mt.report(rs_dt.cv_results_, 5)

In [None]:
# Displaying the tree as a figure or as text works the same way as it did for the regression tree, so it is not repeated here.
# Feature importance, permutation importance [scoring = roc_auc] and partial dependence plots [predicted probability] work
# syntactically the same way as they did for regression trees.
# If you struggle, please raise an issue with details.

# Finding a threshold for predicted probabilities also works the same way as it did for linear models; please go back to that section
# if you do not recall it.

# Hyperparameters for ExtraTrees are exactly the same; only the internal working is different.
# Do add code for ExtraTrees here and see how performance, extracted patterns etc. compare with decision trees and random forests.

# Random Forest

In [None]:
# Base estimator for the classification-forest search.
# random_state is pinned (same seed as the permutation-importance cell) so
# cross-validation scores are repeatable across kernel restarts.
clf_rf = RandomForestClassifier(random_state=42)

In [None]:
# Check the size of the feature matrix before choosing max_features candidates
# (presumably (rows, columns) — x_train comes from bd_pipe.transform; confirm).
x_train.shape

In [None]:
# Search space for the classification forest.
params_rf = dict(
    n_estimators=[100, 200, 300, 500, 700, 1000],
    # max_features should not exceed the number of features in the data
    max_features=[5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 70],
    bootstrap=[True, False],
    max_depth=[None, 5, 10, 15, 20, 30, 50, 70],
    min_samples_leaf=[1, 2, 5, 10, 15, 20],
    min_samples_split=[2, 5, 10, 15, 20],
    class_weight=[None, 'balanced'],
    criterion=['entropy', 'gini'],
)

In [None]:
# Randomized search for the classification forest: 10 sampled combinations,
# 10-fold CV, ranked by ROC AUC.
rs_rf = RandomizedSearchCV(
    clf_rf,
    cv=10,
    param_distributions=params_rf,
    scoring='roc_auc',
    n_iter=10,
    n_jobs=-1,
    random_state=42,  # fixes which candidates get sampled, so reruns are comparable
)

In [None]:
# Run the randomized search for the classification forest.
rs_rf.fit(X=x_train, y=y_train)

In [None]:
# Summarise the classification-forest search results.
# NOTE(review): mt.report lives in ml_utils; the second argument is presumably
# the number of top-ranked candidates to show — confirm.
mt.report(rs_rf.cv_results_, 5)